# Imports and functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

from sklearn.metrics import log_loss, balanced_accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
def print_performance(X_train, y_train, X_val, y_val, model):
    """Print balanced accuracy and log loss of ``model`` on both splits.

    Args:
        X_train: Training features passed to ``model.predict`` and
            ``model.predict_proba``.
        y_train: True labels for ``X_train``.
        X_val: Validation features.
        y_val: True labels for ``X_val``.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
    """
    splits = (("TRAIN", X_train, y_train), ("VALIDATION", X_val, y_val))
    for split_name, features, labels in splits:
        accuracy = balanced_accuracy_score(labels, model.predict(features))
        print(f"BALANCED ACCURACY {split_name}:", accuracy)
        loss = log_loss(labels, model.predict_proba(features))
        print(f"LOGLOSS {split_name}:", loss)

In [None]:
def transform_dataset(dataset, fit, test):
    """Prepare a raw competition DataFrame for modelling.

    Drops the ``id`` column, separates and ordinal-encodes the ``target``
    column (labelled data only) and rescales the features.

    Relies on the notebook-level globals ``class_enc`` and ``scaler``.

    Args:
        dataset: DataFrame with an ``id`` column, the feature columns and,
            when ``test`` is false, a ``target`` column. It is not modified.
        fit: If true, fit ``scaler`` and ``class_enc`` on this data before
            transforming; otherwise reuse their previously fitted state.
        test: If true, ``dataset`` has no ``target`` column and only the
            features are returned.

    Returns:
        ``(X, y)`` when ``test`` is false, otherwise just ``X``. ``X`` is a
        DataFrame carrying the feature column names; ``y`` is a flattened
        array of encoded labels.
    """
    # Work on a copy without the id column: dropping in place would mutate
    # the caller's frame (and warns on slices such as train_test_split output).
    features = dataset.drop(columns='id')

    labels = None
    if not test:
        # Keep the target 2-D ([['target']]) because the encoder works on
        # column-shaped input; flatten back to 1-D afterwards.
        labels = features[['target']]
        features = features.drop(columns='target')
        # Only (re)fit the encoder when asked to, mirroring the scaler.
        encode = class_enc.fit_transform if fit else class_enc.transform
        labels = encode(labels).flatten()

    # normalization
    scale = scaler.fit_transform if fit else scaler.transform
    X = pd.DataFrame(scale(features), columns=features.columns)

    if test:
        return X
    return X, labels

# Load data

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/train.csv")
test_data = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/test.csv")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

# Exploratory analysis

In [None]:
# Column dtypes and non-null counts of the training data.
train_data.info()

In [None]:
# Summary statistics of the training data.
train_data.describe()

In [None]:
# Correlation heatmap of the numeric columns.
# The string-valued 'target' column is excluded explicitly: recent pandas
# versions raise in DataFrame.corr() when non-numeric columns are present.
corr = train_data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(14, 10))
cax = ax.matshow(corr, cmap='gray', interpolation='nearest')
ax.set_title("Correlation")
# Take ticks and labels from the matrix itself so they always line up with
# its rows/columns, regardless of where 'target' sits in the frame.
ticks = range(len(corr.columns))
ax.set_xticks(ticks)
ax.set_xticklabels(corr.columns, fontdict={'rotation': 'vertical'})
ax.set_yticks(ticks)
ax.set_yticklabels(corr.columns)
plt.colorbar(cax)
plt.show()

In [None]:
# Number of training instances per target class.
train_data['target'].value_counts()

# Preprocessing

In [None]:
# Split train / validation: hold out 20% of the rows for validation, with a
# fixed random_state so the split is reproducible.
# Note that train_data is rebound to the remaining 80% training split.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=159)

In [None]:
# Transform train and validation data.
# class_enc and scaler are notebook-level globals read by transform_dataset.
# The encoder is given the class order explicitly, so 'Class_1'..'Class_4'
# are encoded in that listed order.
class_enc = OrdinalEncoder(categories=[['Class_1', 'Class_2', 'Class_3', 'Class_4']])
scaler = MinMaxScaler()

# fit=True fits the scaler on the training split; fit=False applies the
# already-fitted scaler to the validation split.
X_train, y_train = transform_dataset(train_data, fit=True, test=False)
X_val, y_val = transform_dataset(val_data, fit=False, test=False)

# Modeling

In [None]:
# Base learners for the stacked ensemble; every learner that takes a seed
# uses 159.
model0 = LinearDiscriminantAnalysis()

model1 = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=500,
    learning_rate=0.1,
    depth=4,
    random_seed=159,
    verbose=0,
)

model2 = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=159,
    use_label_encoder=False,
    verbosity=0,
)

model3 = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=30,
    objective='multiclass',
    random_state=159,
)

model4 = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    random_state=159,
)

model5 = MLPClassifier(
    hidden_layer_sizes=(10,),
    learning_rate_init=0.001,
    random_state=159,
    max_iter=300,
)

In [None]:
# Stack the base models behind a logistic-regression meta-learner.
# UserWarning is silenced globally from here on (added to hide the
# XGBClassifier warning).
warnings.filterwarnings(action='ignore', category=UserWarning)
stack = StackingClassifier(
    estimators=[
        ('model0', model0),
        ('model1', model1),
        ('model2', model2),
        ('model3', model3),
        ('model4', model4),
        ('model5', model5),
    ],
    final_estimator=LogisticRegression(max_iter=300),
    cv=3,
    passthrough=False,
)
stack.fit(X_train, y_train)

In [None]:
# Report balanced accuracy and log loss of the stack on the training and validation splits.
print_performance(X_train, y_train, X_val, y_val, stack)

In [None]:
# Reduce penalizations: clip the predicted probabilities to [0.05, 0.95] to
# cap the log-loss penalty of confident mistakes, then renormalise each row
# so it is a valid probability distribution again (clipping alone leaves
# rows that no longer sum to 1).
preds = stack.predict_proba(X_val)

clipped = np.clip(preds, 0.05, 0.95)
clipped /= clipped.sum(axis=1, keepdims=True)

log_loss(y_val, clipped)

# Train final model + results

In [None]:
# Reload the full training set: train_data was rebound to the 80% training
# split earlier, and the final model is trained on all labelled rows.
train_data = pd.read_csv("/kaggle/input/tabular-playground-series-may-2021/train.csv")

# process train data (fit=True refits the scaler on the full training set)
X_train, y_train = transform_dataset(train_data, fit=True, test=False)
# process test data
# Keep the ids before transforming: the returned features no longer contain
# the 'id' column, and the ids are needed for the submission file.
test_id = test_data['id']
test_data = transform_dataset(test_data, fit=False, test=True)

In [None]:
# Fresh, unfitted copies of the base learners for the final model, which is
# trained on the full training set.
model0 = LinearDiscriminantAnalysis()

model1 = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=500,
    learning_rate=0.1,
    depth=4,
    random_seed=159,
    verbose=0,
)

model2 = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=159,
    use_label_encoder=False,
    verbosity=0,
)

model3 = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=30,
    objective='multiclass',
    random_state=159,
)

model4 = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    random_state=159,
)

model5 = MLPClassifier(
    hidden_layer_sizes=(10,),
    learning_rate_init=0.001,
    random_state=159,
    max_iter=300,
)

# Stack the base models behind a logistic-regression meta-learner.
stack = StackingClassifier(
    estimators=[
        ('model0', model0),
        ('model1', model1),
        ('model2', model2),
        ('model3', model3),
        ('model4', model4),
        ('model5', model5),
    ],
    final_estimator=LogisticRegression(max_iter=300),
    cv=3,
    passthrough=False,
)
stack.fit(X_train, y_train)

In [None]:
# Reduce penalizations: clip the predicted probabilities to [0.05, 0.95] to
# cap the log-loss penalty of confident mistakes, then renormalise each row
# so the submitted values are valid probability distributions (clipping
# alone leaves rows that no longer sum to 1).
preds = np.clip(stack.predict_proba(test_data), 0.05, 0.95)
preds /= preds.sum(axis=1, keepdims=True)

In [None]:
# Empty submission frame with the id column followed by one column per class.
results = pd.DataFrame(columns=['id','Class_1','Class_2','Class_3','Class_4'])

In [None]:
# Fill the submission frame: ids first, then one probability column per
# class, taken from the matching column of the prediction matrix.
results['id'] = test_id
for col_idx, class_name in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    results[class_name] = preds[:, col_idx]

results.head()

In [None]:
# Write the submission file, omitting the DataFrame index.
results.to_csv('predictions.csv', index=False)