In [None]:
# Key ideas from here (Embedding) https://www.kaggle.com/alexryzhkov/python-keras-nn-residual

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

import optuna
from optuna.samplers import TPESampler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix,classification_report,log_loss 
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras import activations,callbacks
from tensorflow.keras import layers

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the competition files in a fixed order: training data, test data and
# the sample submission that serves as the output template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)
train.head()

In [None]:
# Remove the 'id' column from both frames before modelling.
train, test = (frame.drop(columns='id') for frame in (train, test))

# Sample counts of the target classes in ascending order
# (sorted() keeps only the counts, not the class labels).
sorted(train['target'].value_counts())

In [None]:
# Encode the class labels as integer codes, numbering the classes in sorted order.
# (No one-hot expansion happens in this cell; the result is a single integer column.)
label_dict = {label: code for code, label in enumerate(sorted(train['target'].unique()))}

# pop() removes the target column from the feature frame in place and hands it
# back, so it can be mapped to codes and wrapped in a one-column DataFrame.
lencoder = LabelEncoder()
target = pd.DataFrame(
    {'target': lencoder.fit_transform(train.pop('target').map(label_dict))}
)

In [None]:
# Bar chart of how many rows fall into each encoded class.
sns.countplot(data=target, x='target')

In [None]:
# Hold out 30% of the rows for validation, stratified on the encoded target,
# with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.30,
    random_state=2021,
    stratify=target,
)

In [None]:
# First model: MLP
# Derive the problem dimensions from the data instead of hard-coding them.
num_features = X_train.shape[1]
# Labels are integer codes starting at 0, so the class count is max code + 1.
num_classes = int(np.max(target.values)) + 1

# One-hot encode the labels. Passing num_classes explicitly guarantees that the
# train and validation matrices have the same width even if the highest class
# happened to be absent from one of the splits.
y_train_MLP = to_categorical(y_train, num_classes=num_classes)
y_val_MLP = to_categorical(y_val, num_classes=num_classes)

In [None]:
# The embedding table needs one row per possible integer feature value
# (input_dim = largest index + 1). Keep the original size of 400 as a floor,
# but grow the table if the train or test data contain a larger value, so no
# lookup can fall outside it.
embedding_vocab = max(400, int(max(train.values.max(), test.values.max())) + 1)

# Each feature value is embedded into 8 dimensions, the embeddings are
# flattened, and two dense blocks with batch-norm/dropout feed a softmax head.
model = Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Embedding(embedding_vocab, 8),
        layers.Flatten(),
        BatchNormalization(),
        Dropout(0.2),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dense(num_classes, activation='softmax')
    ])

model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
# `metrics` must be a list: newer Keras versions reject a bare string here.
model.compile(loss=CategoricalCrossentropy(),
              optimizer=keras.optimizers.Adam(learning_rate=2e-4),
              metrics=['CategoricalAccuracy'])

In [None]:
# Stop once val_loss has failed to improve by at least 0.001 for 5 epochs,
# restoring the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    baseline=None,
    verbose=0,
)

# Multiply the learning rate by 0.7 when val_loss stalls, never going below 1e-6.
# NOTE(review): both callbacks use patience=5, so the LR reduction may rarely
# get a chance to act before early stopping fires — confirm this is intended.
plateau = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.7,
    patience=5,
    min_delta=0.0001,
    cooldown=0,
    min_lr=1e-6,
    verbose=0,
)

In [None]:
# Train the MLP for up to 100 epochs, monitoring the held-out split so the
# early-stopping and LR-plateau callbacks can act on val_loss.
history = model.fit(
    x=X_train,
    y=y_train_MLP,
    validation_data=(X_val, y_val_MLP),
    epochs=100,
    batch_size=256,
    callbacks=[es, plateau],
    verbose=2,
)

In [None]:
# Model 2: LightGBM
# Fixed settings shared by the tuning objective and the final model.
params = dict(
    objective='multiclass',
    num_class=9,
    metric='multi_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    bagging_freq=1,
)

In [None]:
def objective(trial):
    """Optuna objective: cross-validated log loss of a LightGBM classifier.

    Samples one set of hyperparameters from ``trial``, builds an
    ``LGBMClassifier`` on top of the shared ``params`` dict and scores it with
    5-fold cross-validation on ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean log loss across the folds (positive; lower is better).
    """
    num_iterations = trial.suggest_int('num_iterations', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    num_leaves = trial.suggest_int('num_leaves', 10, 30)
    # suggest_float is the current API for the deprecated suggest_uniform and
    # samples from the same uniform range.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
    subsample = trial.suggest_float('subsample', 0.1, 0.5)
    # The parameter name (with a space) is kept as-is so the keys reported in
    # study.best_params do not change.
    feature_fraction = trial.suggest_float('feature fraction', 0.5, 0.9)
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-5, 20)
    # min_child_samples / min_child_weight are deliberately not tuned here.

    # Named `candidate` so it does not shadow the notebook-level Keras `model`.
    candidate = LGBMClassifier(**params,
            num_iterations = num_iterations,
            max_depth = max_depth,
            num_leaves = num_leaves,
            learning_rate = learning_rate,
            subsample = subsample,
            feature_fraction = feature_fraction,
            lambda_l2 = lambda_l2
            )

    # scikit-learn expects a 1-D label vector; y_train is a one-column frame.
    labels = np.ravel(y_train)
    # The scorer returns the negated log loss, so flip the sign for minimisation.
    nll = cross_val_score(candidate, X_train, labels, scoring = 'neg_log_loss', cv = 5).mean()
    return -1*nll

In [None]:
# Seeded TPE sampler so the hyperparameter search is repeatable.
sampler = TPESampler(seed=1111)
study = optuna.create_study(sampler=sampler, direction='minimize')

# Only a single trial is run in this cell.
study.optimize(objective, n_trials=1)

print('numbers of the finished trials:', len(study.trials))
print(study.best_value)
print(study.best_params)

In [None]:
# Final LightGBM model with hard-coded hyperparameters
# (presumably copied from an earlier, longer Optuna study — confirm).
lgbm = LGBMClassifier(**params,
                    num_iterations = 381,
                    max_depth = 11,
                    num_leaves = 26,
                    learning_rate = 0.034036199129501656,
                    subsample = 0.3426597278002642,
                    feature_fraction = 0.6864483899756816,
                    #min_child_samples = 27,
                    #min_child_weight = 0.04781667419116532,
                    lambda_l2 = 4.742650116506535)

# `fit(verbose=...)` was removed from recent LightGBM releases and raises a
# TypeError there; logging is already silenced through `verbosity=-1` in params.
# Labels are flattened because the estimator expects a 1-D vector.
lgbm.fit(X_train, np.ravel(y_train))

In [None]:
# Third model: RF
# Seeded like the train/validation split so repeated runs give the same forest.
rf_model = RandomForestClassifier(n_estimators = 50,
                                  criterion = 'entropy',
                                  random_state = 2021)

# scikit-learn expects a 1-D label vector; y_train is a one-column frame.
rf_model.fit(X_train, np.ravel(y_train))

In [None]:
# Blending model: Logistic Regression
# The blender is trained on the validation split, using the raw features plus
# the class probabilities predicted by the MLP and by LightGBM.

X1 = model.predict(X_val.values)
X2 = lgbm.predict_proba(X_val)
#X3 = rf_model.predict_proba(X_val)
X = np.concatenate((X_val, X1, X2), axis = 1)

clf = LogisticRegression(multi_class = 'multinomial', fit_intercept = False)
# y_val is a one-column frame; flatten it to the 1-D vector scikit-learn expects.
# NOTE(review): the default iteration limit may not be enough for these unscaled
# inputs, and convergence warnings are silenced globally — worth checking.
model1 = clf.fit(X, np.ravel(y_val))

# Build the same feature layout for the test set.
test1 = model.predict(test.values)
test2 = lgbm.predict_proba(test)
#test3 = rf_model.predict_proba(test)

test_final1 = np.concatenate((test, test1, test2), axis = 1)

In [None]:
# Start from the sample submission and overwrite every column after the first
# with the blender's predicted class probabilities.
sub = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')
class_columns = sub.columns[1:]
sub[class_columns] = model1.predict_proba(test_final1)

sub.to_csv("my_submission.csv", index=False)
sub.head()