In [None]:
import gc

import pandas as pd
import numpy as np

from scipy.sparse import hstack

from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score,cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2, SelectPercentile
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

import optuna
import lightgbm as lgbm
import xgboost as xgb
import pickle
import category_encoders as ce
from catboost import CatBoostClassifier, Pool

import string
import re

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers

from keras.models import Model

In [None]:
# Read the competition's training table, test table and submission template.
train, test, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Display the first rows of the training table as a quick sanity check.
train.head()

In [None]:
# Display the first rows of the test table as a quick sanity check.
test.head()

## Logistic Regression

# Feature frames: every column except the row id (and, for train, the label).
X = train.drop(columns=['id', 'target'])
y = train['target'].values

X_test = test.drop(columns=['id'])
random_seed = 0

# Fit the one-hot encoder on train and test stacked together so both parts
# share the same encoded columns. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
encoder = OneHotEncoder()
all_encoded = encoder.fit_transform(pd.concat([X, X_test])).tocsr()

# Split the stacked matrix back into its train / test rows using one row count.
n_train = len(X)
X = all_encoded[:n_train]
X_test = all_encoded[n_train:]
# Keyword arguments unpacked into LogisticRegression by the training loop.
params = dict(
    penalty='l2',
    multi_class='ovr',
    solver='lbfgs',
    C=0.01,
    max_iter=10000,
    class_weight=None,
)

# Stratified k-fold CV: in every fold a sigmoid-calibrated logistic regression
# is fitted once per seed, and the per-seed probabilities are averaged.
name = 'Logistic_regression'
k = 5
seed_list = [0, 1, 2]
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

oof = np.zeros((len(train), 9))  # out-of-fold probabilities, 9 class columns
test_preds_list = []             # one test probability matrix per (fold, seed)
score_list = []                  # validation log-loss of each fold

splits = list(kf.split(X, y))
for fold, (train_idx, val_idx) in enumerate(splits, start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    val_preds_list = []
    for seed in seed_list:
        base_model = LogisticRegression(random_state=seed, **params)
        model = CalibratedClassifierCV(base_model, method='sigmoid', cv=k)
        model.fit(X_train, y_train)

        val_preds_list.append(model.predict_proba(X_val))
        test_preds_list.append(model.predict_proba(X_test))

    # The seed-averaged validation probabilities fill this fold's OOF rows.
    oof[val_idx] = np.mean(val_preds_list, axis=0)
    score = log_loss(y_val, oof[val_idx])
    score_list.append(score)
    print(f'fold: {fold}, log_loss: {score}')

cv_logloss = np.mean(score_list)
print(f'{name}, log_loss: {cv_logloss}')

# Average every (fold, seed) test prediction into the final submission.
preds = np.mean(test_preds_list, axis=0)

class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
sample_sub[class_columns] = preds
sample_sub.to_csv('submission_1.csv', index=None)

## Random Forest

In [None]:
# Start again from the raw frames: labels, then features without id/label.
y = train['target'].values
X = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

random_seed = 0

In [None]:
# Ordinal-encode train and test stacked together so both share one mapping.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
encoder = OrdinalEncoder()
all_encoded = encoder.fit_transform(pd.concat([X, X_test]))

# Split the stacked array back into train / test rows.
n_train = len(X)
X = all_encoded[:n_train]
X_test = all_encoded[n_train:]

In [None]:
# Keyword arguments unpacked into RandomForestClassifier by the training loop.
# 'sqrt' replaces max_features='auto', which scikit-learn deprecated in 1.1 and
# removed in 1.3; for classifiers 'auto' meant sqrt(n_features).
params = {
    'bootstrap': True,
    'max_depth': 30,
    'max_features': 'sqrt',
    'min_samples_leaf': 10,
    'min_samples_split': 5,
    'n_estimators': 500,
}

In [None]:
# Stratified k-fold CV: in every fold a sigmoid-calibrated random forest is
# fitted once per seed, and the per-seed probabilities are averaged.
name = 'Random_forest'
k = 5
seed_list = [0, 1, 2]
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

oof = np.zeros((len(train), 9))  # out-of-fold probabilities, 9 class columns
test_preds_list = []             # one test probability matrix per (fold, seed)
score_list = []                  # validation log-loss of each fold

splits = list(kf.split(X, y))
for fold, (train_idx, val_idx) in enumerate(splits, start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    val_preds_list = []
    for seed in seed_list:
        base_model = RandomForestClassifier(random_state=seed, **params)
        model = CalibratedClassifierCV(base_model, method='sigmoid', cv=k)
        model.fit(X_train, y_train)

        val_preds_list.append(model.predict_proba(X_val))
        test_preds_list.append(model.predict_proba(X_test))

    # The seed-averaged validation probabilities fill this fold's OOF rows.
    oof[val_idx] = np.mean(val_preds_list, axis=0)
    score = log_loss(y_val, oof[val_idx])
    score_list.append(score)
    print(f'fold: {fold}, log_loss: {score}')

cv_logloss = np.mean(score_list)
print(f'{name}, log_loss: {cv_logloss}')

# Average every (fold, seed) test prediction into the final submission.
preds = np.mean(test_preds_list, axis=0)

class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
sample_sub[class_columns] = preds
sample_sub.to_csv('submission_2.csv', index=None)

## LightGBM

In [None]:
# Start again from the raw frames: labels, then features without id/label.
y = train['target'].values
X = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

random_seed = 0

In [None]:
# Ordinal-encode train and test stacked together so both share one mapping.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
encoder = OrdinalEncoder()
all_encoded = encoder.fit_transform(pd.concat([X, X_test]))

# Split the stacked array back into train / test rows.
n_train = len(X)
X = all_encoded[:n_train]
X_test = all_encoded[n_train:]

In [None]:
# Base LightGBM hyper-parameters; the training cell adds the objective,
# device and metric settings on top of these.
params = dict(
    learning_rate=.02,
    max_depth=3,
    num_leaves=6,
    min_split_gain=0.17865452483871047,
    reg_alpha=9.540720621520459,
    reg_lambda=4.5781292529661375,
    colsample_bytree=0.0644950794287173,
    subsample=0.9314592865852914,
    subsample_freq=7,
    min_child_samples=57,
)

In [None]:
# Copy the tuned hyper-parameters so the additions below do not mutate `params`.
params_lgbm = dict(params)
params_lgbm.update({
    'boosting_type': 'gbdt',
    'device': 'gpu',
    'objective': 'multiclass',   # was misspelled 'multiclasss'
    'num_class': 9,              # was `= 9,`, which stored the tuple (9,)
    'metric': 'multi_logloss',
    'verbosity': -1,
    'n_estimators': 500,
})

# Stratified k-fold CV: in every fold one LightGBM classifier is fitted per
# seed, and the per-seed probabilities are averaged.
name = 'lighgbm_3seeds_5fold'
k = 5
seed_list = [0, 1, 2]
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

# Fresh out-of-fold buffer. The original assigned it to `off` by mistake, so
# the loop wrote into whatever `oof` array an earlier cell had left behind.
oof = np.zeros((len(train), 9))
test_preds_list = []             # one test probability matrix per (fold, seed)
score_list = []                  # validation log-loss of each fold

splits = list(kf.split(X, y))
for fold, (train_idx, val_idx) in enumerate(splits, start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    val_preds_list = []

    for seed in seed_list:
        params_lgbm['random_state'] = seed
        model = lgbm.LGBMClassifier(**params_lgbm)

        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments
        # were removed in LightGBM 4.0 in favour of callbacks - confirm the
        # installed version before upgrading.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=100,
                  eval_names=['train', 'val'],
                  verbose=200)

        val_preds_list.append(model.predict_proba(X_val))
        test_preds_list.append(model.predict_proba(X_test))

    # The seed-averaged validation probabilities fill this fold's OOF rows.
    oof[val_idx] = np.mean(val_preds_list, axis=0)
    score = log_loss(y_val, oof[val_idx])
    print(f'fold: {fold}, log_loss: {score}')
    score_list.append(score)

cv_logloss = np.mean(score_list)
print(f'{name}, log_loss: {cv_logloss}')

# Average every (fold, seed) test prediction into the final submission.
preds = np.mean(test_preds_list, axis=0)

sample_sub[['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']] = preds
sample_sub.to_csv('submission_3.csv', index=None)

## XGBoost

In [None]:
# Feature frames without the id / label columns.
X = train.drop(columns=['id', 'target'])
# Map the 'Class_1'..'Class_9' labels to the integer ids 0..8 that are later
# passed as DMatrix labels.
y = train['target'].map({f'Class_{i + 1}': i for i in range(9)}).values
X_test = test.drop(columns=['id'])
random_seed = 0

# Ordinal-encode train and test stacked together so both share one mapping.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
encoder = OrdinalEncoder()
all_encoded = encoder.fit_transform(pd.concat([X, X_test]))

# Split the stacked array back into train / test rows.
n_train = len(X)
X = all_encoded[:n_train]
X_test = all_encoded[n_train:]

In [None]:
# Base XGBoost hyper-parameters, grouped by role and merged in the original
# key order; the training cell adds device, objective and class count.
_xgb_regularisation = {
    'lambda': 1.3718620937297796,
    'alpha': 6.395781966352342,
}
_xgb_sampling = {
    'colsample_bytree': 0.2390564723786096,
    'colsample_bynode': 0.7459555518737353,
    'colsample_bylevel': 0.36002014547566097,
    'subsample': 0.6302863949739616,
}
_xgb_boosting = {
    'eta': 0.01,
    'grow_policy': 'lossguide',
    'max_depth': 19,
    'min_child_weight': 28,
    'max_bin': 258,
    'deterministic_histogram': False,
}
params = {**_xgb_regularisation, **_xgb_sampling, **_xgb_boosting}

In [None]:
# Copy the tuned hyper-parameters so the additions below do not mutate `params`.
params_xgb = dict(params)
params_xgb.update({
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'multi:softprob',
    'num_class': 9,
})

# Stratified k-fold CV: in every fold one booster is trained per seed, and the
# per-seed probabilities are averaged.
name = 'xgboost_3seeds_5fold'
k = 5
seed_list = [0, 1, 2]
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)
oof = np.zeros((len(train), 9))  # out-of-fold probabilities, 9 class columns
test_preds_list = []             # one test probability matrix per (fold, seed)
score_list = []                  # validation log-loss of each fold

# The test matrix does not depend on fold or seed, so build it once.
dtest = xgb.DMatrix(data=X_test)

splits = list(kf.split(X, y))
for fold, (train_idx, val_idx) in enumerate(splits, start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # The fold matrices are identical for every seed, so build them per fold.
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dval = xgb.DMatrix(data=X_val, label=y_val)

    val_preds_list = []

    for seed in seed_list:
        params_xgb['seed'] = seed

        model = xgb.train(params_xgb, dtrain,
                          evals=[(dtrain, 'train'), (dval, 'val')],
                          verbose_eval=100,
                          early_stopping_rounds=100,
                          num_boost_round=100000)

        # xgb.train returns the booster from the last round, not the best one.
        # Restrict prediction to the rounds up to the best iteration so the
        # rounds trained after the early-stopping optimum are ignored.
        best_rounds = (0, model.best_iteration + 1)
        val_preds_list.append(model.predict(dval, iteration_range=best_rounds))
        test_preds_list.append(model.predict(dtest, iteration_range=best_rounds))

    # The seed-averaged validation probabilities fill this fold's OOF rows.
    oof[val_idx] = np.mean(val_preds_list, axis=0)
    score = log_loss(y_val, oof[val_idx])
    print(f"fold: {fold},log_loss: {score}")
    score_list.append(score)

cv_logloss = np.mean(score_list)
print(f"{name} ,log_loss: {cv_logloss}")

# Average every (fold, seed) test prediction into the final submission.
preds = np.mean(test_preds_list, axis=0)

sample_sub[['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']] = preds
sample_sub.to_csv('submission_4.csv', index=None)

## CatBoost

In [None]:
# Feature frames without the id / label columns; labels stay as strings.
X = train.drop(columns=['id', 'target'])
y = train['target'].values
X_test = test.drop(columns=['id'])
random_seed = 0

# Ordinal-encode train and test stacked together so both share one mapping.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
encoder = OrdinalEncoder()
all_encoded = encoder.fit_transform(pd.concat([X, X_test]))

# Split the stacked array back into train / test rows.
n_train = len(X)
X = all_encoded[:n_train]
X_test = all_encoded[n_train:]

# Cast the float codes to integers and declare every column categorical.
X = X.astype(int)
X_test = X_test.astype(int)
cat_features = list(range(X.shape[1]))

In [None]:
# Base CatBoost hyper-parameters; the training cell adds the loss function,
# overfitting detector, device and categorical-feature settings.
params = dict(
    learning_rate=0.03470328317940195,
    depth=2,
    l2_leaf_reg=820.7804346737378,
    random_strength=0.336019499813798,
    border_count=128,
    grow_policy='Lossguide',
    min_data_in_leaf=267,
)

In [None]:
# Copy the tuned hyper-parameters so the additions below do not mutate `params`.
params_cb = dict(params)
params_cb.update({
    'loss_function': 'MultiClass',
    'od_wait': 1000,
    'od_type': 'Iter',
    'task_type': "GPU",
    'cat_features': cat_features,
})

# Stratified k-fold CV: in every fold one CatBoost model is fitted per seed,
# and the per-seed probabilities are averaged.
name = 'catboost_3seeds_5fold'
k = 5
seed_list = [0, 1, 2]
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)
oof = np.zeros((len(train), 9))  # out-of-fold probabilities, 9 class columns
test_preds_list = []             # one test probability matrix per (fold, seed)
score_list = []                  # validation log-loss of each fold

splits = list(kf.split(X, y))
for fold, (train_idx, val_idx) in enumerate(splits, start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    val_preds_list = []

    for seed in seed_list:
        params_cb['random_state'] = seed

        # use_best_model is set once here; the original repeated it in fit().
        model = CatBoostClassifier(**params_cb,
                                   iterations=50000,
                                   use_best_model=True)

        model.fit(X_train, y=y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=100)

        val_preds_list.append(model.predict_proba(X_val))
        test_preds_list.append(model.predict_proba(X_test))

    # The seed-averaged validation probabilities fill this fold's OOF rows.
    oof[val_idx] = np.mean(val_preds_list, axis=0)
    score = log_loss(y_val, oof[val_idx])
    print(f"fold: {fold},log_loss: {score}")
    score_list.append(score)

cv_logloss = np.mean(score_list)
print(f"{name} ,log_loss: {cv_logloss}")

# Average every (fold, seed) test prediction into the final submission.
preds = np.mean(test_preds_list, axis=0)

sample_sub[['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']] = preds
sample_sub.to_csv('submission_5.csv', index=None)

## CNN

In [None]:
# One indicator column per distinct label value, used as the network's targets.
targets = pd.get_dummies(train.loc[:, 'target'])

In [None]:
def custom_metric(y_true, y_pred):
    """Categorical cross-entropy on probabilities clipped to [1e-15, 1 - 1e-15].

    The predictions are clipped, scored with the module-level ``cce`` loss
    object, and the result is reduced with ``K.mean``.
    """
    clipped = K.clip(y_pred, 1e-15, 1-1e-15)
    return K.mean(cce(y_true, clipped))

# Loss object used inside custom_metric.
cce = tf.keras.losses.CategoricalCrossentropy()

# Both callbacks watch the validation value of custom_metric and treat lower
# as better.
_watch_val_metric = dict(monitor='val_custom_metric', mode='min', verbose=0)

# Stop after 5 epochs without an improvement of at least 1e-05 and roll the
# model back to its best weights.
es = tf.keras.callbacks.EarlyStopping(
    min_delta=1e-05,
    patience=5,
    baseline=None,
    restore_best_weights=True,
    **_watch_val_metric)

# Multiply the learning rate by 0.7 after 2 epochs without improvement.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.7,
    patience=2,
    **_watch_val_metric)

In [None]:
def conv_model(n_features=75, vocab_size=354, n_classes=9):
    """Build the embedding + kernel-size-1 convolution classifier (uncompiled).

    Each integer input is looked up in a shared, L2-regularised embedding,
    passed through a ``Conv1D`` with kernel size 1 and flattened. Three
    weight-normalised dense layers follow; the second and third also receive
    the outputs of the earlier stages through concatenation. A softmax layer
    produces the class probabilities.

    Args:
        n_features: number of integer inputs per sample (default 75, the
            value previously hard-coded).
        vocab_size: ``input_dim`` of the embedding (default 354, the value
            previously hard-coded); input values must be smaller than this.
        n_classes: number of softmax outputs (default 9).

    Returns:
        An uncompiled Keras model mapping ``(batch, n_features)`` integers to
        ``(batch, n_classes)`` probabilities.
    """
    # `(n_features,)` is a real 1-tuple; the original `(75)` was a bare int.
    conv_inputs = layers.Input(shape=(n_features,))
    embed = layers.Embedding(input_dim=vocab_size,
                             output_dim=7,
                             embeddings_regularizer='l2')(conv_inputs)
    embed = layers.Conv1D(12, 1, activation='relu')(embed)
    embed = layers.Flatten()(embed)
    hidden = layers.Dropout(0.3)(embed)

    hidden = tfa.layers.WeightNormalization(
        layers.Dense(
            units=32,
            activation='selu',
            kernel_initializer="lecun_normal"))(hidden)

    # Second dense stage sees the flattened embedding and the first stage.
    output = layers.Dropout(0.3)(layers.Concatenate()([embed, hidden]))
    output = tfa.layers.WeightNormalization(
        layers.Dense(
            units=32,
            activation='relu',
            kernel_initializer="lecun_normal"))(output)

    # Third dense stage sees the embedding and both earlier stages.
    output = layers.Dropout(0.4)(layers.Concatenate()([embed, hidden, output]))
    output = tfa.layers.WeightNormalization(
        layers.Dense(
            units=32,
            activation='relu',
            kernel_initializer="lecun_normal"))(output)

    conv_outputs = layers.Dense(
        units=n_classes,
        activation='softmax',
        kernel_initializer="lecun_normal")(output)

    # Use the tensorflow.keras Model so it comes from the same package as the
    # layers above, rather than the standalone `keras` import.
    model = keras.Model(conv_inputs, conv_outputs)

    return model

In [None]:
N_FOLDS = 5
SEED = 2021
EPOCH = 60

# Out-of-fold and fold-averaged test probabilities, 9 class columns each.
oof_NN_a = np.zeros((train.shape[0], 9))
pred_NN_a = np.zeros((test.shape[0], 9))

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Feature columns are selected by position: everything between the first and
# the last column of `train` (presumably id and target - verify against the
# CSV layout), and everything after the first column of `test`.
train_features = train.iloc[:, 1:-1]
test_features = test.iloc[:, 1:]

for fold, (tr_idx, ts_idx) in enumerate(skf.split(train, train.iloc[:, -1])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")

    # Here X_test / y_test hold this fold's validation rows.
    X_train, y_train = train_features.iloc[tr_idx], targets.iloc[tr_idx]
    X_test, y_test = train_features.iloc[ts_idx], targets.iloc[ts_idx]

    K.clear_session()

    print("\n-----Convolution model Training----\n")

    model_conv = conv_model()
    model_conv.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=2e-4),
        metrics=custom_metric)

    model_conv.fit(
        X_train, y_train,
        batch_size=256,
        epochs=EPOCH,
        validation_data=(X_test, y_test),
        callbacks=[es, plateau],
        verbose=0)

    # Score the fold on its validation rows and store them out-of-fold.
    pred_a = model_conv.predict(X_test)
    oof_NN_a[ts_idx] += pred_a
    score_NN_a = log_loss(y_test, pred_a)
    print(f"\nFOLD {fold} Score convolution model: {score_NN_a}\n")

    # Each fold contributes an equal share to the averaged test prediction.
    pred_NN_a += model_conv.predict(test_features) / N_FOLDS

score_a = log_loss(targets, oof_NN_a)
print(f"\n=== FINAL SCORE CONVOLUTION MODEL : {score_a}===\n")

In [None]:
# Alias for the fold-averaged test predictions of the convolution model.
pred_embedding = pred_NN_a

In [None]:
# Column i of the prediction matrix becomes submission column 'Class_{i+1}'.
for class_pos in range(9):
    sample_sub[f'Class_{class_pos + 1}'] = pred_embedding[:, class_pos]

In [None]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv("submission_6.csv", index=False)