In [None]:
import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



In [None]:
# Directory holding the competition CSV files.
file_path = '/kaggle/input/tabular-playground-series-jun-2021'

# Read the training and test tables from that directory.
train, test = (
    pd.read_csv(os.path.join(file_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the training frame, then of the test frame.
for frame in (train, test):
    display(frame.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output. Call it directly.
train.info()
test.info()

In [None]:
# Class balance of the target column before it is re-encoded.
train.target.value_counts()

In [None]:
# Map the labels 'Class_1' .. 'Class_9' onto the integer codes 0 .. 8.
target_replacement = {f'Class_{number}': number - 1 for number in range(1, 10)}

# Swap the string labels in the target column for their integer codes.
encoded_target = train['target'].replace(target_replacement)
train['target'] = encoded_target

In [None]:
# Re-check the class balance now that the target holds integer codes.
train.target.value_counts()

In [None]:
# Load the sample submission, using its 'id' column as the index.
sample_submission_path = os.path.join(file_path, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path, index_col='id')

In [None]:
# Preview the first rows of the sample submission frame loaded above.
submission.head()

Let's create the folds first; then we will do some feature selection, followed by a baseline prediction.

In [None]:
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
# Assign every training row to one of 10 stratified folds, stored in the
# 'kfold' column (initialised to -1, then overwritten with the fold number).
kf = StratifiedKFold(n_splits=10)
train["kfold"] = -1

# Shuffle the rows with a fixed seed: an unseeded shuffle gives different fold
# membership (and different CV scores) on every run, even though the models
# themselves are seeded.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
y = train.target.values

# Only the validation indices of each split are needed to label the rows.
for fold_id, (_train_idx, valid_idx) in enumerate(kf.split(X=train, y=y)):
    train.loc[valid_idx, 'kfold'] = fold_id



In [None]:
def predict_lgb(iteration):
    """Train one LightGBM model per fold and average their test predictions.

    Reads the module-level ``train`` frame (columns ``id``, ``target`` and
    ``kfold`` are dropped to form the features) and the module-level ``test``
    frame (``id`` is dropped before prediction).

    Parameters
    ----------
    iteration : int
        Number of folds to run; folds ``0 .. iteration - 1`` are each used
        once as the validation set.

    Returns
    -------
    test_lgb : numpy.ndarray of shape (len(test), 9)
        Equal-weight mean of the per-fold ``predict_proba`` outputs on test.
    evaluation_lgb : list of float
        Validation log loss of each fold, in fold order.
    """
    evaluation_lgb = []
    test_lgb = np.zeros((len(test), 9))
    params = {
        'bagging_freq': 1,
        'verbosity': -1,
        'seed': 42,
        'num_threads': -1,
        'feature_pre_filter': True,
        'objective': 'multiclass',
        'n_estimators': 2000,
        'metric': 'multi_logloss',
        'boosting': 'gbdt',
        'bagging_fraction': 0.6000000000000001,
        'feature_fraction': 0.5,
        'lambda_l1': 10,
        'lambda_l2': 0.1,
        'learning_rate': 0.060119000245064017,
        'max_depth': 8,
        'min_child_samples': 100,
        'num_leaves': 127
    }
    # The test features do not depend on the fold, so build them once.
    test_final = test.drop('id', axis=1)
    for fold in range(iteration):
        df_train = train[train.kfold != fold].reset_index(drop=True)
        df_valid = train[train.kfold == fold].reset_index(drop=True)
        y_train = df_train.target.values
        y_valid = df_valid.target.values
        x_train = df_train.drop(['id', 'target', 'kfold'], axis=1)
        x_valid = df_valid.drop(['id', 'target', 'kfold'], axis=1)
        clf = LGBMClassifier(**params)
        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
        # older LightGBM releases; newer releases expect callbacks instead —
        # confirm against the installed version.
        clf.fit(x_train, y_train, eval_metric='multi_logloss', eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=200, verbose=-1)
        pred_probs = clf.predict_proba(x_valid)
        logloss = log_loss(y_valid, pred_probs)
        evaluation_lgb.append(logloss)
        print(f'The logloss value of iteration {fold} is {logloss}')
        # Divide by the number of folds, not by the running fold index: dividing
        # by (fold + 1) weights early folds more and yields rows that do not
        # sum to 1.
        test_lgb += clf.predict_proba(test_final) / iteration
    return test_lgb, evaluation_lgb
    

In [None]:
# Number of folds to run through the LightGBM routine.
fold = 10
lgb_outputs = predict_lgb(iteration=fold)
test_lgb, evaluation_lgb = lgb_outputs

In [None]:
# Show the per-fold validation log-loss values returned by predict_lgb.
print(evaluation_lgb)

In [None]:
from catboost import CatBoostClassifier
def predict_cb(iteration):
    """Train one CatBoost model per fold and average their test predictions.

    Reads the module-level ``train`` frame (columns ``id``, ``target`` and
    ``kfold`` are dropped to form the features) and the module-level ``test``
    frame (``id`` is dropped before prediction).

    Parameters
    ----------
    iteration : int
        Number of folds to run; folds ``0 .. iteration - 1`` are each used
        once as the validation set.

    Returns
    -------
    test_cb : numpy.ndarray of shape (len(test), 9)
        Equal-weight mean of the per-fold ``predict_proba`` outputs on test.
    evaluation_cb : list of float
        Validation log loss of each fold, in fold order.
    """
    evaluation_cb = []
    test_cb = np.zeros((len(test), 9))
    # The test features do not depend on the fold, so build them once.
    test_final = test.drop('id', axis=1)
    for fold in range(iteration):
        df_train = train[train.kfold != fold].reset_index(drop=True)
        df_valid = train[train.kfold == fold].reset_index(drop=True)
        y_train = df_train.target.values
        y_valid = df_valid.target.values
        x_train = df_train.drop(['id', 'target', 'kfold'], axis=1)
        x_valid = df_valid.drop(['id', 'target', 'kfold'], axis=1)
        clf = CatBoostClassifier(n_estimators=2000,
                                 colsample_bylevel=0.06,
                                 max_leaves=31,
                                 subsample=0.67,
                                 verbose=0,
                                 bootstrap_type='Bernoulli',
                                 thread_count=6,
                                 random_state=42)
        clf.fit(x_train, y_train)
        pred_probs = clf.predict_proba(x_valid)
        logloss = log_loss(y_valid, pred_probs)
        evaluation_cb.append(logloss)
        print(f'The logloss value of iteration {fold} is {logloss}')
        # Divide by the number of folds, not by the running fold index: dividing
        # by (fold + 1) weights early folds more and yields rows that do not
        # sum to 1.
        test_cb += clf.predict_proba(test_final) / iteration
    return test_cb, evaluation_cb

In [None]:
# Number of folds to run through the CatBoost routine.
fold = 10
cb_outputs = predict_cb(iteration=fold)
test_cb, evaluation_cb = cb_outputs

In [None]:
# Print the per-fold log-loss values: LightGBM first, then CatBoost.
for fold_scores in (evaluation_lgb, evaluation_cb):
    print(fold_scores)

In [None]:
# Turn both per-fold score collections into NumPy arrays.
evaluation_lgb, evaluation_cb = (
    np.array(fold_scores) for fold_scores in (evaluation_lgb, evaluation_cb)
)

In [None]:
# Mean validation log loss per model: LightGBM first, then CatBoost.
for fold_scores in (evaluation_lgb, evaluation_cb):
    print(fold_scores.mean())

In [None]:
# Blend the two models with equal weight (element-wise mean of both arrays).
blended_sum = test_lgb + test_cb
test_preds = blended_sum / 2

In [None]:
# Copy column k of the blended predictions into "Class_{k+1}" for k = 0 .. 8.
for column_index in range(9):
    submission[f"Class_{column_index + 1}"] = test_preds[:, column_index]

In [None]:
# Write the blended predictions (index 'id' included) as a CSV file. The
# original file name had no extension; give it the .csv suffix so it is
# recognised as a CSV submission file.
submission.to_csv('blend_cb_lgb_3.csv')