In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Load the competition's train and test tables. The paths are relative to the
# notebook's working directory (Kaggle input layout).
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')

In [None]:
# (rows, columns) of both frames as a quick sanity check.
train.shape,test.shape

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# Missing-value count per column, largest first.
train.isna().sum().sort_values(ascending=False)

In [None]:
# Bar chart of how many training rows fall into each target class.
# plt.subplots returns a (Figure, Axes) pair, so unpack both and draw on the
# explicit Axes instead of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize = (10,5))
sns.countplot(x='target',data=train,ax=ax)
ax.set_title('Target class distribution');

In [None]:
# Names of the feature columns: everything except the first and last column
# (presumably `id` and `target` -- consistent with the columns dropped when X is built).
col = list(train.columns[1:-1])

In [None]:
# Summary statistics of the numeric columns (id excluded), one row per column.
summary = train.drop(columns=['id']).describe().T

# Render with the mean as in-cell bars and the std / median columns shaded.
(
    summary.style
    .bar(subset=['mean'],color='#606ff2')
    .background_gradient(subset=['std'],cmap='PuBu')
    .background_gradient(subset=['50%'],cmap='PuBu')
)

## Label Encoding and Scaling

In [None]:
# Replace the target labels with integer codes; `le` keeps the mapping so the
# codes can be translated back to the original labels later.
le = LabelEncoder()
encoded = le.fit_transform(train['target'])
train = train.assign(target = encoded)

# Show which integer code was assigned to which original label.
for code, label in enumerate(le.classes_):
    print(code,label)

In [None]:
# Standardise the feature columns. The scaler is fitted on the training
# features only and the same fitted transform is then applied to the test
# features. Both frames are overwritten in place.
scaler = StandardScaler()
train[col] = scaler.fit_transform(train[col])
test[col] = scaler.transform(test[col])

In [None]:
# Spot-check the first rows of both frames after encoding and scaling.
print(train.head())
print(test.head())

In [None]:
# Feature matrix: every training column except the row id and the label.
X = train.drop(columns=['id','target'])
# Label vector (integer-encoded target).
y = train['target']
# Shapes of the training features, labels and test features.
X.shape,y.shape,test[col].shape

## CatBoost Classifier

In [None]:
# Keyword arguments for CatBoostClassifier.
# NOTE(review): the unrounded values look like the output of an automated
# hyperparameter search -- confirm their provenance before re-tuning.
cat_params = {
    # Boosting schedule.
    'iterations': 10143,
    # NOTE(review): the CV cell also passes early_stopping_rounds=100 to fit();
    # presumably that takes precedence over od_wait -- confirm.
    'od_wait': 1115,
    'learning_rate': 0.02248589308956038,

    # Regularisation and sampling.
    'reg_lambda': 86.12583478104304,
    'subsample': 0.08594672381075155,
    'random_strength': 29.926327447041192,

    # Tree shape and leaf estimation.
    'depth': 6,
    'min_data_in_leaf': 30,
    'leaf_estimation_iterations': 3,

    # Objective, metric and algorithm choices.
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',

    # Reproducibility and hardware (GPU training needs a GPU-enabled runtime).
    'random_state': 42,
    'task_type': 'GPU',
}

In [None]:
# 10-fold stratified cross-validation for CatBoost.
# For every fold: fit a fresh model on the training part, early-stop on the
# held-out part, record the held-out log loss, and add the model's test-set
# class probabilities to a running sum. The sum is divided by the number of
# folds exactly once, after the loop, so every fold contributes equally.
# (Dividing inside the loop would shrink earlier folds again on every
# iteration and leave rows that no longer sum to 1.)
skf = StratifiedKFold(n_splits=10,random_state=42,shuffle=True)

X_all, y_all = X.values, y.values
X_test = test[col].values   # loop-invariant, so extract it once

preds = None   # running SUM of per-fold test probabilities
l=[]           # held-out log loss of each fold
n=0            # number of folds completed so far
for tr_idx, test_idx in skf.split(X_all,y_all):
    # NB: `test_idx` indexes the held-out validation rows of the training
    # data, not the competition test set.
    X_tr, X_val = X_all[tr_idx], X_all[test_idx]
    y_tr, y_val = y_all[tr_idx], y_all[test_idx]

    model = CatBoostClassifier(**cat_params)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)

    fold_preds = model.predict_proba(X_test)
    preds = fold_preds if preds is None else preds + fold_preds

    l.append(log_loss(y_val, model.predict_proba(X_val)))
    print(n+1,l[n])
    n+=1

# Turn the sum into the mean over folds.
preds = preds/skf.n_splits

In [None]:
# Assemble the CatBoost submission: `id` first, then one probability column
# per class, in the order of the model's predict_proba output.
class_cols = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']
df_kfold_cat = pd.DataFrame(preds,columns=class_cols)
df_kfold_cat.insert(0,'id',test['id'])

# to_csv returns None when given a path, so `output` is just None.
output = df_kfold_cat.to_csv('submit_1.csv',index=False)

## LGBM Classifier

In [None]:
# Keyword arguments for LGBMClassifier.
# NOTE(review): the unrounded values look like the output of an automated
# hyperparameter search -- confirm their provenance before re-tuning.
params_lgbm = {
    # Boosting schedule (n_estimators is an upper bound; the CV cell early-stops).
    'learning_rate': 0.03193398814609538,
    'n_estimators': 10000,

    # Tree shape / leaf constraints.
    'max_depth': 100,
    'min_child_samples': 263,
    'min_child_weight': 0.00038121415013974824,

    # Objective and evaluation metric.
    'objective': 'multiclass',
    'metric': 'multi_logloss',

    # Reproducibility.
    'random_state': 42,
}

In [None]:
# 10-fold stratified cross-validation for LightGBM.
# For every fold: fit a fresh model on the training part, early-stop on the
# held-out part, record the held-out log loss, and add the model's test-set
# class probabilities to a running sum. The sum is divided by the number of
# folds exactly once, after the loop, so every fold contributes equally.
# (Dividing inside the loop would shrink earlier folds again on every
# iteration and leave rows that no longer sum to 1.)
skf = StratifiedKFold(n_splits = 10 , random_state = 13 , shuffle = True)

X_all, y_all = X.values, y.values
X_test = test[col].values   # loop-invariant, so extract it once

preds2 = None   # running SUM of per-fold test probabilities
ll =[]          # held-out log loss of each fold
n = 0           # number of folds completed so far
for tr_idx, test_idx in skf.split(X_all,y_all):
    # NB: `test_idx` indexes the held-out validation rows of the training
    # data, not the competition test set.
    X_tr, X_val = X_all[tr_idx], X_all[test_idx]
    y_tr, y_val = y_all[tr_idx], y_all[test_idx]

    model = LGBMClassifier(**params_lgbm)

    # NOTE(review): newer LightGBM releases (4.x) no longer accept
    # `early_stopping_rounds` / `verbose` in fit() and expect callbacks
    # instead -- confirm against the installed version.
    model.fit(X_tr,y_tr,eval_set = [(X_val,y_val)],early_stopping_rounds = 200,verbose = False)

    fold_preds = model.predict_proba(X_test)
    preds2 = fold_preds if preds2 is None else preds2 + fold_preds

    ll.append(log_loss(y_val, model.predict_proba(X_val)))
    print(n+1,ll[n])
    n += 1

# Turn the sum into the mean over folds.
preds2 = preds2/skf.n_splits

In [None]:
# Mean held-out log loss across the LightGBM folds.
np.mean(ll)

In [None]:
# Assemble the LightGBM prediction frame: `id` first, then one probability
# column per class, in the order of the model's predict_proba output.
class_cols = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']
df_kfold_lgb = pd.DataFrame(preds2,columns=class_cols)
df_kfold_lgb.insert(0,'id',test['id'])