# Tabular Playground Series - Dec 2021 
### LightGBM for multi-class target
### Oversample the rare classes with duplicated rows and SMOTE
https://www.kaggle.com/stpeteishii/tps0521-lightgbm-optuna
https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

![](https://storage.googleapis.com/kaggle-competitions/kaggle/28007/logos/header.png?t=2021-06-30-01-10-51)

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import random
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train0 = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")

In [None]:
# Preview the first three training rows, transposed so each column reads as a row.
train0.head(3).T

In [None]:
# List every column name of the training frame.
print(train0.columns)

In [None]:
# Distinct Cover_Type labels found in the training data, then the same
# labels as an ascending list (used later to map indices back to labels).
Name0 = pd.unique(train0['Cover_Type'])
Name = list(Name0)
Name.sort()
print(Name)

In [None]:
# Positional indices 0..len(Name)-1 and the two lookup tables between a
# class label and its position in the sorted label list.
N = [*range(len(Name))]
normal_mapping = {label: position for position, label in enumerate(Name)}
reverse_mapping = dict(enumerate(Name))

In [None]:
# Number of training rows per Cover_Type class (shows the class imbalance).
train0['Cover_Type'].value_counts()

In [None]:
# Row subsets for Cover_Type 4, 5 and 6; these are the classes that get
# oversampled in the following cells.
train_add4 = train0.loc[train0['Cover_Type'].eq(4)]
train_add5 = train0.loc[train0['Cover_Type'].eq(5)]
train_add6 = train0.loc[train0['Cover_Type'].eq(6)]

In [None]:
# One copy of the class-4 and class-6 rows followed by ten copies of the
# class-5 rows, stacked into a single frame.
trainadd0 = [train_add4, train_add6] + [train_add5] * 10
trainadd1 = pd.concat(trainadd0)

In [None]:
# Split the duplicated rare-class rows into the feature matrix and label
# vector that SMOTE will resample; 'Id' is not a feature.
data_fsm = trainadd1.drop(columns=['Cover_Type', 'Id'])
target_fsm = trainadd1['Cover_Type']
print(data_fsm.shape, target_fsm.shape, sep='\n')

# SMOTE
https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

In [None]:
# Oversample the rare-class subset with SMOTE.
# k_neighbors=1: each synthetic row is interpolated towards the single
# nearest neighbour (presumably chosen because the class-5 rows are only
# repeated copies of very few originals -- confirm against value_counts).
# random_state is fixed so the synthetic rows are reproducible, in line
# with the seeded shuffle, train/test split and KFold used in this notebook.
smote = SMOTE(k_neighbors=1, random_state=2021)
data_smote, target_smote = smote.fit_resample(data_fsm, target_fsm)

In [None]:
# Per-class row counts after SMOTE resampling.
target_smote.value_counts()

In [None]:
# Re-attach the labels to the resampled features, then append those rows
# below the original training frame (they carry no 'Id', which is dropped
# before training anyway).
data_smoted = pd.concat([data_smote, target_smote], axis='columns')
train1 = pd.concat([train0, data_smoted], axis='index')
print(train1.shape[0])

In [None]:
# Shuffle the combined rows with a fixed seed: build the list of row
# positions, permute it in place, and reorder the frame positionally.
M = [*range(train1.shape[0])]
random.seed(2021)
random.shuffle(M)
train2 = train1.iloc[M, :]

In [None]:
# Final modelling inputs: feature frame (label and 'Id' removed) and label
# series. `train` is a second name for the same feature frame.
data = train2.drop(columns=['Cover_Type', 'Id'])
target = train2['Cover_Type']
train = data
# Class balance of the final training target.
target.value_counts()

In [None]:
# Feature column names, kept as a plain list for later column selection.
columns = list(data.columns)
print(columns)

In [None]:
def objective(trial,data=data,target=target):
    """Optuna objective: train one LightGBM classifier and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : pandas.DataFrame
        Feature frame; defaults to the module-level ``data`` bound when this
        function is defined.
    target : pandas.Series
        Class labels aligned with ``data``; defaults to the module-level
        ``target`` bound when this function is defined.

    Returns
    -------
    float
        Classification error (``1 - accuracy``) on the 20% hold-out split.
        Lower is better, so the study must use ``direction='minimize'``.
    """
    # Fixed seed so every trial is evaluated on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 128, 256),
        # Only classification objectives are meaningful for LGBMClassifier;
        # the parameter name 'objective' is kept so study plots that
        # reference it continue to work.
        'objective': trial.suggest_categorical('objective', ['multiclass', 'multiclassova']),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.04, 0.2),
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "bagging_freq": trial.suggest_int('bagging_freq', 1, 10),
        "bagging_fraction": trial.suggest_float('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_float('feature_fraction', 0.4, 1.0),
        # Logging level is not a hyperparameter: keep it fixed (and silent)
        # so it neither pollutes the search space nor the best-params dict.
        "verbosity": -1,
    }
    model = lgb.LGBMClassifier(**param)
    # Early stopping and log suppression go through callbacks; the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    preds = model.predict(test_x)
    # Cover_Type is a nominal label, so score with the misclassification
    # rate rather than an RMSE over the label codes.
    error_rate = 1.0 - accuracy_score(test_y, preds)

    return error_rate

In [None]:
# Create a study that minimises the value returned by `objective` and run
# 16 trials of hyperparameter search.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=16)
# Optional diagnostics (left disabled):
#print('Number of finished trials:', len(study.trials))
#print('Best trial:', study.best_trial.params)

In [None]:
# Tabular view of all trials in the study.
study.trials_dataframe()

In [None]:
# Plot the optimization history of the study: the objective value of each trial.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate plot relating the sampled hyperparameters to the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot: one panel per hyperparameter, objective value against the sampled value.
optuna.visualization.plot_slice(study)

In [None]:
# Contour plot of the objective value over the ('num_leaves', 'objective')
# parameter pair; both names must be parameters sampled by the study.
optuna.visualization.plot_contour(study, params=['num_leaves','objective'])

In [None]:
# Plot the estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Plot the empirical distribution function (EDF) of the study's objective values.
optuna.visualization.plot_edf(study)

In [None]:
# Hyperparameters of the best trial, as a plain dict that is unpacked into
# the final classifier's constructor.
Best_trial = study.best_params
print(Best_trial)

In [None]:
# Load the sample submission file; its row count sizes the prediction array
# and it is reused as the submission frame at the end.
sample = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")
print(sample.shape)

In [None]:
# Feature columns that will be fed to the model.
print(columns)

In [None]:
# Seeded, shuffled 5-fold splitter and the accumulator for the averaged
# class probabilities: one row per submission row, one column per class.
kf = KFold(n_splits=5, shuffle=True, random_state=48)
preds = np.zeros(shape=(len(sample), len(Name)))

In [None]:
# Train one classifier per fold with the best hyperparameters and average
# the folds' class probabilities for the test set into `preds`.
# NOTE: `preds` is accumulated in place, so re-running this cell without
# re-creating `preds` adds the predictions a second time.
X_test = test[columns]  # loop-invariant: select the feature columns once
for trn_idx, test_idx in kf.split(train[columns],target):
    X_tr,X_val=train[columns].iloc[trn_idx],train[columns].iloc[test_idx]
    y_tr,y_val=target.iloc[trn_idx],target.iloc[test_idx]

    model = lgb.LGBMClassifier(**Best_trial)
    # Early stopping and log suppression go through callbacks; the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val,y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # Each fold contributes 1/n_splits of the final probability estimate.
    preds+=model.predict_proba(X_test)/kf.n_splits   ###### predict_proba
    # Fold diagnostic: RMSE of the predicted labels on the validation fold.
    # The square root is taken explicitly because the `squared=False`
    # argument of mean_squared_error is removed in recent scikit-learn.
    rmse=np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    print(rmse)

In [None]:
# Sanity check: shape of the averaged probability array and the class
# probabilities of the first test row.
print(preds.shape)
print(preds[0])

In [None]:
# Turn each row of class probabilities into a label: take the position of
# the largest probability and map it back to the original Cover_Type value.
preds2 = [reverse_mapping[np.argmax(class_probs)] for class_probs in preds]
print(preds2[:5])
# Distribution of the predicted labels.
pd.Series(preds2).value_counts()

In [None]:
# Build and write the submission file.
# `subm` is another name for `sample` (no copy), so the assignment below
# also overwrites the 'Cover_Type' column of `sample`.
subm = sample
subm['Cover_Type'] = preds2
subm.to_csv('submission.csv',index=False)
# Display the submission frame as the cell output.
subm