# TPS0622 LGBM: Tabular Playground Series (June 2022), missing-value imputation with LightGBM


In [None]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import optuna
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from sklearn.linear_model import LinearRegression

In [None]:
# Load the competition table and take a first look at it:
# the first three rows, the column names, and the column count.
data=pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv')
display(data.head(3))
colsT=list(data.columns)
print(colsT)
print(len(colsT))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
data.info()

In [None]:
# Sentinel that will later stand in for NaN (1e-7 == 0.0000001).
# For every column, report how many cells already equal the sentinel;
# presumably a check that the sentinel does not collide with real values.
NAN = 1e-7
for item in colsT:
    sentinel_hits = int((data[item] == NAN).sum())
    print(item, sentinel_hits)

In [None]:
# Read the sample submission and split its 'row-col' key into an integer
# 'row' and a string 'col'.
sample = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv')
key_parts = [key.split('-') for key in sample['row-col']]
sample['row'] = [int(parts[0]) for parts in key_parts]
sample['col'] = [parts[1] for parts in key_parts]

# Keep an untouched copy as the submission template before the helper
# 'value' column below is overwritten.
submit = sample.copy()
display(sample.head(3))

# With value fixed to 1, the grouped sum is the number of requested cells
# per column.
sample['value'] = 1
cells_per_column = sample[['col', 'value']].groupby('col', as_index=False).sum()
display(cells_per_column)

# Sorted list of the columns that appear in the submission keys.
colsS = sorted(set(sample['col']))
print(colsS[:3])

In [None]:
# Replace every NaN with the sentinel, then split the rows on whether
# F_1_0 holds the sentinel (test) or not (train).
data = data.fillna(NAN)
f10_is_sentinel = data['F_1_0'] == NAN
train = data[~f10_is_sentinel]
test = data[f10_is_sentinel]

# Separate the target column from the remaining columns in both parts.
trainY = train['F_1_0']
trainX = train.drop(columns='F_1_0')
testY = test['F_1_0']
testX = test.drop(columns='F_1_0')
print(trainX.shape[0], testX.shape[0])

In [None]:
# Names of all columns of trainX (every column of data except F_1_0).
columns0 = list(trainX.columns)
print(columns0)

# Model Tuning

In [None]:
def objective(trial,data=trainX,target=trainY):
    """Optuna objective: hold-out RMSE of a LightGBM regressor.

    Parameters
    ----------
    trial : optuna trial
        Supplies the hyperparameter values through its ``suggest_*`` methods.
    data, target
        Features and target to tune on. The defaults are the ``trainX`` /
        ``trainY`` objects that exist when this ``def`` is executed; rebinding
        those names later does not change the defaults.

    Returns
    -------
    float
        RMSE on a fixed 20% hold-out split (``random_state=42``). The same
        split is also used as the early-stopping evaluation set.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 150, 200),
        'objective': 'regression',
        'max_depth': trial.suggest_int('max_depth', 19, 20),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.100, 0.104, log=True),
        'boosting': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-7, 1e-6, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-5, 1e-4, log=True),
        'bagging_freq': trial.suggest_int('bagging_freq', 4, 5),
        # suggest_float without log replaces the deprecated suggest_uniform.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.68, 0.70),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 0.8),
        # Log level is not a hyperparameter: searching it wasted a dimension
        # and the former 8..10 range only enabled debug output. Keep it quiet.
        'verbosity': -1,
    }
    # n_estimators is not set here, so the library default applies.
    model = lgb.LGBMRegressor(**param)
    # Early stopping goes through a callback: the early_stopping_rounds= and
    # verbose= keywords of fit() are no longer accepted by LightGBM 4.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
    )
    preds = model.predict(test_x)
    # Take the root explicitly instead of squared=False, which newer
    # scikit-learn releases removed from mean_squared_error.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [None]:
# Run the hyperparameter search: 400 trials, minimising the value returned
# by objective, then report the trial count and the best parameter set.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=400)
finished_trials = study.trials
best_params = study.best_trial.params
print('Number of finished trials:', len(finished_trials))
print('Best trial:', best_params)

In [None]:
# Tabulate the trials of the study; as the last expression of the cell the
# result is rendered as the cell output.
study.trials_dataframe()

In [None]:
# Plot the objective value (the RMSE returned by objective) of every trial.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Interactive parallel-coordinate plot of the sampled hyperparameters and
# their scores.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plot of the study: objective value against each hyperparameter,
# which shows how the search evolved.
optuna.visualization.plot_slice(study)

In [None]:
# Interactive contour plot of the interaction between max_depth and
# learning_rate.
optuna.visualization.plot_contour(study, params=['max_depth','learning_rate'])

In [None]:
# Visualize the importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# Visualize the empirical distribution function (EDF) of the study's
# objective values.
optuna.visualization.plot_edf(study)

In [None]:
# Keep the best trial's parameters; they are later unpacked into
# lgb.LGBMRegressor(**Best_trial) for the final models.
# NOTE(review): presumably this dict holds only values drawn through
# trial.suggest_*, not the fixed entries of `param` in objective
# ('objective', 'boosting') - confirm the library defaults are acceptable.
Best_trial=study.best_trial.params
print(Best_trial)

# Prediction

In [None]:
# Impute every column listed in the submission. For each target column:
# rows where it holds the NAN sentinel are the rows to predict, all other
# rows are training data. Five LightGBM models (one per CV fold) are trained
# with the tuned parameters and their test predictions are averaged.
# PREDS collects one prediction array per column, in colsS order.
PREDS=[]
for item in colsS:
    print(item)
    # Preserve the order of columns0. The previous set difference produced an
    # arbitrary column order that can vary between interpreter runs, which
    # makes feature sub-sampling (feature_fraction) non-reproducible.
    # NOTE(review): columns0 was derived from the F_1_0 split, so F_1_0 is
    # never used as a feature for the other targets - confirm this is intended.
    columns=[col for col in columns0 if col!=item]
    train=data[data[item]!=NAN]
    test=data[data[item]==NAN]
    trainY=train[item]
    trainX=train.drop(item,axis=1)
    testY=test[item]
    testX=test.drop(item,axis=1)

    # The column selection is the same for every fold, so do it once.
    X_all=trainX[columns]
    X_test=testX[columns]

    preds=np.zeros(test.shape[0])
    kf=KFold(n_splits=5,random_state=48,shuffle=True)
    for trn_idx,val_idx in kf.split(X_all,trainY):
        X_tr,X_val=X_all.iloc[trn_idx],X_all.iloc[val_idx]
        y_tr,y_val=trainY.iloc[trn_idx],trainY.iloc[val_idx]
        model=lgb.LGBMRegressor(**Best_trial)
        # Early stopping via callback: fit() no longer accepts the
        # early_stopping_rounds= / verbose= keywords in LightGBM 4.
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_val,y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=100,verbose=False)],
        )
        # Each fold contributes an equal share to the averaged prediction.
        preds+=model.predict(X_test)/kf.n_splits
        # Fold RMSE, kept for inspection. The root is taken explicitly
        # because squared=False was removed from newer scikit-learn releases.
        rmse=float(np.sqrt(mean_squared_error(y_val,model.predict(X_val))))
        #print(rmse)
    PREDS.append(preds)

In [None]:
# Persist the per-column predictions (np.save adds the .npy suffix).
# Each entry of PREDS has one value per missing cell of its column, so the
# arrays can differ in length. NumPy >= 1.24 refuses to build an array from
# ragged input unless dtype=object is requested explicitly; equal-length
# input is still stored as a regular 2-D numeric array, as before.
# An object array must be read back with np.load(..., allow_pickle=True).
ragged=len({len(p) for p in PREDS})>1
np.save('./PREDS', np.array(PREDS, dtype=object if ragged else None))

In [None]:
# Attach the predictions to the submission template, one column at a time.
# The per-column frames are collected and concatenated once at the end;
# concatenating inside the loop re-copied the accumulated frame each time.
# NOTE(review): PREDS[i] follows the row order of `data`, submit0 follows the
# row order of the sample submission - assumes both list a column's rows in
# the same order; confirm.
frames=[]
for i,item in enumerate(colsS):
    submit0=submit[submit['col']==item].reset_index(drop=True)
    print(item,len(submit0),len(PREDS[i]))
    submit0['value']=PREDS[i]
    frames.append(submit0)
# pd.concat rejects an empty list; keep the former empty-frame result.
submitA=pd.concat(frames,axis=0) if frames else pd.DataFrame()

In [None]:
# Show the assembled frame, reduce it to the key and value columns,
# show that too, and write it out without the index.
display(submitA)
submitB = submitA.loc[:, ['row-col', 'value']]
display(submitB)
submitB.to_csv('submission.csv', index=False)