# TPS0422 Simple LGBM

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import lightgbm as lgb
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [None]:
# Load the raw training measurements and print a quick structural summary:
# the column names, then the number of distinct sequences and subjects.
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
print(list(train.columns))
for col in ('sequence', 'subject'):
    print(train[col].nunique())

In [None]:
# Names of the 13 sensor columns: sensor_00 ... sensor_12 (zero-padded index).
sensors = [f'sensor_{idx:02d}' for idx in range(13)]

In [None]:
# Plot every column listed in `sensors` against `step` for sequence 0,
# all traces overlaid on a single (primary) y-axis.
data0 = train.loc[train['sequence'] == 0]
fig = make_subplots(specs=[[dict(secondary_y=False)]])
for item in sensors:
    trace = go.Scatter(x=data0['step'], y=data0[item], name=item)
    fig.add_trace(trace, secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=400,
    title_text='sequence 0',
)
fig.update_xaxes(title_text="step")
fig.update_yaxes(title_text="value", secondary_y=False)
fig.show()

In [None]:
# Plot every column listed in `sensors` against `step` for sequence 1,
# all traces overlaid on a single (primary) y-axis.
data1 = train.loc[train['sequence'] == 1]
fig = make_subplots(specs=[[dict(secondary_y=False)]])
for item in sensors:
    trace = go.Scatter(x=data1['step'], y=data1[item], name=item)
    fig.add_trace(trace, secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text='sequence 1',
)
fig.update_xaxes(title_text="step")
fig.update_yaxes(title_text="value", secondary_y=False)
fig.show()

In [None]:
# Plot every column listed in `sensors` against `step` for sequence 2,
# all traces overlaid on a single (primary) y-axis.
data2 = train.loc[train['sequence'] == 2]
fig = make_subplots(specs=[[dict(secondary_y=False)]])
for item in sensors:
    trace = go.Scatter(x=data2['step'], y=data2[item], name=item)
    fig.add_trace(trace, secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text='sequence 2',
)
fig.update_xaxes(title_text="step")
fig.update_yaxes(title_text="value", secondary_y=False)
fig.show()

In [None]:
# Plot every column listed in `sensors` against `step` for sequence 3,
# all traces overlaid on a single (primary) y-axis.
data3 = train.loc[train['sequence'] == 3]
fig = make_subplots(specs=[[dict(secondary_y=False)]])
for item in sensors:
    trace = go.Scatter(x=data3['step'], y=data3[item], name=item)
    fig.add_trace(trace, secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text='sequence 3',
)
fig.update_xaxes(title_text="step")
fig.update_yaxes(title_text="value", secondary_y=False)
fig.show()

In [None]:
# Collapse the rows of each training sequence into summary statistics
# (mean, std, min, max, sum) for every column listed in `sensors`.
# NOTE(review): with a list of aggregations and as_index=False, whether
# `sequence` lands in the index or in the columns may depend on the pandas
# version -- confirm, because `.values` of this frame is fed to the model.
train2 = (
    train
    .groupby("sequence", as_index=False)[sensors]
    .agg(['mean', 'std', 'min', 'max', 'sum'])
)
display(train2)

In [None]:
# Load the raw test measurements from the competition input directory.
test = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/test.csv'
)

In [None]:
# Apply to the test set the same per-sequence aggregation used for the
# training features: mean, std, min, max and sum of every `sensors` column.
test2 = (
    test
    .groupby("sequence", as_index=False)[sensors]
    .agg(['mean', 'std', 'min', 'max', 'sum'])
)
display(test2)

In [None]:
# Load the training labels; the `state` column is used as the target below.
train_labels = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/train_labels.csv'
)

In [None]:
# Name the modelling inputs: aggregated features for train/test and the
# `state` label column as the target.
# NOTE(review): features and labels are paired purely by row position --
# assumes train_labels is ordered like the groupby output; confirm.
trainX, testX = train2, test2
trainY = train_labels['state']

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

def fit_lgbm(X, y, cv,
             params: dict = None,
             verbose: int = 50):
    """Fit one LightGBM regressor per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature matrix. Must support indexing with the index arrays in
            ``cv`` (``X[idx]``); the caller in this notebook passes a NumPy
            array.
        y: Target array, indexed the same way as ``X``.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Keyword arguments forwarded to ``lgbm.LGBMRegressor``.
            ``None`` means the estimator's defaults.
        verbose: Period (in boosting rounds) of the evaluation log; a falsy
            value silences both the evaluation log and the early-stopping
            messages.

    Returns:
        Tuple ``(oof_pred, models)``: a float array shaped like ``y`` holding
        each sample's prediction from the fold where it was held out, and the
        list of fitted regressors in fold order.
    """
    if params is None:
        params = {}

    models = []
    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose`
        # arguments of `fit`; the callback form below is the supported
        # replacement (requires LightGBM >= 3.3 for `log_evaluation`).
        callbacks = [
            lgbm.early_stopping(stopping_rounds=100, verbose=bool(verbose)),
            lgbm.log_evaluation(period=int(verbose)),
        ]

        with Timer(prefix='fit fold={} '.format(i)):
            clf.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    callbacks=callbacks)

        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        # The value is sqrt(MSE) on the raw target, i.e. RMSE (no log
        # transform is applied), so it is labelled as such.
        print(f'Fold {i} RMSE: {mean_squared_error(y_valid, pred_i) ** .5:.4f}')
        print()

    score = mean_squared_error(y, oof_pred) ** .5
    print('-' * 50)
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
# LightGBM hyper-parameters forwarded to the regressor by `fit_lgbm`.
params = dict(
    num_leaves=10,
    objective='regression',
    max_depth=3,
    learning_rate=0.15878788525210086,
    lambda_l1=2.2250723401622309e-07,
    lambda_l2=3.155610811025729,
    bagging_freq=8,
    bagging_fraction=0.8396873791240804,
    feature_fraction=0.9169635405737855,
)

In [None]:
# Alias the target as `y`, report its shape, and wrap it in a DataFrame so
# target columns can later be selected by position; the trailing bare
# expression renders the frame as the cell output.
y = trainY
print(y.shape)
ydf = pd.DataFrame(data=y)
ydf

In [None]:
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that times a ``with`` block and reports the elapsed time.

    On exit the elapsed wall-clock seconds are formatted with ``format_str``
    and sent to ``logger.info`` when a logger was given, otherwise printed.

    Args:
        logger: Optional object with an ``info(str)`` method.
        format_str: ``str.format`` template receiving the duration in seconds.
        prefix: Optional text placed before the template, joined with ``sep``.
        suffix: Optional text placed after the template, joined with ``sep``.
        sep: Separator used when attaching ``prefix`` / ``suffix``.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last completed block; 0 while none has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # Clear any previous end time so a reused timer does not report a
        # stale (or negative) duration while the new block is still running.
        self.end = None
        # Return the instance so `with Timer() as t:` binds the timer
        # instead of None.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [None]:
# For each target column: build a shuffled 5-fold split, train the fold
# models with `fit_lgbm`, and scatter actual vs. out-of-fold predictions.
target = ['state']
for i, target_name in enumerate(target):
    fold = KFold(n_splits=5, shuffle=True, random_state=71)
    ydfi = ydf.iloc[:, i]
    y = np.array(ydfi)
    cv = list(fold.split(trainX, y))
    oof, models = fit_lgbm(trainX.values, y, cv, params=params, verbose=500)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title(target_name, fontsize=20)
    ax.set_ylabel(f'Train Predicted {target_name}', fontsize=12)
    ax.set_xlabel(f'Train Actual {target_name}', fontsize=12)
    ax.scatter(y, oof)

In [None]:
# Show the list of fitted per-fold models returned by `fit_lgbm` as the cell output.
models

In [None]:
# Predict the test set with every fold model and average the results.
# The previous version added only `models[4]` to the zero-initialised
# accumulator, which discarded the other fold models and raised IndexError
# whenever fewer than five folds were trained.
test_features = testX.values
preds = np.zeros(testX.shape[0])
for model in models:
    preds += model.predict(test_features) / len(models)
print(len(preds))
print(preds[0:3])

In [None]:
# Load the sample submission as the template for the output file and print
# its row count.
submit = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/sample_submission.csv'
)
print(submit.shape[0])

In [None]:
# Turn the regression outputs into hard labels (below 0.5 -> 0, otherwise 1),
# show the result and write the submission file.
# NOTE(review): values are assigned by row position -- assumes the rows of
# the sample submission follow the same sequence order as `test2`; confirm.
# NOTE(review): if the competition metric is ranking-based (e.g. AUC), hard
# 0/1 labels discard information -- confirm the metric before thresholding.
submit['state'] = np.where(preds < 0.5, 0, 1)
display(submit)
submit.to_csv('submission.csv', index=False)