In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import json
import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files.
root_path = "/kaggle/input/tabular-playground-series-may-2021/"

# Read the three CSVs in one pass; the order of the names matches the unpacking targets.
train, test, sample_submission = (
    pd.read_csv(os.path.join(root_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# train.head()
# Convert the string class labels in `train['target']` to zero-based integers.
# The guard makes the cell safe to re-run: once the column is numeric there is
# nothing left to map, and indexing into an integer label would raise.
if not pd.api.types.is_numeric_dtype(train['target']):
    unique_targets = train['target'].unique().tolist()
    # Use the whole run of trailing digits (not only the last character) so that
    # a label ending in a multi-digit number is still mapped correctly.
    label_mapping = {
        label: int(label[len(label.rstrip('0123456789')):]) - 1
        for label in unique_targets
    }
    train['target'] = train['target'].map(label_mapping)
# train.head(5).T

In [None]:
# Split the labelled data into a training part (first TRAIN_ROWS rows) and a
# validation part (every row after that). The two parts must not overlap:
# validating on rows the models were fitted on would inflate the scores.
TRAIN_ROWS = 95000
features = train.drop(['id', 'target'], axis=1)
y_train = train['target'][:TRAIN_ROWS]
x_train = features[:TRAIN_ROWS]
x_val = features[TRAIN_ROWS:]
y_val = train['target'][TRAIN_ROWS:]
# errors='ignore' keeps this cell re-runnable after 'id' has already been dropped.
test = test.drop(['id'], axis=1, errors='ignore')

In [None]:
# Sanity-check the shapes of the splits and of the test features.
for dataset in (x_train, y_train, x_val, y_val, test):
    print(dataset.shape)
# Distinct label values present in the training split.
print(np.unique(y_train))
print(sample_submission.shape)

In [None]:
# Seed shared by every stochastic step so that re-running the notebook
# reproduces the same fitted models and validation scores.
SEED = 0

# Standardise -> project onto 2 principal components -> logistic regression.
pipeline_lr = Pipeline([('scalar1', StandardScaler()),
                        ('pca1', PCA(n_components=2, random_state=SEED)),
                        ('lr_classifier', LogisticRegression(random_state=SEED))])

# Standardise -> project onto 2 principal components -> random forest.
pipeline_randomforest = Pipeline([('scalar3', StandardScaler()),
                                  ('pca3', PCA(n_components=2, random_state=SEED)),
                                  ('rf_classifier', RandomForestClassifier(random_state=SEED))])

# Standardise -> LightGBM on all features (no PCA step in this pipeline).
pipeline_lightgbm = Pipeline([('scalar3', StandardScaler()),
                              ('lgbm_classifier', LGBMClassifier(n_jobs=-1, random_state=SEED))])



In [None]:
# Candidate pipelines; the position in this list identifies each model later on.
pipelines = [
    pipeline_lr,
    pipeline_randomforest,
    pipeline_lightgbm,
]


In [None]:
# Fit every candidate pipeline on the training split.
for candidate in pipelines:
    candidate.fit(x_train, y_train)

In [None]:
# Display names, keyed by each pipeline's position in `pipelines`.
pipe_dict = {0: 'Logistic Regression', 1: 'RandomForest', 2: 'lightGBM'}

# Print each fitted pipeline's score on the validation split.
for position in range(len(pipelines)):
    fitted = pipelines[position]
    print(pipe_dict[position], fitted.score(x_val, y_val))

In [None]:
# test_output = pipelines[0].predict_proba(test)
# Train-on-everything set: take all labelled rows straight from `train`, so that
# each row appears exactly once no matter how the validation rows were sliced.
x_train_new = train.drop(['id', 'target'], axis=1).reset_index(drop=True)
y_train_new = train['target'].reset_index(drop=True)
print(x_train_new.shape,y_train_new.shape)

In [None]:
# train on all data
# final_model  = pipelines[2].fit(x_train_new,y_train_new)


**Running a grid search on the chosen model**

In [None]:
# Fixed LightGBM settings for the grid search; the estimator below reads its
# non-searched arguments from this dict.
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'multiclass',  # the target has four classes (Class_1..Class_4)
          'device_type':'cuda',  # NOTE(review): needs a CUDA-enabled LightGBM build — confirm
          'nthread': 3,
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,  # NOTE(review): far below LightGBM's default — confirm intended
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 4,
          'metric' : 'multi_logloss',
         }

# Hyper-parameter values to search; every combination is evaluated with cross-validation.
gridParams = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['multiclass'],
    'random_state' : [501],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

# Base estimator: arguments that are not part of the grid come from `params`.
# `verbose=-1` replaces the former `silent=True`, which recent LightGBM releases
# no longer accept as a constructor argument.
mdl = LGBMClassifier(boosting_type = params['boosting_type'],
                     objective = params['objective'],
                     n_jobs = params['nthread'],
                     verbose = -1,
                     device_type = params['device_type'],
                     max_depth = params['max_depth'],
                     max_bin = params['max_bin'],
                     subsample_for_bin = params['subsample_for_bin'],
                     subsample = params['subsample'],
                     subsample_freq = params['subsample_freq'],
                     min_split_gain = params['min_split_gain'],
                     min_child_weight = params['min_child_weight'],
                     min_child_samples = params['min_child_samples'],
                     scale_pos_weight = params['scale_pos_weight'])

# 5-fold grid search over `gridParams`. Class probabilities are what gets
# submitted, so candidates are ranked by log loss rather than the default accuracy.
# NOTE(review): n_jobs=-1 runs several fits at once on the same device — confirm
# this is intended together with device_type='cuda'.
gridsearch = GridSearchCV(mdl, gridParams, cv=5, verbose=0, n_jobs=-1,
                          scoring='neg_log_loss')

In [None]:
# Run the cross-validated search on the full labelled data set.
gridsearch.fit(X=x_train_new, y=y_train_new)

In [None]:
# Predict class probabilities for the test rows with the best estimator found by the search.
best_model = gridsearch.best_estimator_
test_output = best_model.predict_proba(test)

In [None]:
# test_output = final_model.predict_proba(test)

In [None]:
# One probability column per class, followed by the row ids taken from the sample submission.
class_columns = ["Class_1", "Class_2", "Class_3", "Class_4"]
predictions_df = pd.DataFrame(test_output, columns=class_columns).assign(
    id=sample_submission['id']
)

In [None]:
# Write the submission file to the working directory, without the index column.
predictions_df.to_csv(path_or_buf="submission3.csv", index=False)

In [None]:
! pip install kaggle


In [None]:
# Write the Kaggle API credentials file used by the `kaggle` command-line tool.
# The username and key are read from the environment so that no secret is
# stored in the notebook itself.
kaggle_json_path = '/root/.kaggle/kaggle.json'

missing_vars = [name for name in ('KAGGLE_USERNAME', 'KAGGLE_KEY') if not os.environ.get(name)]
if missing_vars:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(missing_vars)
        + '. Set them before running this cell.'
    )

api_token = {"username": os.environ['KAGGLE_USERNAME'], "key": os.environ['KAGGLE_KEY']}

# The directory does not necessarily exist yet; open() would fail without it.
os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
with open(kaggle_json_path, 'w') as file:
    json.dump(api_token, file)
# Restrict the file to its owner (read/write only), since it contains a secret.
os.chmod(kaggle_json_path, 0o600)


In [None]:
# Upload submission3.csv to the competition with the Kaggle CLI, using "Gridsearch new"
# as the submission message. Requires Kaggle API credentials to be configured.
! kaggle competitions submit -c tabular-playground-series-may-2021 -f submission3.csv -m "Gridsearch new"