In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input tree.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Render floats in displayed frames with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation warnings emitted by the libraries used below).
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names; use the new name when the installed matplotlib
# offers it and fall back to the legacy name otherwise.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
%matplotlib inline
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from cuml.dask.ensemble import RandomForestClassifier
import lightgbm
from lightgbm import LGBMClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

In [None]:
# Load the four competition CSV files: the submission template, the training
# labels, the training rows and the test rows.
sample_submission = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv')
train_labels = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv')
train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/test.csv')

In [None]:
# Preview the submission template.
sample_submission

In [None]:
# Preview the training labels.
train_labels

In [None]:
# Preview the raw training rows.
train

In [None]:
# Preview the raw test rows.
test

In [None]:
# Attach the label of each sequence to its training rows; the inner join keeps
# only sequences present in both frames.
df_data = train.merge(train_labels, on='sequence', how='inner')
df_data

In [None]:
def details(df):
    """Summarise every column of ``df`` in one table.

    Returns a DataFrame indexed by the column names of ``df`` with four columns:
    'Missing Values' (null count), '% of Total Missing Values' (null count as a
    percentage of ``len(df)``), 'Data_Type' (column dtype) and 'Unique values'
    (result of ``df.nunique()``).
    """
    missing = df.isnull().sum()
    summary_parts = {
        'Missing Values': missing,
        '% of Total Missing Values': 100 * (missing / len(df)),
        'Data_Type': df.dtypes,
        'Unique values': df.nunique(),
    }
    # Label the columns while concatenating instead of renaming 0..3 afterwards.
    return pd.concat(list(summary_parts.values()), axis=1, keys=list(summary_parts.keys()))

In [None]:
# Null counts, dtypes and distinct-value counts of the merged training frame.
details(df_data)

In [None]:
# Same column summary for the test frame.
details(test)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (limits inclusive, so a column
    spanning exactly -128..127 becomes int8). float64 columns are cast to
    float32 when their min and max fit in the float32 range; this keeps the
    values but reduces their precision. float16 and float32 columns are left
    untouched so the function never increases memory use. Columns of any other
    dtype are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        type_name = str(col_type)

        if type_name[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            # Narrowest type first; an empty column has NaN bounds and matches none.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast the merged training frame; with the default verbose=True the call
# also prints the resulting memory footprint.
df_data = reduce_mem_usage(df_data)

In [None]:
# Work on a copy so later steps do not modify df_data.
df_train = df_data.copy()

In [None]:
# Collapse every sequence to one row holding the per-sequence mean of each sensor
# and of the label. The columns are selected with a list: selecting several columns
# with a bare tuple after groupby was deprecated in pandas 1.x and raises in 2.0+.
df = df_train.groupby(['sequence'])[['sensor_00', 'sensor_01', 'sensor_02','sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07','sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12','state']].mean()

In [None]:
# Per-sequence mean of each sensor for the test rows. The columns are selected with
# a list: selecting several columns with a bare tuple after groupby was deprecated
# in pandas 1.x and raises in 2.0+.
df_test = test.groupby(['sequence'])[['sensor_00', 'sensor_01', 'sensor_02','sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07','sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12']].mean()

In [None]:
# df.groupby(['step','subject','sequence'])['sensor_00', 'sensor_01', 'sensor_02','sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07','sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12'].mean()

In [None]:
# df[['sequence', 'subject', 'step']].groupby(['sequence', 'subject']).count()

In [None]:
# df[df['subject']==573]

Value ranges, descriptive statistics, and outliers

In [None]:
# numeric_data = df.select_dtypes('number').columns
# numeric_data

In [None]:
# categorical_data = df.select_dtypes('object').columns
# categorical_data

In [None]:
# Names of the thirteen sensor columns inspected in the cells below.
col1 = [
    'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03',
    'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07',
    'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11',
    'sensor_12',
]
# col2 = ['sequence','subject','step','state']

In [None]:
def numcat(df, col1):
    """Print ``describe()`` statistics for each column of ``df`` named in ``col1``.

    For every name the output is the name, a dashed rule, the describe table
    (with the 1/5/25/50/75/90/95/99 percentiles) and a rule of equals signs.
    Nothing is returned.
    """
    thin_rule = '--------------------------'
    thick_rule = '====================================================='
    quantiles = (.01,.05,.25,.5,.75,.9,.95,.99)
    for column in col1:
        stats = df[column].describe(percentiles=quantiles)
        # One print call with a newline separator emits the same four lines.
        print(column, thin_rule, stats, thick_rule, sep='\n')
#     for cat in col2:
#         print(cat)
#         print('--------------------------')
#         print(df[cat].value_counts())
#         print('=====================================================')

In [None]:
# Print the descriptive statistics of every sensor column of the per-sequence frame.
numcat(df,col1)

In [None]:
def outlier(df, col):
    """Draw a box plot and a KDE plot for every column of ``df`` named in ``col``.

    Two 30x50 figures are created: the first holds one box plot per column, the
    second one KDE curve (orange) per column. Panels are laid out three per row;
    the grid has at least five rows and grows when ``col`` has more than 15 names.
    Nothing is returned.
    """
    names = list(col)
    n_cols = 3
    # Ceiling division, but never fewer than the five rows of the original layout.
    n_rows = max(5, -(-len(names) // n_cols))

    plt.figure(figsize=(30,50))
    for position, name in enumerate(names, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Pass the series as `x` explicitly: seaborn 0.11 reads the first positional
        # argument as `x`, while 0.12+ reads it as `data`.
        sns.boxplot(x=df[name])

    plt.figure(figsize=(30,50))
    for position, name in enumerate(names, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[name], color='orange')

In [None]:
%%time
# Box plots and KDE plots for every sensor column of the per-sequence frame.
outlier(df, col1)

Dealing with outliers

In [None]:
# Largest per-sequence mean of each sensor.
df[col1].max()

In [None]:
# Smallest per-sequence mean of each sensor.
df[col1].min()

In [None]:
cols = [
    'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04',
    'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09',
    'sensor_10', 'sensor_11', 'sensor_12',
]

# One stacked panel per sensor, all sharing the x axis (the frame's index).
fig, axs = plt.subplots(len(cols), 1, figsize=(15,100), sharex=True)
plt.xlabel('sequence')
for panel_no, col in enumerate(cols):
    plt_ax = axs[panel_no]
    plt_ax.title.set_text(col)
    df[col].plot(ax=plt_ax)

In [None]:
# sns.boxplot(x=df["state"], y=df["sequence"], palette="Blues")
# plt.show()

Checking the balance of the target

In [None]:
# Share (in percent) of each target value among the per-sequence training rows.
state_share = round(100*df['state'].value_counts(normalize=True),2)
state_share.plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
plt.title("State")
# value_counts orders the classes by frequency, so take the legend labels from the
# plotted index instead of assuming that class 1 comes first.
plt.legend(['{:g}'.format(state) for state in state_share.index])
plt.show()

The two classes are roughly balanced.

In [None]:
# sns.stripplot( x = df['state'], y= df['sequence'], data=df)

Heatmap

In [None]:
# Pairwise correlations between the per-sequence sensor means and the target.
corr_matrix = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Features are every column except the target. `columns=` replaces the positional
# axis argument, which pandas deprecated in 1.3 and removed in 2.0.
X = df.drop(columns=['state'])
y = df['state']
X_test = df_test.copy()
X.shape, y.shape, X_test.shape

In [None]:
# !pip install dask-cuda

In [None]:
# !nvidia-smi

In [None]:
# Hold out 30% of the sequences for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=23,
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape

In [None]:
# from dask_cuda import LocalCUDACluster
# from dask.distributed import Client
# cluster = LocalCUDACluster(threads_per_worker=1)
# c = Client(cluster)
# workers = c.has_what().keys()

In [None]:
# X_train, y_train = cuml.dask.common.utils.persist_across_workers(c,[X_train, y_train], workers=workers)

In [None]:
# Preview the training features.
X_train

In [None]:
# %%time
# pt = PowerTransformer(copy=False)
# X_train[['subject', 'step', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12']] = pt.fit_transform(X_train[['subject', 'step', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12']])
# X_valid[['subject', 'step', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12']] = pt.transform(X_valid[['subject', 'step', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12']])
# X_train.columns

In [None]:
# Accumulators that the model_fit_evaluation* helpers append to, one entry per run.
model_list, resample = [], []
roc_train_list, roc_valid_list = [], []
auc_train_list, auc_valid_list = [], []
rcv_best_score = []

In [None]:
# Untuned baseline estimators; every estimator that takes a seed uses 23.
model_LR = LogisticRegression()
model_DT = DecisionTreeClassifier(random_state=23)
model_RF = RandomForestClassifier(oob_score=True, random_state=23)
model_LGBM = LGBMClassifier(random_state=23, objective='binary')

In [None]:
# training_models = {
#     'model_LR' : LogisticRegression(),
#     'model_DT' : DecisionTreeClassifier(random_state = 23),
#     'model_RF' : RandomForestClassifier(random_state=23, oob_score = True),
#     'model_LGBM' : lightgbm.LGBMClassifier(objective = 'binary', random_state=23)
# }

In [None]:
# params_DT = {
#     'max_depth': [5, 10, 20, 50, 100, 200],
#     'min_samples_leaf': [5, 10, 20, 50, 100, 200],
#     'min_samples_split' : [5, 10, 20, 50, 100, 200]
# } 
# Decision-tree search space, narrowed to a single candidate per parameter.
params_DT = dict(
    max_depth=[100],
    min_samples_leaf=[5],
    min_samples_split=[200],
)
# 'min_samples_split': 50, 'min_samples_leaf': 5, 'max_depth': 200 #0.78
# ('max_depth', 100), ('min_samples_leaf', 5), ('min_samples_split', 200) #0.8

# params_RF = {    
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 20, 50, 100, 200],
#     'min_samples_leaf': [5, 10, 20, 50, 100, 200],
#     'min_samples_split' : [5, 10, 20, 50, 100, 200] 
# }
# Random-forest search space, narrowed to a single candidate per parameter.
params_RF = dict(
    n_estimators=[100],
    max_depth=[200],
    min_samples_leaf=[10],
    min_samples_split=[5],
)
# 'n_estimators': 200, 'min_samples_split': 100, 'min_samples_leaf': 5, 'max_depth': 50 #0.73
# ('max_depth', 200), ('min_samples_leaf', 10), ('min_samples_split', 5), ('n_estimators', 100) #0.74

# params_LGBM = {
#     'n_estimators': [50, 100, 200],
#     'boosting_type': ['gbdt','dart'],
#     'max_depth': [5, 10, 20, 50, 100, 200], 
#     'min_child_samples': [5, 10, 20, 50, 100, 200],
#     'subsample': [0.4,0.6,0.8],        
#     'learning_rate': [0.01, 0.05, 0.1]
# }
# LightGBM search space, narrowed to a single candidate per parameter.
params_LGBM = dict(
    n_estimators=[200],
    boosting_type=['gbdt'],
    max_depth=[20],
    min_child_samples=[50],
    subsample=[0.8],
    learning_rate=[0.1],
)
# 'subsample': 0.8, 'n_estimators': 100, 'min_child_samples': 50, 'max_depth': 200, 'learning_rate': 0.1, 'boosting_type': 'gbdt' #0.74
# ('boosting_type', 'gbdt'), ('learning_rate', 0.1), ('max_depth', 20), ('min_child_samples', 50), ('n_estimators', 200), ('subsample', 0.8) #0.76

In [None]:
def model_fit_evaluation1(model_model, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Fit ``model_model`` as given (no hyper-parameter search) and report its metrics.

    For the training and validation splits this prints the ROC AUC of the hard
    predictions, the confusion matrix, the classification report and the ROC AUC
    of the positive-class probabilities, then draws the ROC curve of the fitted
    model on each split.

    Parameters
    ----------
    model_model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_valid, y_valid : validation features and labels.
    algo, sampling : labels recorded next to the scores of this run.

    Side effects
    ------------
    Appends ``algo``, ``sampling`` and the four AUC values to the module-level
    lists ``model_list``, ``resample``, ``roc_train_list``, ``roc_valid_list``,
    ``auc_train_list`` and ``auc_valid_list``. Returns None.
    """
    fitted = model_model.fit(X_train, y_train)

    y_train_prob = fitted.predict_proba(X_train)
    y_train_pred = fitted.predict(X_train)
    y_val_prob = fitted.predict_proba(X_valid)
    y_val_pred = fitted.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # "AUCROC" is computed from hard labels, "AUC-ROC prob" from probabilities.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])

    rule = '='*60
    sections = (
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    )
    for heading, value in sections:
        print(heading)
        print(rule)
        print(value, "\n")

    # Plot the curves of the model fitted above; the previous version plotted the
    # global model_LR regardless of which estimator had been passed in.
    print('Roc-Auc-Curve for Train set')
    print(rule)
    print(plot_roc_curve(fitted, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print(rule)
    print(plot_roc_curve(fitted, X_valid, y_valid),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)

In [None]:
def model_fit_evaluation2(model_model, params, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Tune ``model_model`` with a randomized search, then report the best model's metrics.

    A 5-fold ``RandomizedSearchCV`` scored with ROC AUC is fitted on the training
    split over ``params``. The best estimator, parameters and CV score are
    printed, followed by the same train/validation report as the other
    ``model_fit_evaluation*`` helpers: ROC AUC of the hard predictions, confusion
    matrix, classification report, ROC AUC of the positive-class probabilities
    and the ROC curve of each split.

    Side effects: appends ``algo``, ``sampling``, the four AUC values and the best
    CV score to the module-level lists ``model_list``, ``resample``,
    ``roc_train_list``, ``roc_valid_list``, ``auc_train_list``,
    ``auc_valid_list`` and ``rcv_best_score``. Returns None.
    """
    search = RandomizedSearchCV(model_model, params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=23)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print('\n')
    for label, value in (
        ('best estimator : ', best_model),
        ('best parameters: ', search.best_params_),
        ('best score: ', search.best_score_),
    ):
        print(label, value)
    print('\n')

    y_train_prob = best_model.predict_proba(X_train)
    y_train_pred = best_model.predict(X_train)
    y_val_prob = best_model.predict_proba(X_valid)
    y_val_pred = best_model.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # "AUCROC" is computed from hard labels, "AUC-ROC prob" from probabilities.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])

    rule = '='*60
    sections = (
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    )
    for heading, value in sections:
        print(heading)
        print(rule)
        print(value, "\n")

    # The curves are drawn only after their heading has been printed.
    for heading, features, target in (
        ('Roc-Auc-Curve for Train set', X_train, y_train),
        ('Roc-Auc-Curve for Val set', X_valid, y_valid),
    ):
        print(heading)
        print(rule)
        print(plot_roc_curve(best_model, features, target),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)
    rcv_best_score.append(search.best_score_)

In [None]:
def model_fit_evaluation3(model_model, params, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Tune ``model_model`` with a Bayesian search, then report the best model's metrics.

    A 5-fold ``BayesSearchCV`` (5 iterations, scored with ROC AUC) is fitted on
    the training split over ``params``. The best estimator, parameters and CV
    score are printed, followed by the same train/validation report as the other
    ``model_fit_evaluation*`` helpers: ROC AUC of the hard predictions, confusion
    matrix, classification report, ROC AUC of the positive-class probabilities
    and the ROC curve of each split.

    Side effects: appends ``algo``, ``sampling``, the four AUC values and the best
    CV score to the module-level lists ``model_list``, ``resample``,
    ``roc_train_list``, ``roc_valid_list``, ``auc_train_list``,
    ``auc_valid_list`` and ``rcv_best_score``. Returns None.
    """
    search = BayesSearchCV(model_model, params, cv=5, scoring='roc_auc', n_iter=5, n_jobs=-1, verbose=1, random_state=23)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    print('\n')
    for label, value in (
        ('best estimator : ', best_model),
        ('best parameters: ', search.best_params_),
        ('best score: ', search.best_score_),
    ):
        print(label, value)
    print('\n')

    y_train_prob = best_model.predict_proba(X_train)
    y_train_pred = best_model.predict(X_train)
    y_val_prob = best_model.predict_proba(X_valid)
    y_val_pred = best_model.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # "AUCROC" is computed from hard labels, "AUC-ROC prob" from probabilities.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])

    rule = '='*60
    sections = (
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    )
    for heading, value in sections:
        print(heading)
        print(rule)
        print(value, "\n")

    # The curves are drawn only after their heading has been printed.
    for heading, features, target in (
        ('Roc-Auc-Curve for Train set', X_train, y_train),
        ('Roc-Auc-Curve for Val set', X_valid, y_valid),
    ):
        print(heading)
        print(rule)
        print(plot_roc_curve(best_model, features, target),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)
    rcv_best_score.append(search.best_score_)

In [None]:
# model_fit_evaluation1(model_LR, X_train, y_train, X_valid, y_valid, 'Logistic Regression', 'without HPT')

In [None]:
# model_fit_evaluation1(model_DT, X_train, y_train, X_valid, y_valid, 'Decision Tree', 'without HPT')

In [None]:
# model_fit_evaluation1(model_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'without HPT')

In [None]:
# model_fit_evaluation1(model_RF, X_train, y_train, X_valid, y_valid, 'Random Forest', 'without HPT')

In [None]:
# eval_df1 = pd.DataFrame({'model': model_list, 'resample':resample, 'roc_train_list':roc_train_list, 'roc_valid_list':roc_valid_list, 'auc_train_list': auc_train_list, 'auc_valid_list': auc_valid_list})
# eval_df1

In [None]:
# %%time
# model_fit_evaluation2(model_DT, params_DT, X_train, y_train, X_valid, y_valid, 'DT', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation3(model_DT, params_DT, X_train, y_train, X_valid, y_valid, 'DT', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation2(model_LGBM, params_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation3(model_LGBM, params_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation2(model_RF, params_RF, X_train, y_train, X_valid, y_valid, 'RF', 'with HPT')

In [None]:
# %%time
# model_fit_evaluation3(model_RF, params_RF, X_train, y_train, X_valid, y_valid, 'RF', 'with HPT')

In [None]:
# c.close()
# cluster.close()

In [None]:
# eval_df2 = pd.DataFrame({'model': model_list, 'resample':resample, 'roc_train_list':roc_train_list, 'roc_valid_list':roc_valid_list, 'auc_train_list': auc_train_list, 'auc_valid_list': auc_valid_list, 'rcv_best_score':rcv_best_score})
# eval_df2

In [None]:
%%time
# Fit the power transform on the full training features and apply the fitted
# transform to the test features.
# NOTE(review): X is overwritten with its own transform, so running this cell twice
# transforms the data twice.
pt = PowerTransformer(copy=False)
sensor_cols = [
    'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04',
    'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09',
    'sensor_10', 'sensor_11', 'sensor_12',
]
X[sensor_cols] = pt.fit_transform(X[sensor_cols])
X_test[sensor_cols] = pt.transform(X_test[sensor_cols])
X.shape, X_test.shape

In [None]:
# Final model: Bayesian search over the decision-tree grid on ALL training rows,
# followed by a report on those same rows and predictions for the test set.
rcv = BayesSearchCV(
    model_DT,
    params_DT,
    cv=5,
    scoring='roc_auc',
    n_iter=20,
    n_jobs=-1,
    verbose=1,
    random_state=23,
)
rcv.fit(X, y)
rcv_best = rcv.best_estimator_

print('\n')
for label, value in (
    ('best estimator : ', rcv_best),
    ('best parameters: ', rcv.best_params_),
    ('best score: ', rcv.best_score_),
):
    print(label, value)
print('\n')

y_train_prob = rcv_best.predict_proba(X)
y_train_pred = rcv_best.predict(X)
y_test_prob = rcv_best.predict_proba(X_test)
y_test_pred = rcv_best.predict(X_test)

matrix_train = confusion_matrix(y, y_train_pred)
report_train = classification_report(y, y_train_pred)
# roc_train is computed from hard labels, auc_train from class-1 probabilities.
roc_train = roc_auc_score(y, y_train_pred)
auc_train = roc_auc_score(y, y_train_prob[:,1])

for heading, value in (
    ('AUCROC for train', roc_train),
    ('Confusion Matrix for train', matrix_train),
    ('Classification Report for train', report_train),
    ('AUC-ROC prob for train', auc_train),
):
    print(heading)
    print('='*60)
    print(value, "\n")

print('Roc-Auc-Curve for Train set')
print('='*60)
print(plot_roc_curve(rcv_best, X, y),'\n')

# best estimator :  DecisionTreeClassifier(max_depth=100, min_samples_leaf=5, min_samples_split=200,
#                        random_state=23)
# best parameters:  OrderedDict([('max_depth', 100), ('min_samples_leaf', 5), ('min_samples_split', 200)])
# best score:  0.7626176945590644
#0.87 auc

In [None]:
# Keep only the class-1 probability (second column) of every test prediction.
y_prob = list(y_test_prob[:, 1])
print(len(y_prob))

In [None]:
predictions = y_prob
# The predictions are in X_test row order, so take the sequence ids from X_test's
# index (set to 'sequence' by the groupby that built df_test) rather than relying
# on sample_submission happening to be in the same order.
submission = pd.DataFrame({ 'sequence' : X_test.index, 'state': predictions })
submission

In [None]:
# Turn the probabilities into hard labels: below 0.5 becomes 0, everything else 1.
# NOTE(review): the searches in this notebook are scored with ROC AUC; if the
# submission is scored the same way, hard labels discard the ranking information
# carried by the probabilities. Confirm before submitting.
submission['state'] = np.where(submission['state'] < 0.5, 0, 1)
submission.head()

In [None]:
# Share (in percent) of each predicted label in the submission. normalize=True makes
# the values real percentages; the wedges are the same as when plotting raw counts.
predicted_share = round(100*submission['state'].value_counts(normalize=True),2)
predicted_share.plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
plt.title("state")
# value_counts orders the labels by frequency, so take the legend labels from the
# plotted index instead of assuming that label 1 comes first.
plt.legend([str(label) for label in predicted_share.index])
plt.show()

In [None]:
# predictions_pred = y_test_pred
# submission_pred = pd.DataFrame({ 'sequence' : test['sequence'], 'state': predictions_pred })
# submission_pred.shape

In [None]:
# round(100*submission_pred['state'].value_counts(),2).plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
# plt.title("state")
# plt.legend(["1", "0"])
# plt.show()

In [None]:
# Write the submission to the working directory without the row index, then show it.
submission.to_csv('submission.csv', index = False)
submission