In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# <b>1 <span style='color:orange'>|</span> Load and Explore Data</b>

In [None]:
# Read the training readings and their labels from the competition input folder.
train_data, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
    )
)

In [None]:
# Report (rows, columns) of both frames as a sanity check after loading.
print('Train data dimensions:', train_data.shape)
print('Train labels dimensions:', train_labels.shape)

In [None]:
# Preview the first rows of the raw training readings.
train_data.head()

In [None]:
# Preview the first rows of the label table.
train_labels.head()

In [None]:
# Class balance check: number of label rows per 'state' value.
train_labels['state'].value_counts()

In [None]:
# Column dtypes and non-null counts of the training readings.
train_data.info()

In [None]:
# Number of distinct values in each column of the training readings.
train_data.nunique()

In [None]:
# Column dtypes and non-null counts of the label table.
train_labels.info()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the numeric columns.
train_data.describe()

In [None]:
# Plot the sensor traces of six example sequences, one subplot per sequence.
# The nested list mirrors the 2x3 grid: top row 25958, 25961, 25966 and
# bottom row 25959, 25960, 25967.
example_sequences = [[25958, 25961, 25966],
                     [25959, 25960, 25967]]

figure, axis = plt.subplots(2, 3, figsize = (15, 6))
for row, row_sequences in enumerate(example_sequences):
    for col, sequence_id in enumerate(row_sequences):
        # Keep only the sensor columns; the id/bookkeeping columns are not signals.
        readings = (train_data[train_data['sequence'] == sequence_id]
                    .drop(['sequence', 'step', 'subject'], axis = 1))
        axis[row, col].plot(readings)
        # Label every panel so the figure can be read on its own.
        axis[row, col].set_title(f'Sequence {sequence_id}')
figure.tight_layout()
plt.show()

# <b>2 <span style='color:orange'>|</span> Data Preparation and Feature Engineering</b>

In [None]:
# Collapse every sequence into a single row of per-sensor summary statistics.
# One groupby/agg pass replaces 65 separate groupby calls and avoids growing
# the frame one column at a time.
sensor_columns = [f"sensor_{sensor:02d}" for sensor in range(13)]
statistics = ['max', 'min', 'mean', 'std', 'median']

train_data_stats = train_data.groupby('sequence')[sensor_columns].agg(statistics)
# agg() yields (sensor, statistic) MultiIndex columns; flatten them to names
# such as "sensor_00_max", grouped sensor by sensor.
train_data_stats.columns = [f'{sensor_name}_{statistic}'
                            for sensor_name, statistic in train_data_stats.columns]

In [None]:
# Display the engineered feature table (one row per sequence).
train_data_stats

In [None]:
#train_data_sensors_mean = train_data.groupby('sequence').mean()
#train_data_sensors_mean = train_data_sensors_mean.drop(['subject', 'step'], axis = 1)
#train_data_sensors_mean.head()

In [None]:
#train_data_sensors_mean.shape

**Data Preprocessing**

In [None]:
#scaler = StandardScaler()
#scaled_sensors_mean = scaler.fit_transform(train_data_sensors_mean)

**Data Split**

In [None]:
# Features: one row of per-sensor summary statistics per sequence.
X = train_data_stats
# Target: the 'state' label.
# NOTE(review): X and y are paired by position, which presumes train_labels is
# ordered by sequence exactly like the groupby output - confirm.
y = train_labels['state']
assert len(X) == len(y), "feature rows and labels must pair up one-to-one"

# Seeded so every re-run yields the same split, and stratified so both parts
# keep the class proportions of y. The default 75/25 train/validation ratio is kept.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify = y, random_state = 69)

**Modelling**

In [None]:
# Baseline: logistic regression fitted on the sequence statistics as they are.
lr = LogisticRegression(max_iter = 5000).fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on, then on the held-out part.
lr_train_score = lr.score(X_train, y_train)
lr_val_score = lr.score(X_val, y_val)
print('Train set score:', lr_train_score)
print('Validation set score:', lr_val_score)

In [None]:
# Random forest with default hyperparameters; random_state pins the bootstrap
# and feature sampling so the scores below are the same on every run.
rfc = RandomForestClassifier(random_state = 69)
rfc.fit(X_train, y_train)

print('Train set score:', rfc.score(X_train, y_train))
print('Validation set score:', rfc.score(X_val, y_val)) 
# Author's reading of the gap between the two scores above.
print('Untuned model is overfitting')

In [None]:
# Hand-tuned forest with bounded depth and large leaves; random_state makes
# the fit (and every grid-search candidate cloned from it) reproducible.
rfc = RandomForestClassifier(n_estimators = 200, max_depth=12, max_features = 6,
                             min_samples_leaf = 50, random_state = 69)
rfc.fit(X_train, y_train)

print('Train set score:', rfc.score(X_train, y_train))
print('Validation set score:', rfc.score(X_val, y_val)) 

# Candidate values searched around the hand-picked depth and feature count.
max_depth = [8, 10, 12]
max_features = [5, 8, 10]

hyperparameters = dict( max_depth = max_depth, 
                       max_features = max_features)

# GridSearchCV fits clones of rfc, so rfc itself keeps the hand-tuned fit.
# n_jobs = -1 spreads the 9 candidates x 5 folds over all available cores.
grid = GridSearchCV(rfc, hyperparameters, cv = 5, n_jobs = -1)
# fit() returns the search object itself; with the default refit it also
# holds the best candidate refitted on all of X_train.
best_model = grid.fit(X_train, y_train)

best_model.best_estimator_.get_params()

In [None]:
# Predict with the grid-search winner (refitted on X_train) rather than the
# hand-tuned rfc, so the search result is the model that gets evaluated.
y_pred = best_model.predict(X_val)

In [None]:
# Per-class precision/recall/F1, then the confusion matrix, for y_pred on the
# validation split.
validation_report = classification_report(y_val, y_pred)
validation_confusion = confusion_matrix(y_val, y_pred)
print(validation_report)
print(validation_confusion)

In [None]:
# XGBoost baseline with otherwise default hyperparameters. The objective is the
# binary classification one (the original used the regression objective
# 'reg:logistic' on a classifier).
xg_class = xgb.XGBClassifier(objective ='binary:logistic', use_label_encoder = False)
xg_class.fit(X_train, y_train)
print('Train score:', xg_class.score(X_train, y_train))
# Scored on the validation split, so label it as such (no test labels are used here).
print('Validation score:', xg_class.score(X_val, y_val))

In [None]:
# XGBoost with learning rate 0.1 and classification error as its metric.
xg_class = xgb.XGBClassifier(objective ='binary:logistic', eval_metric = 'error', learning_rate = 0.1, use_label_encoder = False)
xg_class.fit(X_train, y_train)
print('Train score:', xg_class.score(X_train, y_train))
# Scored on the validation split, so label it as such (no test labels are used here).
print('Validation score:', xg_class.score(X_val, y_val))


In [None]:
# Hand-tuned XGBoost: 180 trees of depth 7 with a minimum child weight of 3.
xg_class = xgb.XGBClassifier(objective ='binary:logistic', eval_metric = 'error', 
                             learning_rate = 0.1, n_estimators = 180, min_child_weight = 3, 
                             max_depth = 7, use_label_encoder = False)

xg_class.fit(X_train, y_train)
print('Train score:', xg_class.score(X_train, y_train))
# Scored on the validation split, so label it as such (no test labels are used here).
print('Validation score:', xg_class.score(X_val, y_val))

In [None]:
# Validation ROC AUC of xg_class, computed from the second predict_proba
# column (the probability of class index 1).
y_pred_proba = xg_class.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_true = y_val, y_score = y_pred_proba)
print(auc)

In [None]:
# Hyperparameters of the final XGBoost model, unpacked into XGBClassifier below.
params = dict(
    # Number of boosting rounds and size of each tree.
    n_estimators = 1000,
    max_depth = 7,
    learning_rate = 0.1,
    # Row / column sampling.
    subsample = 0.95,
    colsample_bytree = 0.60,
    # Regularisation terms.
    reg_lambda = 1.50,
    reg_alpha = 6.10,
    gamma = 1.40,
    # Seed, objective and tree construction method.
    random_state = 69,
    objective = 'binary:logistic',
    tree_method = 'hist',
)

In [None]:
# Final model: the parameter set above plus early stopping on validation AUC.
# eval_metric and early_stopping_rounds are given to the constructor because
# passing them to fit() was deprecated in XGBoost 1.6 and removed in 2.0
# (this form therefore needs XGBoost >= 1.6).
xgb_class = xgb.XGBClassifier(**params, eval_metric = ['auc'], early_stopping_rounds = 128)
# Training stops once validation AUC has not improved for 128 rounds; progress
# is logged every 50 rounds.
# NOTE(review): the same validation split drives early stopping and the AUC
# reported afterwards, so that AUC is likely optimistic.
xgb_class.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = 50)

In [None]:
# Validation ROC AUC of the final model, computed from the second
# predict_proba column (the probability of class index 1).
y_pred_proba = xgb_class.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_true = y_val, y_score = y_pred_proba)
print(auc)

**Submission**

In [None]:
# Load the test readings; the next cell aggregates them per 'sequence' exactly like the training data.
test_data = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')

In [None]:
# Build the test features the same way as the training features: one row per
# sequence with max/min/mean/std/median of every sensor. One groupby/agg pass
# replaces 65 separate groupby calls and avoids growing the frame one column
# at a time.
sensor_columns = [f"sensor_{sensor:02d}" for sensor in range(13)]
statistics = ['max', 'min', 'mean', 'std', 'median']

test_data_stats = test_data.groupby('sequence')[sensor_columns].agg(statistics)
# Flatten the (sensor, statistic) MultiIndex columns to names such as
# "sensor_00_max"; the resulting column order matches the training features.
test_data_stats.columns = [f'{sensor_name}_{statistic}'
                           for sensor_name, statistic in test_data_stats.columns]

In [None]:
# (rows, columns) of the test feature table.
test_data_stats.shape

In [None]:
#test_data = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
#test_data_sensors_mean = test_data.groupby('sequence').mean()
#test_data_sensors_mean = test_data_sensors_mean.drop(['subject', 'step'], axis = 1)

#test_data_sensors_mean.head()

In [None]:
#test_data_sensors_mean.shape

In [None]:
X_test = test_data_stats

# Predict the probability of class index 1 instead of hard labels: the model is
# early-stopped and validated on ROC AUC, which scores the ranking of
# predictions, and thresholded 0/1 labels throw that ranking away.
y_pred = xgb_class.predict_proba(X_test)[:, 1]

In [None]:
# One 'state' prediction per test sequence, keyed by the index of X_test.
submission = pd.DataFrame({'state': y_pred}, index = X_test.index)

submission

In [None]:
# Write the submission file; the index (taken from X_test) is saved as the first column.
submission.to_csv('third_submission.csv')