In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

# **Loading the datasets**

In [None]:
# Read the competition's train and test tables into DataFrames.
train_path = "/kaggle/input/tabular-playground-series-mar-2021/train.csv"
test_path = "/kaggle/input/tabular-playground-series-mar-2021/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Show the training frame as the cell output.
train

In [None]:
# Show the test frame as the cell output.
test

# Saving a copy of the datasets

In [None]:
# Keep untouched snapshots of both frames: `id` is dropped from the working
# frames next, but test_copy['id'] is still needed to build the submission.
train_copy, test_copy = train.copy(), test.copy()

Dropping the `id` column from both datasets

In [None]:
# Remove the row identifier from both working frames.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# **EDA**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Split the feature names by prefix: 'cat*' are categorical, 'cont*' are continuous.
catcols = list(filter(lambda name: name.startswith('cat'), train.columns))
contcols = list(filter(lambda name: name.startswith('cont'), train.columns))

In [None]:
# One figure per categorical feature showing how often each level occurs.
for col in catcols:
    plt.title(f'Count plot of {col}')
    # Pass the series by keyword: seaborn deprecates positional data arguments,
    # and newer releases read the first positional argument as `data`, not `x`.
    sns.countplot(x=train[col])
    plt.show()

In [None]:
# One figure per continuous feature showing its distribution.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
for feature in contcols:
    values = train[feature]
    plt.title(f'Distplot of {feature}')
    sns.distplot(values)
    plt.show()

In [None]:
# Class balance of the target; the series is passed by keyword because seaborn
# deprecates positional data arguments.
sns.countplot(x=train['target'])

# **Data Processing**

Let's check for null values.

In [None]:
# True if at least one cell of the training frame is missing.
train.isnull().values.any()

In [None]:
# True if at least one cell of the test frame is missing.
test.isnull().values.any()

Let's encode the categorical data as numeric data.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Every categorical column except cat10, which is encoded on its own further down.
cols_ca = [f'cat{idx}' for idx in range(19) if idx != 10]

In [None]:
# A single encoder object is re-fitted for each column in turn.
le = LabelEncoder()

for col in cols_ca:
    # Learn the label -> integer mapping from the training column only, then
    # apply that same mapping to both frames.
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# cat10 is encoded separately from the other categorical columns. Fit the
# encoder ONCE on the labels of both frames so that every category maps to the
# same integer in train and test. Re-fitting on test alone (the previous
# behaviour) derives the codes from test's own label set, which silently
# misaligns the two encodings whenever the label sets differ.
le_1 = LabelEncoder()
le_1.fit(pd.concat([train['cat10'], test['cat10']], ignore_index=True))

train['cat10'] = le_1.transform(train['cat10'])
test['cat10'] = le_1.transform(test['cat10'])

checking feature correlation

In [None]:
# Pairwise correlation of every column (target included), drawn as an annotated heatmap.
corr = train.corr()
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(corr, annot=True, cmap='coolwarm', square=True, ax=ax)
plt.show()

As the correlation plot shows, many features are highly correlated with each other; let's remove them.

In [None]:
# Work on copies so the encoded `train` / `test` frames stay intact during feature selection.
train_, test_ = train.copy(), test.copy()

In [None]:
# Number of columns (features plus target) before correlation filtering.
train_.shape[1]

In [None]:
# Feature matrix for training: everything except the target column.
x_train_ = train_.drop(columns='target')
# Note: this is an alias of test_, not a copy, so in-place edits affect both names.
x_test_ = test_

In [None]:
def correlation(dataset, threshold):
    """Drop, in place, columns that are positively correlated with an earlier column.

    Columns are visited in the order of ``dataset.corr()``. A column is removed
    when its correlation with any earlier column that has not itself been
    removed is ``>= threshold``. The comparison is signed, so strongly negative
    correlations never trigger a removal.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter. It is modified in place.
    threshold : float
        Minimum correlation at which the later column of a pair is dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, minus the dropped columns.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    removed = set()  # names of every column flagged for removal so far
    for row_pos, candidate in enumerate(names):
        for col_pos in range(row_pos):
            partner = names[col_pos]
            # A partner that was already removed cannot evict another column.
            if partner in removed or not (corr_matrix.iloc[row_pos, col_pos] >= threshold):
                continue
            removed.add(candidate)
            # The candidate may already be gone if an earlier partner matched it.
            if candidate in dataset.columns:
                del dataset[candidate]
    return dataset

In [None]:
# Choose the features on the training data only.
x_train = correlation(x_train_, 0.4)

# Apply the SAME selection to the test features. Running correlation() on the
# test frame independently could drop a different set of columns, leaving train
# and test with mismatched feature sets at prediction time. The drop is done in
# place so x_test_ keeps mirroring x_test, as it did before.
dropped_cols = [col for col in x_test_.columns if col not in x_train.columns]
x_test_.drop(columns=dropped_cols, inplace=True)
x_test = x_test_

In [None]:
# correlation() mutates and returns its argument, so x_train_ / x_test_ already
# hold the reduced column sets; these calls rebuild the frames with those same columns.
x_train = pd.DataFrame(x_train, columns=x_train_.columns)
x_test = pd.DataFrame(x_test, columns=x_test_.columns)

# Keep the target as a one-column DataFrame (later cells index it as y_train['target']).
y_train = pd.DataFrame(train['target'], columns=['target'])

In [None]:
# Number of features left after correlation filtering.
print(x_train.shape[1])

In [None]:
# Columns kept in the training features after correlation filtering.
x_train.columns

In [None]:
# Columns kept in the test features; these should match the training columns.
x_test.columns

Class imbalance in the target

In [None]:
# Class counts before resampling; the series is passed by keyword because
# seaborn deprecates positional data arguments.
sns.countplot(x=y_train['target'])

There is some class imbalance in the data; let's fix it using SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class with synthetic rows. SMOTE is stochastic, so it
# is seeded to make the resampled set (and everything trained on it)
# reproducible; 56 matches the seed used for the train/validation split.
# NOTE(review): resampling happens before the train/validation split below, so
# synthetic rows derived from validation rows can land in the training part and
# inflate the validation scores — consider resampling the training part only.
smote = SMOTE(random_state=56)

x_train, y_train = smote.fit_resample(x_train, y_train)

In [None]:
# Class counts after resampling; the series is passed by keyword because
# seaborn deprecates positional data arguments.
sns.countplot(x=y_train['target'])

Now our data is balanced.

# Splitting training data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the resampled data for validation (fixed seed for repeatability).
# Note that x_train_ is rebound here: from now on it is the training part of the split.
split = train_test_split(x_train, y_train, test_size=0.2, random_state=56)
x_train_, x_val, y_train_, y_val = split

# Model Selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, f1_score

In [None]:
def model_selection(x_train_, x_val, y_train_, y_val, model):
    """Fit a default-configured classifier and print its validation diagnostics.

    Parameters
    ----------
    x_train_, y_train_
        Training features and binary target.
    x_val, y_val
        Held-out features and target on which F1, ROC AUC and the
        classification report are computed.
    model
        Estimator class (not an instance). It is instantiated with no
        arguments, so every candidate is compared at its library defaults.

    Returns
    -------
    The fitted estimator, so the calling cell can keep and display it.
    """
    model = model()
    model.fit(x_train_, y_train_)

    pred = model.predict(x_val)

    # ROC AUC measures ranking quality, so it needs continuous scores; feeding
    # it hard 0/1 labels collapses the curve to a single operating point.
    # Prefer the positive-class probability, then the decision margin, and
    # fall back to the labels only for estimators that expose neither.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(x_val)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(x_val)
    else:
        scores = pred

    f1 = f1_score(y_val, pred)
    roc = roc_auc_score(y_val, scores)
    report = classification_report(y_val, pred)
    train_score = model.score(x_train_, y_train_)
    val_score = model.score(x_val, y_val)

    print('F1 Score:', f1)
    print('\n')
    print('ROC AUC Score:', roc)
    print('\n')
    print('Classification report:', report)
    print('\n')
    print('Train Score:', train_score*100)
    print('\n')
    print('Val Score:', val_score*100)
    print('\n')
    # "Overfitting" here simply means the train score exceeds the validation score.
    print('Is overfitting:', bool(train_score > val_score))
    print('\n')
    print('Overfitting by:', train_score*100-val_score*100)

    return model

In [None]:
# Evaluate ExtraTreesClassifier with default hyperparameters on the validation split.
extratrees = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=ExtraTreesClassifier,
)
extratrees

In [None]:
# Evaluate GradientBoostingClassifier with default hyperparameters on the validation split.
gradient = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=GradientBoostingClassifier,
)
gradient

In [None]:
# Evaluate RandomForestClassifier with default hyperparameters on the validation split.
random = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=RandomForestClassifier,
)
random

In [None]:
# Evaluate XGBClassifier with default hyperparameters on the validation split.
xgb = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=XGBClassifier,
)
xgb

In [None]:
# Evaluate LGBMClassifier with default hyperparameters on the validation split.
lgbm = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=LGBMClassifier,
)
lgbm

In [None]:
# Evaluate LogisticRegression with default hyperparameters on the validation split.
logistic = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=LogisticRegression,
)
logistic

In [None]:
# Evaluate CatBoostClassifier with default hyperparameters on the validation split.
catboost = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=CatBoostClassifier,
)
catboost

In [None]:
# Evaluate RidgeClassifier with default hyperparameters on the validation split.
ridge = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=RidgeClassifier,
)
ridge

In [None]:
# Evaluate SGDClassifier with default hyperparameters on the validation split.
sgd = model_selection(
    x_train_=x_train_, x_val=x_val, y_train_=y_train_, y_val=y_val,
    model=SGDClassifier,
)
sgd

I will use CatBoostClassifier because it has good F1 and ROC AUC scores and its overfitting (the gap between the train and validation scores) is low.

# Hyper parameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [None]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [None]:
# Base CatBoost estimator with library defaults; it is passed to the randomized search below as the estimator to tune.
model = CatBoostClassifier()

In [None]:
# Sampling distributions for the randomized search: integer ranges for tree
# depth and iteration count, and scipy's default uniform for the learning rate.
parameters = dict(
    depth=sp_randInt(4, 10),
    learning_rate=sp_randFloat(),
    iterations=sp_randInt(100, 1000),
)

In [None]:
# Five stratified folds. Shuffle before splitting because x_train / y_train come
# from a resampling step — presumably the synthetic rows sit together rather
# than being mixed through the data (TODO confirm) — and contiguous folds would
# then differ systematically. Seeded so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=56)

In [None]:
# Randomized hyperparameter search: 10 sampled configurations, each scored with
# the cross-validation splitter defined above, in parallel on all cores. The
# seed fixes which configurations are drawn, so the reported best parameters can
# be reproduced on a re-run (the final model below hard-codes one such result).
search = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                            cv=cv, n_iter=10, n_jobs=-1, random_state=56)

search.fit(x_train, y_train)

In [None]:
# Report the winning configuration and its mean cross-validated score,
# separated by two blank lines.
print('Best Params:', search.best_params_, end='\n\n\n')
print('Best Score:', search.best_score_)

# Model Building and Training

In [None]:
# Hyperparameters are hard-coded rather than read from `search`.
# NOTE(review): presumably these are the best parameters from an earlier search run — confirm.
final_params = {'depth': 7, 'iterations': 1000, 'learning_rate': 0.49811995148302424}
model = CatBoostClassifier(**final_params)
# Train on the full resampled training set (not just the training part of the split).
model.fit(x_train, y_train)

# Predictions

In [None]:
# Predict the positive-class probability instead of hard 0/1 labels: the
# candidates were compared on ROC AUC, which rewards well-ranked scores, and
# hard labels throw that ranking away.
# NOTE(review): assumes the competition expects probabilities in `target` —
# confirm against sample_submission.csv.
pred = model.predict_proba(x_test)[:, 1]
pred

In [None]:
# Load the sample submission to check the expected columns and format.
sample_path = '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv'
sample = pd.read_csv(sample_path)
sample

In [None]:
# Pair each original test id (kept in test_copy before `id` was dropped) with its prediction.
submission_columns = {'id': test_copy['id'], 'target': pred}
Submission = pd.DataFrame(submission_columns)
Submission

# Saving the submission file

In [None]:
# Write only the `id` and `target` columns. Without index=False pandas also
# writes the row index as an extra unnamed first column.
Submission.to_csv('/kaggle/working/Submission.csv', index=False)