In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices raised by the plotting and modelling libraries.
warnings.simplefilter('ignore')

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Load the competition training set and preview its first rows.
df_train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/train.csv'
)
df_train.head(n=5)

In [None]:
# Load the competition test set (kept under the name `validation`) and preview it.
validation = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/test.csv'
)
validation.head(n=5)

In [None]:
# Load the sample submission file and preview it.
sub = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-may-2022/sample_submission.csv'
)
sub.head(n=5)

### Understanding the training dataset

In [None]:
# Preview the first rows of the training frame before exploring it.
df_train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
df_train.shape

There are 9 lakh (900,000) rows and 33 columns in total.

In [None]:
# Summary statistics of the training frame's columns.
df_train.describe()

In [None]:
# Per-column dtype and non-null count, used to check for missing values.
df_train.info()

There are no missing values in the dataset.
The columns with an integer dtype hold discrete values and look more like categorical variables.

In [None]:
# Names of every column stored as a 64-bit integer.
int_col = df_train.select_dtypes('int64').columns.tolist()
int_col

In [None]:
# Exclude the row identifier from the integer feature list.
# The list is rebuilt instead of calling `list.remove`, so re-running this cell
# is harmless (a second `remove('id')` would raise ValueError).
int_col = [name for name in int_col if name != 'id']
print(int_col)

In [None]:
# For each integer column: print the relative frequency of every value and draw
# a box plot to expose the long tails.
for col in int_col:
    print(col)
    print(df_train[col].value_counts(normalize=True))
    # Pass the series as `x=` explicitly: positional data arguments to seaborn's
    # categorical plots are deprecated and are interpreted differently across versions.
    sns.boxplot(x=df_train[col])
    plt.show()
    print("*"*100)

In [None]:
# Collapse the long right tail of the count-like integer features: every value
# of 6 or more becomes 6. Columns with fewer than 5 distinct values are left as is.
upper_cap = 6
for col in int_col:
    if df_train[col].nunique(dropna=False) >= 5:
        # Vectorised equivalent of `apply(lambda x: 6 if x >= 6 else x)`.
        df_train[col] = df_train[col].clip(upper=upper_cap)
        # Apply the same cap to the competition test frame so that both frames
        # share one value range for these features.
        if col in validation.columns:
            validation[col] = validation[col].clip(upper=upper_cap)
    sns.boxplot(x=df_train[col])
    plt.show()

The target distribution seems to be quite balanced.

In [None]:
# Plot the distribution (histogram + KDE) of every float64/int64 column.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# documented replacement for a density histogram with a KDE overlay.
for col in df_train.select_dtypes(['float64', 'int64']):
    print(col)
    sns.histplot(x=df_train[col], kde=True, stat='density')
    plt.show()

All the numerical columns have an approximately normal distribution.

In [None]:
# Number of distinct values in f_27 (missing values, if any, count as one value).
df_train['f_27'].nunique(dropna=False)

In [None]:
# Relative frequency of each f_27 value; show the most common ones.
f27_shares = df_train['f_27'].value_counts(normalize=True)
f27_shares.head()

This column has very high cardinality, so we encode it with target encoding.

In [None]:
import category_encoders as ce
from sklearn.model_selection import KFold

# Target-encode f_27 *out of fold*.
# Fitting the encoder on all rows and transforming those same rows lets every
# row's encoding include its own label. With a column this high in cardinality
# that leaks the target into the feature and inflates every later score.
# Here each row is instead encoded by an encoder fitted on the other folds only.
f27_raw = df_train['f_27']
f27_encoded = pd.Series(np.nan, index=df_train.index, dtype='float64')
fold_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
for fit_pos, apply_pos in fold_splitter.split(f27_raw):
    fold_encoder = ce.TargetEncoder(cols='f_27')
    fold_encoder.fit(f27_raw.iloc[fit_pos], df_train['target'].iloc[fit_pos])
    f27_encoded.iloc[apply_pos] = (
        fold_encoder.transform(f27_raw.iloc[apply_pos]).to_numpy().ravel()
    )

# Encoder fitted on the complete training column; it is only used to transform
# unseen data (the competition test frame), where no leakage is possible.
encoder = ce.TargetEncoder(cols='f_27')
encoder.fit(f27_raw, df_train['target'])

df_train['f_27'] = f27_encoded

In [None]:
# Check the training frame after f_27 has been replaced by its encoding.
df_train.head()

In [None]:
# Preview the competition test frame before encoding f_27.
validation.head()

In [None]:
# Encode f_27 in the competition test frame with the encoder fitted on the training data.
# NOTE(review): this overwrites the raw column in place, so re-running the cell
# feeds the already-encoded values back into the encoder — run it once per session.
validation['f_27']= encoder.transform(validation['f_27'])
validation.head()

Split off the target column, then create the train and test sets.

In [None]:
# Separate the label from the features without mutating `df_train`.
# `df_train.pop('target')` removed the column in place, so running the cell a
# second time raised KeyError; selecting/dropping keeps the cell re-runnable.
y = df_train['target']
X = df_train.drop(columns='target')
print(X.shape, y.shape)

In [None]:
# Preview the feature frame (target removed).
X.head()

In [None]:
# Keep every column except the first one as model input.
# NOTE(review): presumably the first column is the row identifier 'id' — verify.
feature_names = X.columns[1:]
train = X[feature_names].copy()
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.25, random_state=42
)
# Print the shapes in the order: X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

Scaling the variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and rebuild a labelled DataFrame.
# The result gets a fresh 0..n-1 index, not the index of X_train.
scaler = StandardScaler()
cols = X_train.columns
X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=cols)
X_scaled.head()

In [None]:
# Scale the hold-out split with the statistics learned from the training split.
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=cols)
X_test_scaled.head()

### Model Building

### 1. Statsmodels

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Logistic regression (binomial GLM) on all scaled features plus an intercept.
X = sm.add_constant(X_scaled)
model_1 = sm.GLM(
    endog=y_train.to_numpy().ravel(),
    exog=X,
    family=sm.families.Binomial(),
)
model_1.fit().summary()

In [None]:
# Variance inflation factor of every feature used by the model.
# The VIFs are computed on the *scaled* design matrix — the same data the model
# was fitted on and the same convention as the later per-subset VIF cell.
# Raw, uncentred columns with no constant inflate the VIFs, because
# `variance_inflation_factor` does not add an intercept itself.
vif_inputs = X_scaled.values
vif = pd.DataFrame({
    'Features': X_scaled.columns,
    'VIF': [variance_inflation_factor(vif_inputs, i) for i in range(vif_inputs.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

### 2. Using RFE for feature selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive feature elimination down to 15 features, ranked by a logistic regression.
logreg = LogisticRegression()
rfe = RFE(estimator=logreg, n_features_to_select=15).fit(X_scaled, y_train)

In [None]:
# Names of the features retained by RFE.
selected_mask = rfe.get_support()
col = X_scaled.columns[selected_mask]
col

In [None]:
# Refit the binomial GLM on the RFE-selected features only (plus an intercept).
X = sm.add_constant(X_scaled[col])
model_2 = sm.GLM(
    endog=y_train.to_numpy().ravel(),
    exog=X,
    family=sm.families.Binomial(),
).fit()
model_2.summary()

In [None]:
# Variance inflation factors of the RFE-selected (scaled) features, highest first.
selected_values = X_scaled[col].values
vif = pd.DataFrame()
vif['Features'] = X_scaled[col].columns
vif['VIF'] = [
    variance_inflation_factor(selected_values, position)
    for position in range(selected_values.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Predictions of the fitted GLM on the design matrix `X`
# (the most recent assignment of `X` is the RFE-selected training matrix with a constant).
y_train_pred= model_2.predict(X)
y_train_pred

In [None]:
# Put the actual training labels and the model's predictions side by side.
prediction = pd.DataFrame(
    dict(Train_actual=y_train.values, Train_pred=y_train_pred)
)
prediction.head()

In [None]:
#cut_off= [(i/10) for i in range(10)]

#for i in cut_off:
#    prediction[str(i)]= prediction.Train_pred.apply(lambda x : 1 if x >= i else 0)
#prediction.head()

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

# ROC AUC of the GLM predictions on the training split.
roc_auc_score(y_true=prediction['Train_actual'], y_score=prediction['Train_pred'])

### Prediction on test dataset

In [None]:
# Build the hold-out design matrix (selected features + intercept) and predict with the GLM.
X_test_pred = sm.add_constant(X_test_scaled[col])
y_test_pred = model_2.predict(exog=X_test_pred)
y_test_pred

In [None]:
# ROC AUC of the current predictions on the hold-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)

### As interpreting the variables is not required, we can use PCA for dimensionality reduction.

In [None]:
# Heat map of the absolute pairwise correlations between the scaled features.
abs_corr = X_scaled.corr().abs()
plt.figure(figsize=(12, 12))
sns.heatmap(abs_corr)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto 10 principal components and report the
# share of variance each component explains.
# `random_state` is fixed like for the other estimators in this notebook:
# depending on the scikit-learn version, the automatically chosen solver can be
# the randomized one, whose output otherwise varies between runs.
pca = PCA(n_components=10, random_state=42)
X_train_pca = pca.fit_transform(X_scaled)
ex_var = pca.explained_variance_ratio_
ex_var

In [None]:
# Total share of variance captured by the retained components.
ex_var.sum()

### 2. Using non-linear methods - Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree with a fixed seed.
tree = DecisionTreeClassifier(max_depth=8, random_state=42)

In [None]:
# Fit the decision tree on the scaled training split.
tree= tree.fit(X_scaled, y_train)

In [None]:
# Score with the predicted probability of the positive class, not hard labels:
# ROC AUC ranks observations by score, and thresholded 0/1 predictions reduce
# the curve to a single operating point and misreport the AUC.
y_train_pred = tree.predict_proba(X_scaled)[:, 1]
y_test_pred = tree.predict_proba(X_test_scaled)[:, 1]

In [None]:
# ROC AUC of the current predictions on the training split.
roc_auc_score(y_true=y_train, y_score=y_train_pred)

In [None]:
# ROC AUC of the current predictions on the hold-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)

### Using GridSearch

In [None]:
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

In [None]:
# 3-fold grid search over the tree depth, scored by ROC AUC.
# The estimator is seeded like every other tree in this notebook so that the
# cross-validation scores are reproducible between runs.
params = {'max_depth': [15, 20]}
n = 3
grid_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    cv=n,
    param_grid=params,
    scoring='roc_auc',
    return_train_score=True,
    verbose=1,
)

In [None]:
%time
grid_tree.fit(X_scaled, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_tree.best_params_

In [None]:
# Refit a seeded tree with the depth selected by the grid search. Reading it
# from `best_params_` keeps this cell in sync with the search instead of
# relying on a hard-coded depth.
tree = DecisionTreeClassifier(random_state=42, **grid_tree.best_params_)
tree = tree.fit(X_scaled, y_train)

In [None]:
# Score with the predicted probability of the positive class, not hard labels:
# ROC AUC ranks observations by score, and thresholded 0/1 predictions reduce
# the curve to a single operating point and misreport the AUC.
y_train_pred = tree.predict_proba(X_scaled)[:, 1]
y_test_pred = tree.predict_proba(X_test_scaled)[:, 1]

In [None]:
# ROC AUC of the current predictions on the training split.
roc_auc_score(y_true=y_train, y_score=y_train_pred)

In [None]:
# ROC AUC of the current predictions on the hold-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)

### 3. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 60 trees with a fixed seed.
forest = RandomForestClassifier(n_estimators=60, random_state=42)

In [None]:
# Fit the random forest on the scaled training split.
forest= forest.fit(X_scaled, y_train)

In [None]:
# Score with the predicted probability of the positive class, not hard labels:
# ROC AUC ranks observations by score, and thresholded 0/1 predictions reduce
# the curve to a single operating point and misreport the AUC.
y_train_pred = forest.predict_proba(X_scaled)[:, 1]
y_test_pred = forest.predict_proba(X_test_scaled)[:, 1]

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the current predictions on the training split.
roc_auc_score(y_true=y_train, y_score=y_train_pred)

In [None]:
# ROC AUC of the current predictions on the hold-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)

### Using Boosting Method
### 1. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble of 80 estimators with a fixed seed.
ada = AdaBoostClassifier(random_state=42, n_estimators=80)


In [None]:
# Fit the AdaBoost ensemble on the scaled training split.
ada= ada.fit(X_scaled, y_train)

In [None]:
# Score with the predicted probability of the positive class, not hard labels:
# ROC AUC ranks observations by score, and thresholded 0/1 predictions reduce
# the curve to a single operating point and misreport the AUC.
y_train_pred = ada.predict_proba(X_scaled)[:, 1]
y_test_pred = ada.predict_proba(X_test_scaled)[:, 1]

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the current predictions on the training split.
roc_auc_score(y_true=y_train, y_score=y_train_pred)

In [None]:
# ROC AUC of the current predictions on the hold-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)