In [None]:
import pandas as pd
import numpy as np

## Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

## ML Libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# Load just the competition's training CSV into a DataFrame for now
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-nov-2021/train.csv"
)

In [None]:
## Lift pandas' display truncation so every column and every row is rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
## Preview the first five rows of the training frame
df.head(n=5)

In [None]:
## Shape of the dataframe as a (number of rows, number of columns) tuple
df.shape

In [None]:
## Tally the rows per target value to see whether the classes are imbalanced
df.loc[:, "target"].value_counts()

#### Finding Correlation

In [None]:
## Pairwise correlation between all columns of the frame, drawn as a heatmap
corr = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(35, 30))
sns.heatmap(corr, cmap="Greens", ax=corr_ax)

##### Finding the correlation of every column with the target column

In [None]:
## Correlation of each column of the frame with the target column
corr_target = df.corrwith(other=df["target"])

In [None]:
## Columns whose correlation with the target is positive, strongest first
corr_target.loc[corr_target.gt(0)].sort_values(ascending=False)

In [None]:
## Columns whose correlation with the target is negative, most negative first
corr_target.loc[corr_target.lt(0)].sort_values(ascending=True)

##### Finding highly correlated columns

In [None]:
## Absolute pairwise correlations. Reuses the `corr` matrix already computed
## for the heatmap instead of running the expensive df.corr() a second time.
c = corr.abs()
## Remove "id" from both axes (presumably it is just a row identifier)
c = c.drop(index="id", columns="id")
## Flatten to a Series keyed by (column, column). Every pair shows up twice and
## each column is also paired with itself (the matrix diagonal).
s = c.unstack()
so = s.sort_values(kind="quicksort")

## The sort is ascending, so these are the 20 WEAKEST absolute correlations
so.iloc[:20]

In [None]:
## The very end of `so` is each column correlated with itself, so skip the last
## 102 entries and show the 48 strongest correlations that come just before them
so.iloc[-150:-102]

Note: None of the columns are highly correlated with each other, so we can't drop any of them as redundant. We can, however, build a model using the top 20 columns that are most strongly correlated with the target variable.

### Feature Importance and Feature Selection With XGBoost

In [None]:
## Hold out 20% of the rows for validation, then split a further 20% of the
## remainder off as a test set (64% train / 16% test / 20% validation overall).
## A fixed seed makes the split reproducible across kernel restarts, and
## stratifying on the label keeps the class ratio the same in every subset.
SEED = 42
train, validation = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["target"]
)
train, test = train_test_split(
    train, test_size=0.2, random_state=SEED, stratify=train["target"]
)

In [None]:
## Preparing the data to be used in training the dataset
train_df = train.drop(["id","target"], axis = 1)
train_df_y = train["target"]

test_df = test.drop(["id", "target"], axis = 1)
# train_df_y = train["target"]

valid_df = validation.drop(["id", "target"], axis = 1)
valid_df_y = validation["target"]

In [None]:
## Scaling the data
scaler = MinMaxScaler()
scaler.fit(train_df)
train_df = pd.DataFrame(scaler.transform(train_df))
valid_df = pd.DataFrame(scaler.transform(valid_df))
test_df = pd.DataFrame(scaler.transform(test_df))

In [None]:
## Building XGboost classifier
XGB = xgb.XGBClassifier(max_depth = 10,
                       learning_rate = 0.01,
                       objective = 'binary: logistic',
                       gamma = 0.6,
                       min_child_weight = 8) 

In [None]:
## Preview the first five rows of the scaled training features
train_df.head(n=5)

In [None]:
## Fit the classifier on the scaled training features and their labels.
## Not enabled yet: validation-based early stopping (an eval_set built from
## valid_df / valid_df_y, the 'auc' metric, 15 rounds of patience, verbose logs).
XGB_ = XGB.fit(X=train_df, y=train_df_y)

# Work is still in progress. Please check back for updates.

#### If you liked the work so far, please support it with an upvote. If you have any suggestions or feedback, leave a comment and I will work on it.