# Tabular Playground Series - Mar 2021

## Dataset Description
For this competition, you will be predicting a binary target based on a number of feature columns given in the data. The feature columns cat0 - cat18 are categorical, and the feature columns cont0 - cont10 are continuous.

# Data Exploration

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,roc_curve, auc, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier

## Load Dataset

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-mar-2021/train.csv",
        "../input/tabular-playground-series-mar-2021/test.csv",
    )
)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

## Description

In [None]:
# Print a concise summary of the training frame.
df_train.info()

## Data type identification

In [None]:
# Show the column labels of the training frame.
df_train.columns

## Data numeric

In [None]:
# Dtype names treated as numeric when selecting columns from the training frame.
numeric = [
    'int8', 'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Keep only the columns whose dtype is one of the names above, then preview them.
df_num = df_train.select_dtypes(include=numeric)
df_num.head(n=3)

## Data categorical

In [None]:
# Keep only the object-typed columns of the training frame, then preview them.
df_cat = df_train.select_dtypes(include=['object'])
df_cat.head(n=3)

# Early Features Engineering

## Combining Train and Test Dataframes

The purpose of combining the dataframes is to avoid repeating every operation (such as transformations, imputations, etc.) done on the train set for the test set, and to have more data available for our analysis (the more data we have, the BETTER).

##### Store the number of rows or indexes for train and test dataset to separate them while performing modeling and prediction.

In [None]:
# Remember how many rows each source frame contributes, so the combined frame
# can later be separated into its train and test parts again.
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

# Stack the test rows underneath the training rows. Row labels are kept as
# they are in each source frame, so separate the parts by position, not label.
all_data = pd.concat([df_train, df_test])

In [None]:
# The row identifier is not used as a feature; remove it from the combined frame.
all_data = all_data.drop(columns='id')

# Data Preparation

## Missing Value 

In [None]:
# Per-column missing-value count and the same count as a percentage of all rows.
missing_count = all_data.isnull().sum()
missing_pct = all_data.isna().sum() / len(all_data) * 100

null = pd.DataFrame(missing_count, columns=["Null Values"])
null["% Missing Values"] = missing_pct

# Only columns that actually contain missing values are shown.
null = null.loc[null["% Missing Values"] > 0]
null.style.background_gradient(cmap='viridis', low=0.2, high=0.1)

# Exploratory Data Analysis

## Numerical Approach

### Statistical Summary

In [None]:
# Summary statistics restricted to the float/int columns of the training frame.
numeric_kinds = ['float64', 'int64', 'float', 'int']
describeNum = df_train.describe(include=numeric_kinds)

# One row per feature, shaded with a colour gradient for easier reading.
describeNum.transpose().style.background_gradient(cmap='viridis', low=0.2, high=0.1)

In [None]:
# Summary of the object-typed (categorical) columns of the training frame.
describeNumCat = df_train.describe(include=["O"])

# One row per feature, shaded with a colour gradient for easier reading.
describeNumCat.transpose().style.background_gradient(cmap='viridis', low=0.2, high=0.1)

### Categorical Value Counting

In [None]:
# Describing only the object columns yields one summary column per categorical
# feature; its column labels drive the loop below.
cats = df_train.describe(include=["O"])
for col in cats.columns:
    counts = df_train[col].value_counts()
    print(f'''Value count colunm {col}:''')
    # The extra newline reproduces the blank separator line between features.
    print(counts, end="\n\n")

## Graphic Approach

### Correlation heatmap

Next, we look at how the numeric variables correlate with one another.

Correlation is represented as a value between -1 and +1 where +1 indicates the highest positive correlation, -1 indicates the highest negative correlation, and 0 indicates no correlation.

In [None]:
# Names of the continuous feature columns: cont0 through cont10.
numeric = [f'cont{idx}' for idx in range(11)]

In [None]:
# Pairwise correlation between the continuous features.
df_train.loc[:, numeric].corr()

In [None]:
# Annotated heatmap of the correlations between the continuous features.
plt.figure(figsize=(30, 20))
corr_matrix = df_train[numeric].corr()
ax = sns.heatmap(corr_matrix, cmap='YlGnBu', annot=True)

# Move both y-limits by 0.5.
# NOTE(review): presumably a workaround for clipped top/bottom heatmap rows in
# some matplotlib versions -- confirm it is still needed.
y_bottom, y_top = ax.get_ylim()
ax.set_ylim(y_bottom + 0.5, y_top - 0.5)

### Target distribution analysis

In [None]:
# Target distribution shown two ways: absolute counts (left) and shares (right).

# Styling has to be configured before the axes are created to take effect on them.
plt.style.use('fivethirtyeight')
sns.set_context("paper", font_scale=1)

# Size the figure that actually holds the two axes; a separate plt.figure()
# call would only open a second, empty figure.
fig, ax = plt.subplots(1, 2, figsize=(10, 12))

# The column must be passed by keyword: recent seaborn releases no longer
# accept it as the first positional argument.
sns.countplot(x='target', data=all_data, ax=ax[0])

# value_counts() skips missing targets, so only rows that have a target value
# are counted. The explode list assumes exactly two target classes.
all_data['target'].value_counts().plot.pie(explode=[0, 0.2], autopct='%1.2f%%', ax=ax[1])

# plt.show() also works on non-interactive (inline) backends, where
# Figure.show() only emits a warning.
plt.show()

### Categorical Features

In [None]:
# Names of the categorical feature columns: cat0 through cat18.
features_cat = [f'cat{idx}' for idx in range(19)]

In [None]:
# Draw one bar chart of level frequencies for every categorical feature.
for f in features_cat:
    level_counts = df_train[f].value_counts()
    plt.figure(figsize=(14, 4))
    level_counts.plot.bar()
    plt.title(f)
    plt.grid()
    plt.show()

# Data Processing

## Outliers

We will try to detect outliers in the numeric features and then remove them. However, we will only remove outliers that are part of the train data, i.e. rows that come from the training set rather than the test set.

In [None]:
# Continuous feature columns (cont0 through cont10) to inspect for outliers.
featuresNum = [f'cont{idx}' for idx in range(11)]

# One vertical box plot per feature, laid out side by side in a single row.
plt.figure(figsize=(15, 7))
n_panels = len(featuresNum)
for position, feature in enumerate(featuresNum, start=1):
    plt.subplot(1, n_panels, position)
    sns.boxplot(y=df_train[feature], color='green', orient='v')
    plt.tight_layout()

## Label Encoding

For handling categorical data, we mostly use one of these two approaches:
 - OneHotEncoder
 - LabelEncoder

OneHotEncoder is used when the categories have no inherent order, and LabelEncoder when they do.

In [None]:
# Category letters per column, listed in code order: the letter at position k
# is replaced by the integer k. Letters missing from a string have no code and
# are left unchanged by replace().
category_levels = {
    'cat0': 'AB',
    'cat1': 'ABCDEFGHIJKLMNO',
    'cat2': 'ABCDEFGHIJKLMNOQRSU',
    'cat3': 'ABCDEFGHIJKLN',
    'cat4': 'ABCDEFGHIJKLMNOPQRST',
    'cat6': 'ABCDEFGIKMOQSUWY',
    'cat9': 'ABCDEFGIJLNOQRSUVWX',
    'cat11': 'AB',
    'cat12': 'AB',
    'cat13': 'AB',
    'cat14': 'AB',
    'cat15': 'ABCD',
    'cat16': 'ABCD',
    'cat17': 'ABCD',
    'cat18': 'ABCD',
}

for column, letters in category_levels.items():
    codes = {letter: code for code, letter in enumerate(letters)}
    all_data[column] = all_data[column].replace(codes)

In [None]:
# Remove the categorical columns that were not given integer codes above.
all_data = all_data.drop(columns=['cat5', 'cat7', 'cat8', 'cat10'])

In [None]:
# Preview the first five rows of the combined, encoded frame.
all_data.head(n=5)

# Feature Engineering

## Log Transforming all the Highly Skewed Features.

In [None]:
# ## Get all the numeric features in out dataset
# numeric_features = all_data.skew().index

# ## Getting all the skewed features (skew > 0.5 or skew < -0.5)
# skewed_features = all_data[numeric_features].skew()[np.abs(all_data[numeric_features].skew()) > 0.5].index

# ## Performing log(1+x) transformation
# all_data[skewed_features] = np.log1p(all_data[skewed_features])

## Getting the new train and test sets

In [None]:
# Number of leading rows of all_data that came from the training file.
N_TRAIN_ROWS = 300000

# Split by position, because row labels repeat across the two parts.
# .copy() gives each frame its own data, so later column assignments do not
# act on a slice of all_data (which triggers SettingWithCopyWarning).
df_train = all_data.iloc[:N_TRAIN_ROWS].copy()

# Test rows have no usable target; drop() returns a new frame, so no in-place
# modification of a slice is needed.
df_test = all_data.iloc[N_TRAIN_ROWS:].drop(columns='target')

In [None]:
# Store the target as integers. astype() on the whole frame returns a new
# DataFrame, which avoids assigning into a frame that may be a slice of
# all_data (the source of SettingWithCopyWarning).
df_train = df_train.astype({'target': int})

In [None]:
# Report, for every non-object training column, whether it holds any negative value.
c = df_train.select_dtypes(exclude=["object"]).columns

report_line = "Is there any negative value in '{}' column  : {} "
for column in c:
    has_negative = (df_train[column] < 0).any()
    print(report_line.format(column, has_negative))

In [None]:
# Report, for every non-object test column, whether it holds any negative value.
c = df_test.select_dtypes(exclude=["object"]).columns

report_line = "Is there any negative value in '{}' column  : {} "
for column in c:
    has_negative = (df_test[column] < 0).any()
    print(report_line.format(column, has_negative))

In [None]:
# Discard training rows with a negative value in cont0, cont3 or cont5.
# A single boolean mask keeps exactly the rows the three successive drops kept.
negative_in_train = (df_train[['cont0', 'cont3', 'cont5']] < 0).any(axis=1)
df_train = df_train[~negative_in_train]

# The test set is deliberately left untouched: every test row needs a
# prediction, so deleting rows from df_test would leave the predictions
# incomplete.

# Modeling

In [None]:
# Separate the label column from the predictor columns.
y = df_train['target']
X = df_train.drop(columns=['target'])

In [None]:
# Hold out 30% of the training rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Random Forest Classifier

In [None]:
# Random forest with default hyper-parameters, evaluated on the hold-out split.
RFC = RandomForestClassifier()

RFC.fit(X_train, y_train)

# Hard class predictions feed the confusion matrix and classification report.
y_pred_rf = RFC.predict(X_test)
# Scores for the second entry of RFC.classes_ feed the ROC curve: computing it
# from hard 0/1 predictions collapses the curve to a single threshold.
# NOTE(review): assumes a binary target encoded as 0/1 -- confirm.
y_score_rf = RFC.predict_proba(X_test)[:, 1]

print("Training Accuracy :", RFC.score(X_train, y_train))
print("Testing Accuracy :", RFC.score(X_test, y_test))

cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_pred_rf)
print(cr)


print("------------------------------------------")

# Area under the ROC curve, computed from the probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

### Gradient boosting

In [None]:
# Gradient-boosted trees, evaluated on the hold-out split.
# The loss is left at its default (the binomial log-loss). The spelling
# "deviance" named that same default objective, but it was deprecated in
# scikit-learn 1.1 and removed in 1.3, so passing it fails on current releases.
GBC = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=0.1,
                                 max_depth=8,
                                 min_samples_leaf=100,
                                 max_features=0.1)

GBC.fit(X_train,y_train)

# Hard class predictions feed the confusion matrix and classification report.
# (The variable name is shared with the other model cells.)
y_pred_rf = GBC.predict(X_test)
# Scores for the second entry of GBC.classes_ feed the ROC curve: computing it
# from hard 0/1 predictions collapses the curve to a single threshold.
# NOTE(review): assumes a binary target encoded as 0/1 -- confirm.
y_score_gb = GBC.predict_proba(X_test)[:, 1]

print("Training Accuracy :", GBC.score(X_train, y_train))
print("Testing Accuracy :", GBC.score(X_test, y_test))

cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_pred_rf)
print(cr)


print("------------------------------------------")

# Area under the ROC curve, computed from the probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_gb)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)

### ExtraTrees 

In [None]:
# Extremely randomised trees with default hyper-parameters, evaluated on the
# hold-out split.
ExtC = ExtraTreesClassifier()

ExtC.fit(X_train,y_train)

# Hard class predictions feed the confusion matrix and classification report.
# (The variable name is shared with the other model cells.)
y_pred_rf = ExtC.predict(X_test)
# Scores for the second entry of ExtC.classes_ feed the ROC curve: computing it
# from hard 0/1 predictions collapses the curve to a single threshold.
# NOTE(review): assumes a binary target encoded as 0/1 -- confirm.
y_score_ext = ExtC.predict_proba(X_test)[:, 1]

print("Training Accuracy :", ExtC.score(X_train, y_train))
print("Testing Accuracy :", ExtC.score(X_test, y_test))

cm = confusion_matrix(y_test, y_pred_rf)
plt.rcParams['figure.figsize'] = (3, 3)
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = '.8g')
plt.show()

cr = classification_report(y_test, y_pred_rf)
print(cr)


print("------------------------------------------")

# Area under the ROC curve, computed from the probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_ext)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("ROC Curves              =",roc_auc)