In [None]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Read the training set and discard the 'Unnamed: 0' column in one step
# (presumably a row index written out by an earlier to_csv - confirm).
train = pd.read_csv('./training_set.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Print the frame summary: column names, non-null counts, dtypes, memory usage.
train.info()

In [None]:
# Grid of kernel-density plots, one panel per numeric column of `train`.
# The grid is fixed at 20 x 3 = 60 panels; columns beyond the 60th are not drawn.
fig, ax = plt.subplots(nrows=20, ncols=3, figsize=(10, 25))
panels = ax.ravel()

# Only numeric columns can be density-estimated, so select them up front.
# This replaces a bare `except: break` that hid every error and, after one
# failing column, kept retrying that same column so no later one was plotted.
numeric_cols = train.select_dtypes(include='number').columns

for panel, col in zip(panels, numeric_cols):
    try:
        sns.kdeplot(train[col], ax=panel)
    except (ValueError, np.linalg.LinAlgError) as err:
        # Stay best-effort (e.g. a degenerate column), but make the failure
        # visible on the panel and carry on with the next column.
        panel.set_title(f'{col}: KDE failed ({type(err).__name__})')

# Hide the panels that did not receive a column.
for panel in panels[len(numeric_cols):]:
    panel.set_visible(False)

# Feature Selection / Dimensionality Reduction

I have tried several methods for feature selection; a summary of the methods is outlined below:

1. Univariate Feature Selection
    - Multicollinearity Analysis Using Heat map and VIF
    - ANOVA F-test
    - Mutual Information
2. Recursive Feature Elimination (Top down approach)
    - Logistic Regression
    - LinearSVC
3. Sequential Feature Selection (Bottom up approach)
    - KNeighborsClassifier
    - Tree based Models
4. Principal Component Analysis (PCA)

## 1.A Multicollinearity Analysis Using Correlation Matrix and VIF

In [None]:
# Pairwise correlations between the predictors (target 'Y' excluded).
corr = train.drop(columns=['Y']).corr()

# The matrix is symmetric, so mask the upper triangle (diagonal included)
# and draw only the lower half.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr, mask=mask, cmap=cmap, annot=False, ax=ax)
ax.set_title('Correlation Heatmap')

From the plot above we see that features X34 and X32 are highly correlated, and that X40 is also correlated with both X32 and X34. For regression analysis it is important to drop highly correlated columns, so I have decided to drop X34 from further analysis. I also confirm the same observation by calculating the VIF, where each column is regressed on all the others: X34 and X32 both have high VIF values, which supports dropping one of them. The same relationship is also evident from the pair plots below.

In [None]:
# Variance inflation factor (VIF) of every predictor (target 'Y' excluded).
X = train.drop(columns=['Y'])
cols = X.columns.values

# variance_inflation_factor regresses column i on the other columns of the
# matrix it is given and does not add an intercept itself. Without a constant
# column those auxiliary regressions are forced through the origin, which
# distorts the VIF of any feature with a non-zero mean - so prepend one here.
design = np.column_stack([np.ones(len(X)), X.values])

# Column 0 is the intercept; report VIFs for the real features only.
vif = [variance_inflation_factor(design, i) for i in range(1, design.shape[1])]

f, ax = plt.subplots(figsize=(20, 10))
ax.bar(cols, vif)
ax.set_title('VIF plot: Columns vs VIF')

In [None]:
# Pair plot (KDE on the diagonal) of the three features discussed in the
# correlation analysis: X32, X34 and X40.
# NOTE: sns.set() changes the global plotting theme for all later figures.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
cData_attr = train[['X32', 'X34', 'X40']]
g = sns.pairplot(cData_attr, diag_kind='kde')
# Plain title text: quote characters nested inside the string literal would be
# rendered literally on the figure.
g.fig.suptitle('Pair plot of X32, X34 and X40', y=1.08)

In [None]:
# Drop one feature out of every pair whose |correlation| exceeds 0.9.
# Only the strict upper triangle (k=1) is kept, so each pair is inspected once
# and the diagonal of self-correlations is ignored.
# abs(): a strong negative correlation is just as redundant as a positive one.
upper = corr.abs().where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
train = train.drop(columns=to_drop, errors='ignore')
train.shape

## 1.B, 1.C ANOVA and Mutual Information

In [None]:
X = train.drop(columns=['Y'])
y = train['Y']

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): the selector below is fitted on the full X / y, not on X_train,
# so the held-out rows influence which features are chosen - confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=4535, stratify=y, shuffle=True)

# Let us select 30 features
select = SelectKBest(f_classif, k=30)
select.fit(X, y)

# Significance score = -log10(p-value), rescaled so the strongest feature is 1.
# A p-value can underflow to exactly 0 for a very strong feature; clip it to
# the smallest positive float so log10 stays finite and the normalisation
# below does not turn the scores into NaN / 0.
pvalues = np.clip(select.pvalues_, np.finfo(float).tiny, None)
scores = -np.log10(pvalues)
scores /= scores.max()

# Pair each column with its score, order by descending score, then split the
# pairs back into [names, scores] for plotting.
feature_importance = sorted(zip(X.columns.values, scores), key=lambda x: x[1], reverse=True)
feature_importance = [list(t) for t in zip(*feature_importance)]

f, ax = plt.subplots(figsize=(20, 10))
ax.bar(feature_importance[0][:30], feature_importance[1][:30])
ax.set_title('Plot representing the level of significance of 30 best features using ANOVA')

In [None]:
#30 best features according to Mutual information
# mutual_info_classif takes a random_state, so pin it to keep the ranking
# reproducible between runs.
select = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=4535),
    k=30,
)
select.fit(X, y)

# Take the scores from THIS selector. Re-normalising the existing `scores`
# array would plot the previous cell's ANOVA values under a Mutual Information
# title.
scores = select.scores_ / select.scores_.max()

# Pair each column with its score, order by descending score, then split the
# pairs back into [names, scores] for plotting.
feature_importance = sorted(zip(X.columns.values, scores), key=lambda x: x[1], reverse=True)
feature_importance = [list(t) for t in zip(*feature_importance)]

f, ax = plt.subplots(figsize=(20, 10))
ax.bar(feature_importance[0][:30], feature_importance[1][:30])
ax.set_title('Plot representing the level of significance of 30 best features using Mutual Information')

## 2.A Recursive Feature Elimination using weights of Logistic Regression Model

In [None]:
# RFE ranks features by the magnitude of the fitted coefficients, which is
# only comparable across features that share a scale, so rescale first.
scaled_X = MinMaxScaler().fit_transform(X)
estimator = LogisticRegression()
selectorr = RFE(estimator,n_features_to_select=30, step=1)
selectorr = selectorr.fit(scaled_X, y)
rank = selectorr.ranking_

# ranking_ is 1 for every retained feature and grows for features eliminated
# earlier. Sort ASCENDING so the first 30 entries are the 30 selected features;
# a descending sort would plot the earliest-eliminated ones instead.
feature_importance = list(zip(X.columns.values, rank))
feature_importance.sort(key = lambda x: x[1])
feature_importance = [list(t) for t in zip(*feature_importance)]

f, ax = plt.subplots(figsize=(20, 10))
ax.bar(feature_importance[0][:30], feature_importance[1][:30])
ax.set_ylabel('RFE rank (1 = selected)')
ax.set_title('Selected Features according to RFE and Logistic Regression')

In [None]:
# Same recursive elimination as with logistic regression, but driven by the
# coefficients of a linear SVM. Reuses `scaled_X` from the previous cell.
estimator = LinearSVC()
selectorr = RFE(estimator,n_features_to_select=30, step=1)
selectorr = selectorr.fit(scaled_X, y)
rank = selectorr.ranking_

# ranking_ is 1 for every retained feature and grows for features eliminated
# earlier. Sort ASCENDING so the first 30 entries are the 30 selected features;
# a descending sort would plot the earliest-eliminated ones instead.
feature_importance = list(zip(X.columns.values, rank))
feature_importance.sort(key = lambda x: x[1])
feature_importance = [list(t) for t in zip(*feature_importance)]

f, ax = plt.subplots(figsize=(20, 10))
ax.bar(feature_importance[0][:30], feature_importance[1][:30])
ax.set_ylabel('RFE rank (1 = selected)')
ax.set_title('Selected Features according to RFE and LinearSVC')

## 3. Sequential Feature Selection using KNeighborsClassifier and DecisionTreeClassifier

In [None]:
%%time
estimator = KNeighborsClassifier(weights='distance')
sfs = SequentialFeatureSelector(estimator, n_features_to_select=30, n_jobs=-1, cv=2, scoring='accuracy')
sfs.fit(X, y)
fi_Neighbors = X.columns.values[sfs.support_]
fi_Neighbors

In [None]:
%%time
estimator = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(estimator, n_features_to_select=30, n_jobs=-1, cv=2, scoring='accuracy')
sfs.fit(X, y)
fi_tree_fwd = X.columns.values[sfs.support_]
fi_tree_fwd

In [None]:
%%time
from sklearn.tree import ExtraTreeClassifier
estimator = ExtraTreeClassifier()
sfs = SequentialFeatureSelector(estimator, direction='forward',n_features_to_select=30, n_jobs=-1, cv=2, scoring='accuracy')
sfs.fit(X, y)
fi_tree_bwd = X.columns.values[sfs.support_]
fi_tree_bwd

# PCA

In [None]:
# Fit a 30-component PCA on the feature matrix, project the data onto those
# components, and display the share of total variance they retain.
pca = PCA(n_components=30).fit(X)
pca_x = pca.transform(X)
sum(pca.explained_variance_ratio_)