In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#sklearn
from sklearn.metrics import confusion_matrix

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
churn_df = pd.read_csv("../input/WA_Fn-UseC_-Telco-Customer-Churn.csv")
churn_df.head()

In [None]:
print("Shape=",churn_df.shape)

In [None]:
churn_df.isna().any()

In [None]:
churn_df.dtypes # We see TotalCharges is a string need to be converted into float

In [None]:
# TotalCharges is read as text and contains blank entries; blanks become 0.0.
# Going through `str` first keeps this cell re-runnable: on a second execution the column
# is already float, and the original `x.strip()` would raise AttributeError.
total_charges_text = churn_df["TotalCharges"].astype(str).str.strip()
churn_df["TotalCharges"] = total_charges_text.replace("", "0.0").astype(float)

In [None]:
"""
Plan of Action : We will not use exploratory analysis approach in this study

1. First split the data using RepeatedStratifiedKFold into 5 splits, repetition allowed.
2. Build the initial Machine Learning Framework (also include Customer ID): Algorithms used KNN, SVM, RandomForest, ExtraTreeForest, XGB (the results are not good enough)
3. Extract Features (Not Models): transform the raw data using PCA, NCA, SelectKBest, LDA, SelectFromModel
4. Build the second Machine Learning Framework: Algorithms used KNN, SVM, RandomForest, ExtraTreeForest, XGB (the results are not good enough)
5. Stack Features again, but using Count Encoder, Percentile Encoder, Likelihood Encoder (given by Far0n/kaggletils)
6. Build the third Machine Learning Framework: Algorithms used KNN, SVM, RandomForest, XGB. RandomForest is giving promising results
7. Grid Search to find the best parameters for RandomForest (by running on 1 fold of the data)
"""

In [None]:
# Pull the whole frame out as one array, then slice it:
#   X = every column but the last (customerID, the first column, is still included)
#   y = the last column (the target)
churn_values = churn_df.values
X, y = churn_values[:, :-1], churn_values[:, -1]

In [None]:
X[0]

In [None]:
#RepeatStratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

# 5 stratified folds repeated 10 times. The (train, test) index pairs are materialised once
# so that every model below is scored on exactly the same splits.
rksf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
training_test_split = [
  (train_index, test_index) for train_index, test_index in rksf.split(X, y)
]

In [None]:
len(training_test_split[0][0]), len(training_test_split[0][1])

In [None]:
# Column name -> positional index, used to address columns of the raw NumPy array
column_map = dict(zip(churn_df.columns, range(len(churn_df.columns))))

In [None]:
# Numeric features - Standardized
# Categorical features - One Hot Encoded
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# X is a plain np.array (not a DataFrame), so columns are addressed by position:
# translate the column names into their indices through column_map.
numeric_features = list(map(column_map.__getitem__, ('tenure', 'MonthlyCharges', 'TotalCharges')))
categorical_feature = list(map(column_map.__getitem__, (
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
)))

In [None]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# Columns listed in neither group are handled by ColumnTransformer's `remainder` setting,
# which is left at its default here.
transformer_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_feature),
]
preprocess = ColumnTransformer(transformers=transformer_steps)

In [None]:
# Fit the scaler / one-hot encoder and transform the full feature matrix in one go.
# NOTE(review): the preprocessing statistics are learned from ALL rows, i.e. before the
# cross-validation split, so each test fold has influenced the scaling it is scored with.
# Wrapping `preprocess` and the model in a Pipeline fitted per fold would avoid that.
X_transform = preprocess.fit_transform(X)
print("X_transform Shape", X_transform.shape)
# Peek at the first transformed row
X_transform[0]

In [None]:
# Apply knn algorithm to predict

from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 3 # Hyper parameter
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
acc_knn = []
# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  knn.fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_knn.append(knn.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], knn.predict(X_transform[test_indices, :])))

acc_knn = np.array(acc_knn)
print(" Knn with neighbors={0}, accuracy={1}, {2}".format(n_neighbors, acc_knn.mean(), acc_knn.std()))

In [None]:
#lets try increasing k to see any effect
n_neighbors = 10 # Hyper parameter
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
acc_knn = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  knn.fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_knn.append(knn.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], knn.predict(X_transform[test_indices, :])))

# Not much change
acc_knn = np.array(acc_knn)
print(" Knn with neighbors={0}, accuracy={1},{2}".format(n_neighbors, acc_knn.mean(), acc_knn.std()))

# Doesnt look promising :(

In [None]:
# Apply SVM algo
from sklearn import svm

clf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
acc_svm = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_svm.append(clf.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_transform[test_indices, :])))

acc_svm = np.array(acc_svm)
print(" svm with kernel={0}, accuracy={1},{2}".format('rbf', acc_svm.mean(), acc_svm.std()))

In [None]:
from sklearn.ensemble import RandomForestClassifier

acc_rf = []
clf = RandomForestClassifier(n_estimators=500, min_samples_split=5, random_state=42)

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_rf.append(clf.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_transform[test_indices, :])))

acc_rf = np.array(acc_rf)
# Report mean and std of the fold accuracies (previously n_neighbors was printed as the accuracy).
print(" RF with n_estimators=500, accuracy={0},{1}".format(acc_rf.mean(), acc_rf.std()))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

acc_et = []
clf = ExtraTreesClassifier(n_estimators=500, min_samples_split=5, random_state=42)

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_et.append(clf.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_transform[test_indices, :])))

acc_et = np.array(acc_et)
# Report mean and std of the fold accuracies (previously n_neighbors was printed as the accuracy).
print(" ERF with n_estimators=500, accuracy={0},{1}".format(acc_et.mean(), acc_et.std()))

In [None]:
import xgboost as xgb
acc_xgb = []

# NOTE(review): recent xgboost releases may reject string class labels such as those in `y`;
# if fit() raises on the labels, encode y (e.g. with LabelEncoder) first — confirm against
# the installed xgboost version.
# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  # A fresh classifier with default settings is trained for every fold
  clf = xgb.XGBClassifier().fit(X_transform[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_xgb.append(clf.score(X_transform[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_transform[test_indices, :])))

acc_xgb = np.array(acc_xgb)
print(" XGB with accuracy={0},{1}".format(acc_xgb.mean(), acc_xgb.std()))

**If I had to stop here and choose a model, it is unclear which one I should pick.**

**On second thought, we will drop ExtraTreesClassifier from further analysis, since its results are similar to RF.**

**Now let's transform our raw data; later on we will exclude the customer_id field and try again.**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LassoCV

# Five feature extractors / selectors whose outputs are concatenated column-wise.
pca = PCA(n_components=5)
skb = SelectKBest(mutual_info_classif, k=2)
nca = NeighborhoodComponentsAnalysis(n_components=3, random_state=42)
# LDA can produce at most min(n_classes - 1, n_features) components. The union is fitted on a
# two-class target, so only 1 is valid; the previous n_components=3 is rejected (or silently
# clipped with a warning, depending on the scikit-learn version).
lda = LinearDiscriminantAnalysis(n_components=1)
# Lasso coefficients drive SelectFromModel: features whose importance is below 0.25 are dropped.
clf = LassoCV(cv=4)
sfm = SelectFromModel(clf, threshold=0.25)

union = FeatureUnion(
    [
        ("pca", pca),
        ("skb", skb),
        ("nca", nca),
        ("lda", lda),
        ("sfm", sfm)
    ]
)

In [None]:
# Fit every extractor in the union on the preprocessed features, with the target
# label-encoded to integers.
# NOTE(review): SelectKBest, NCA, LDA and SelectFromModel all use the labels, and they are
# fitted here on ALL rows; the cross-validation scores computed afterwards on X_feature are
# therefore optimistic. Fitting the union inside each training fold would avoid this.
union.fit(X_transform, LabelEncoder().fit_transform(y)) # We also come to know that Variables are collinear

In [None]:
X_feature = union.transform(X_transform)
print("X_feature shape", X_feature.shape)
X_feature[0]

In [None]:
# Apply knn algorithm to predict

from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 3 # Hyper parameter
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
acc_knn = []
# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  knn.fit(X_feature[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_knn.append(knn.score(X_feature[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], knn.predict(X_feature[test_indices, :])))

acc_knn = np.array(acc_knn)
print(" Knn with neighbors={0}, accuracy={1}, {2}".format(n_neighbors, acc_knn.mean(), acc_knn.std()))

In [None]:
# Apply SVM algo
from sklearn import svm

clf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
acc_svm = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_feature[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_svm.append(clf.score(X_feature[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_feature[test_indices, :])))

acc_svm = np.array(acc_svm)
print(" svm with kernel={0}, accuracy={1},{2}".format('rbf', acc_svm.mean(), acc_svm.std()))



In [None]:
from sklearn.ensemble import RandomForestClassifier

acc_rf = []
clf = RandomForestClassifier(n_estimators=500, min_samples_split=5, random_state=42)

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_feature[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_rf.append(clf.score(X_feature[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_feature[test_indices, :])))

acc_rf = np.array(acc_rf)
# Report mean and std of the fold accuracies (previously n_neighbors was printed as the accuracy).
print(" RF with n_estimators=500, accuracy={0},{1}".format(acc_rf.mean(), acc_rf.std()))


In [None]:
import xgboost as xgb
acc_xgb = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  # A fresh classifier with default settings is trained for every fold
  clf = xgb.XGBClassifier().fit(X_feature[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_xgb.append(clf.score(X_feature[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_feature[test_indices, :])))

acc_xgb = np.array(acc_xgb)
print(" XGB with accuracy={0},{1}".format(acc_xgb.mean(), acc_xgb.std()))


**It seems SVM did something weird: it was able to predict one class quite well.**

In [None]:
"""
Lets merged the raw and transformed data to see if we get any benefits
"""
# Column-wise concatenation: preprocessed features first, FeatureUnion outputs after them
X_merged = np.concatenate([X_transform, X_feature], axis=1)
print("X_merged shape", X_merged.shape)


In [None]:
"""
We will only try 2 Estimators SVM and RF to see if there is any difference in merging datasets
"""
from sklearn import svm

clf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
acc_svm = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_merged[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_svm.append(clf.score(X_merged[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_merged[test_indices, :])))

acc_svm = np.array(acc_svm)
print(" svm with kernel={0}, accuracy={1},{2}".format('rbf', acc_svm.mean(), acc_svm.std()))

In [None]:
from sklearn.ensemble import RandomForestClassifier

acc_rf = []
clf = RandomForestClassifier(n_estimators=500, min_samples_split=5, random_state=42)

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_merged[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_rf.append(clf.score(X_merged[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_merged[test_indices, :])))

acc_rf = np.array(acc_rf)
# Report mean and std of the fold accuracies (previously n_neighbors was printed as the accuracy).
print(" RF with n_estimators=500, accuracy={0},{1}".format(acc_rf.mean(), acc_rf.std()))


In [None]:
"""
hmm doesnt look like it makes any difference :(
"""

**Second Level of Transformation**
The reason is that our features are **collinear**, so the feature transformations above are not great.

Here we use CounterEncoder, PercentileEncoder and LikelihoodEstimator to transform the data and then reapply the algorithms.
These transformations can be viewed at https://github.com/Far0n/kaggletils

In [None]:
# Re-extract features and target from the frame.
# NOTE: this slice only drops the last (target) column, so customerID (column 0) is still
# part of X; the positional indices in column_map rely on that.
X = churn_df.values[:, :-1]
y = churn_df.values[:, -1]

In [None]:
from collections import Counter

import numpy as np
from scipy.stats import norm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import scipy
#from statsmodels.distributions import ECDF

In [None]:
class CounterEncoder(BaseEstimator, TransformerMixin):
  """Replace each value of every column by how often it occurred in the fitted data.

  Parameters
  ----------
  min_count : int, default 0
      When > 0, values seen fewer than ``min_count`` times are encoded as ``nan_value``.
  nan_value : scalar, default -1
      Code used for rare values and for values that were not seen during ``fit``.
  copy : bool, default True
      If True, ``transform`` works on a copy of its input; otherwise the input is
      overwritten where possible.

  Attributes
  ----------
  counts : dict
      Maps column position -> {value: count} as learned by ``fit``.
  """

  def __init__(self, min_count=0, nan_value=-1, copy=True):
    # Stored under the constructor argument names so that BaseEstimator's
    # get_params / set_params / clone / repr can find them.
    self.min_count = min_count
    self.nan_value = nan_value
    self.copy = copy
    self.counts = {}

  # Read-only aliases for the attribute names used by the earlier version of this class.
  @property
  def min_cnt(self):
    return self.min_count

  @property
  def nans(self):
    return self.nan_value

  @property
  def cp(self):
    return self.copy

  def is_numpy(self, x):
    """Return True if ``x`` is a NumPy array (anything else is treated as pandas)."""
    return isinstance(x, np.ndarray)

  def _as_2d(self, x):
    """Return ``x`` with one column per feature (1-D input becomes a single column)."""
    if len(x.shape) != 1:
      return x
    return x.reshape(-1, 1) if self.is_numpy(x) else x.to_frame()

  def fit(self, x, y=None):
    """Learn the per-column value counts of ``x``; ``y`` is accepted and ignored."""
    self.counts = {}
    x = self._as_2d(x)
    is_np = self.is_numpy(x)

    for i in range(x.shape[1]):
      if is_np:
        cnt = dict(Counter(x[:, i]))
      else:
        cnt = x.iloc[:, i].value_counts().to_dict()

      if self.min_count > 0:
        # Rare values are flagged with nan_value instead of their (small) count
        cnt = {k: (self.nan_value if v < self.min_count else v) for k, v in cnt.items()}

      self.counts[i] = cnt
    return self

  def fit_transform(self, x, y=None):
    """Fit on ``x`` and return its count-encoded version."""
    self.fit(x)
    return self.transform(x)

  def transform(self, x):
    """Return ``x`` with every value replaced by its fitted count.

    Values that were not seen during ``fit`` are encoded as ``nan_value``.
    """
    xm = self._as_2d(x.copy() if self.copy else x)
    is_np = self.is_numpy(xm)

    # Fixed-width string arrays cannot hold integer counts; switch to object dtype
    # (this necessarily creates a new array, even when copy=False).
    if is_np and xm.dtype.kind in "US":
      xm = xm.astype(object)

    for i in range(xm.shape[1]):
      cnt = self.counts[i]

      if is_np:
        # A plain dict lookup works for any hashable value and handles unseen values
        xm[:, i] = [cnt.get(value, self.nan_value) for value in xm[:, i]]
      else:
        name = xm.columns[i]
        xm[name] = xm[name].map(lambda value: cnt.get(value, self.nan_value))
    return xm

In [None]:
# Positional indices of the numeric and categorical columns in X, looked up via column_map
numeric_features = list(map(column_map.__getitem__, ('tenure', 'MonthlyCharges', 'TotalCharges')))
categorical_feature = list(map(column_map.__getitem__, (
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
)))

In [None]:
# Step 1: integer-encode every categorical column (the other columns are left untouched)
X_label = X.copy()
for col_idx in categorical_feature:
  X_label[:, col_idx] = LabelEncoder().fit_transform(X[:, col_idx])

# Step 2: replace each category code by its frequency in the data
counter = CounterEncoder()
X_label_count = counter.fit_transform(X_label[:, categorical_feature])

X_label_count[:3, :]# It captures the occurence of each categorical variable

In [None]:
from sklearn.utils import check_X_y, check_array

def is_numpy(x):
    """Tell whether ``x`` is a NumPy ndarray (instances of ndarray subclasses count too)."""
    return issubclass(type(x), np.ndarray)
  
class LikelihoodEstimator(BaseEstimator):
    """Estimate P(class | feature values) from the class counts of each distinct feature row.

    Parameters
    ----------
    seed : int, default 0
        Seed passed to ``np.random.seed`` before any noise is drawn.
    alpha : float, default 0
        Smoothing strength: the class priors, weighted by ``alpha``, are added to the counts.
    noise : float, default 0
        Standard deviation of the Gaussian noise added to the likelihoods (0 disables it).
    leave_one_out : bool, default False
        If True, each training row's own label is removed from the counts used for
        ``x_likelihoods``.

    Attributes set by ``fit``
    -------------------------
    classes, nclass : the sorted unique labels and how many there are.
    class_priors : array with one prior per class.
    x_likelihoods : array (n_samples, nclass) with the likelihoods of the training rows.
    likelihoods : DataFrame with one row per distinct feature combination; the feature
        columns come first, followed by ``nclass`` likelihood columns named 0..nclass-1.
    """

    def __init__(self, seed=0, alpha=0, noise=0, leave_one_out=False):
        self.alpha = alpha
        self.noise = noise
        self.seed = seed
        self.leave_one_out = leave_one_out
        self.nclass = None
        self.classes = None
        self.class_priors = None
        self.likelihoods = None
        self.x_likelihoods = None

    def fit(self, x, y):
        """Build the likelihood table from numeric features ``x`` and labels ``y``."""
        # NOTE: this reseeds NumPy's global random generator.
        np.random.seed(self.seed)
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)

        x, y = check_X_y(x, y)

        self.classes = np.unique(y)
        self.nclass = self.classes.shape[0]

        # One row per distinct feature combination, one count column per class
        ctab = pd.crosstab(y, list(x.T)).T.reset_index()

        xdim = x.shape[1]
        xcols = list(ctab.columns[:xdim])
        ycols = list(ctab.columns[xdim:])

        # Attach the class counts of its feature combination to every training row
        xtab = pd.DataFrame(x, columns=xcols)
        xtab = xtab.merge(ctab, how='left', on=xcols)

        self.class_priors = xtab[ycols].div(xtab[ycols].sum(axis=1), axis=0).mean().values

        if self.leave_one_out:
            # Remove each row's own label from its counts
            xtab[ycols] -= pd.get_dummies(y)

        # Smoothed likelihoods; the tiny constant guards against division by zero
        xtab[ycols] = xtab[ycols].add(self.class_priors * self.alpha). \
            div(xtab[ycols].sum(axis=1) + self.alpha + 1E-15, axis=0)
        if self.noise > 0:
            # `normal` was an undefined name; the NumPy generator seeded above is used
            jitter = np.random.normal(0, scale=self.noise, size=xtab[ycols].shape)
            xtab[ycols] = np.abs(xtab[ycols] + jitter)
            xtab[ycols] = xtab[ycols].div(xtab[ycols].sum(axis=1), axis=0)
        self.x_likelihoods = xtab[ycols].values

        # Mean likelihood per distinct feature combination. This replaces the former
        # agg(['mean']) / `.ix` chain (`.ix` no longer exists in pandas) and produces the
        # same layout: feature columns first, then class columns named 0..nclass-1.
        likelihood_table = xtab.groupby(xcols)[ycols].mean().fillna(0).reset_index()
        likelihood_table.columns = xcols + list(range(self.nclass))
        self.likelihoods = likelihood_table

        return self

    def _calc_likelihood(self, x):
        """Smoothed likelihood for one vector of class counts ``x``."""
        return (x + self.class_priors * self.alpha) / (x.sum() + self.alpha)

    def _get_likelihood(self, x, noise):
        """Class likelihoods for a single feature row ``x``.

        Not called by the public methods. The earlier version read a ``likelihoods_cov``
        attribute that is never set and an undefined ``multivariate_normal``, so it could
        not run; it now looks the row up through ``predict_proba``.

        ``noise``: falsy -> return the stored likelihoods; a float -> draw from a normal
        distribution around them with that value on the covariance diagonal; any other
        truthy value -> zero covariance.
        """
        row = np.asarray(x, dtype=float).reshape(1, -1)
        mean = self.predict_proba(row)[0]
        if not noise:
            return mean
        if isinstance(noise, float):
            cov = np.diag(np.ones((self.nclass,)) * noise)
        else:
            cov = np.diag(np.zeros((self.nclass,)))
        lh = np.abs(np.random.multivariate_normal(mean, cov))
        return lh / lh.sum()

    def predict(self, x, noise=False, normalize=False):
        """Collapse the class likelihoods of each row into one number.

        With ``normalize=False`` this is the dot product of the likelihoods with the class
        labels; with ``normalize=True`` it is their average weighted by the class labels.
        """
        if normalize:
            return np.average(self.predict_proba(x, noise), axis=1, weights=self.classes)
        else:
            return np.dot(self.predict_proba(x, noise), self.classes)

    def predict_proba(self, x, noise=False):
        """Return an (n_samples, nclass) array of class likelihoods for ``x``.

        Rows whose feature combination was not seen in ``fit`` (or whose stored likelihoods
        are all zero) receive the class priors. If ``noise`` is truthy, Gaussian noise is
        added: a float gives the standard deviation, otherwise ``self.noise`` is used.
        """
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)

        x = check_array(x)

        # Look every row up in the likelihood table via a left merge on the feature columns
        xx = pd.DataFrame(x, columns=self.likelihoods.columns[:-self.nclass])
        xx = xx.merge(self.likelihoods, how='left')
        xx = xx.drop(xx.columns[:-self.nclass], axis=1)
        xx.loc[xx.isnull().any(axis=1) | (xx == 0).all(axis=1), :] = self.class_priors

        if noise:
            np.random.seed(self.seed)
            _noise = noise if isinstance(noise, float) else self.noise
            if _noise > 1E-12:
                # `normal` was an undefined name; use NumPy's generator explicitly
                xx = np.abs(xx + np.random.normal(0, scale=_noise, size=xx.shape))
                xx = xx.div(xx.sum(axis=1), axis=0)

        return xx.values

class LikelihoodEncoder(BaseEstimator, TransformerMixin):
    """Likelihood (target) encoder: one ``LikelihoodEstimator`` is fitted per input column.

    Parameters
    ----------
    seed, alpha, leave_one_out, noise
        Passed unchanged to every per-column ``LikelihoodEstimator``.

    Attributes
    ----------
    nclass : int
        Number of distinct labels seen in ``fit``.
    estimators : list
        The fitted per-column estimators, in column order.
    """

    def __init__(self, seed=0, alpha=0, leave_one_out=False, noise=0):
        self.alpha = alpha
        self.noise = noise
        self.seed = seed
        self.leave_one_out = leave_one_out
        self.nclass = None
        self.estimators = []

    @staticmethod
    def _as_2d_array(x):
        """Convert ``x`` to a NumPy array with one column per feature."""
        # Convert first, so that pandas objects and lists can be reshaped as well
        x = np.asarray(x)
        return x.reshape(-1, 1) if x.ndim == 1 else x

    def fit(self, x, y):
        """Fit one likelihood estimator per column of ``x`` against the labels ``y``."""
        x = self._as_2d_array(x)
        self.nclass = np.unique(y).shape[0]

        # Rebuild the list from scratch: appending to the existing one meant that a second
        # fit() left the estimators of the first fit in front, and transform() used those.
        self.estimators = [
            LikelihoodEstimator(**self.get_params()).fit(x[:, i], y)
            for i in range(x.shape[1])
        ]
        return self

    def fit_transform(self, x, y):
        """Fit on ``(x, y)`` and return the encoded ``x``."""
        self.fit(x, y)
        return self.transform(x)

    def transform(self, x):
        """Return an (n_samples, n_columns) array holding each column's ``predict`` output."""
        x = self._as_2d_array(x)

        encoded_columns = [
            self.estimators[i].predict(x[:, i], noise=True).reshape(-1, 1)
            for i in range(x.shape[1])
        ]
        if not encoded_columns:
            # No columns in, no columns out
            return np.empty((x.shape[0], 0))
        return np.hstack(encoded_columns)

In [None]:
# Integer-encode the categorical columns (the likelihood estimator needs numeric input)
X_cat = X.copy()
for i in categorical_feature:
  X_cat[:, i] = LabelEncoder().fit_transform(X[:, i])

# Likelihood-encode the categorical columns only. The previous slice `X_cat[:, 1:]` dropped
# customerID but also fed tenure / MonthlyCharges / TotalCharges to the encoder: for a column
# with (nearly) one distinct value per row, the "likelihood" of a value is simply that row's
# own label, which leaks the target into the features.
# NOTE(review): the encoder is still fitted on all rows before cross-validation; fitting it
# inside each training fold would remove the remaining optimism.
le = LikelihoodEncoder()
X_likelihood = le.fit_transform(X_cat[:, categorical_feature], LabelEncoder().fit_transform(y))
X_likelihood[:3, :]

In [None]:
"""Lets Stack features """
# Side-by-side stack: count-encoded categoricals | FeatureUnion outputs | likelihood encodings
X_feature_stacking = np.concatenate((X_label_count, X_feature, X_likelihood), axis=1)

In [None]:
# Apply knn algorithm to predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

n_neighbors = 3 # Hyper parameter
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
acc_knn = []
# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  knn.fit(X_feature_stacking[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_knn.append(knn.score(X_feature_stacking[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], knn.predict(X_feature_stacking[test_indices, :])))

acc_knn = np.array(acc_knn)
print(" Knn with neighbors={0}, accuracy={1}, {2}".format(n_neighbors, acc_knn.mean(), acc_knn.std()))

In [None]:
# Apply Naive Bayes algorithm to predict

from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
acc_nb = []
# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  nb.fit(X_feature_stacking[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_nb.append(nb.score(X_feature_stacking[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only, using the Naive Bayes model itself
  # (it was previously computed from the KNN model of an earlier cell).
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], nb.predict(X_feature_stacking[test_indices, :])))

acc_nb = np.array(acc_nb)
# GaussianNB has no neighbours parameter, so only mean and std of the accuracy are reported
print(" NB accuracy={0}, {1}".format(acc_nb.mean(), acc_nb.std()))

In [None]:
# Apply SVM algo
from sklearn import svm

clf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
acc_svm = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf.fit(X_feature_stacking[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_svm.append(clf.score(X_feature_stacking[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf.predict(X_feature_stacking[test_indices, :])))

acc_svm = np.array(acc_svm)
print(" svm with kernel={0}, accuracy={1},{2}".format('rbf', acc_svm.mean(), acc_svm.std()))



In [None]:
from sklearn.ensemble import RandomForestClassifier

acc_rf = []
clf_rf = RandomForestClassifier(n_estimators=500, min_samples_split=5, random_state=42)

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  clf_rf.fit(X_feature_stacking[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_rf.append(clf_rf.score(X_feature_stacking[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf_rf.predict(X_feature_stacking[test_indices, :])))

acc_rf = np.array(acc_rf)
# Report mean and std of the fold accuracies (previously n_neighbors was printed as the accuracy).
print(" RF with n_estimators=500, accuracy={0},{1}".format(acc_rf.mean(), acc_rf.std()))


In [None]:
import xgboost as xgb
acc_xgb = []

# The fold counter replaces the old `display` flag, which shadowed IPython's display().
for fold, (train_indices, test_indices) in enumerate(training_test_split):
  # A fresh classifier with default settings is trained for every fold;
  # clf_xgb keeps the model of the last fold once the loop ends.
  clf_xgb = xgb.XGBClassifier().fit(X_feature_stacking[train_indices, :], y[train_indices])

  # Accuracy on the held-out fold
  acc_xgb.append(clf_xgb.score(X_feature_stacking[test_indices, :], y[test_indices]))

  # Print the confusion matrix for the first fold only.
  # Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
  if fold == 0:
    print("confusion metrics = ")
    print(confusion_matrix(y[test_indices], clf_xgb.predict(X_feature_stacking[test_indices, :])))

acc_xgb = np.array(acc_xgb)
print(" XGB with accuracy={0},{1}".format(acc_xgb.mean(), acc_xgb.std()))


**RF and XGB look very promising. Could it be that we have overfitted the data? Let's check!**

In [None]:
from sklearn.model_selection import train_test_split
X_feature_train, X_feature_test, y_train, y_test = train_test_split(X_feature_stacking, y, test_size=0.33, random_state=42)

In [None]:
# clf_rf was last fitted on a cross-validation fold of the very same data, so most rows of
# X_feature_test had been seen during training. Refit on the training portion only, so that
# the hold-out rows are unseen by the classifier.
# NOTE(review): the stacked features themselves were still engineered on all rows.
clf_rf.fit(X_feature_train, y_train)
# Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
print(confusion_matrix(y_test, clf_rf.predict(X_feature_test)))

In [None]:
import matplotlib.pyplot as plt
# roc_auc_score was used below without ever being imported (NameError)
from sklearn.metrics import roc_curve, roc_auc_score

# Encode the string labels as integers once and reuse them for both metrics
y_test_encoded = LabelEncoder().fit_transform(y_test)

# Column 1 of predict_proba is the score of the second class in sorted label order,
# which LabelEncoder maps to 1.
y_pred_rf = clf_rf.predict_proba(X_feature_test)
fpr_rf, tpr_rf, _ = roc_curve(y_test_encoded, y_pred_rf[:, 1])

auc = roc_auc_score(y_test_encoded, y_pred_rf[:, 1])

plt.figure(0)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.plot(fpr_rf, tpr_rf, label='RF AUC {}'.format(np.round(auc, 3)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [None]:
# xgb
# clf_xgb is the model of the last cross-validation fold, trained on rows that overlap with
# X_feature_test. Refit on the training portion only, so that the hold-out rows are unseen.
# NOTE(review): the stacked features themselves were still engineered on all rows.
clf_xgb.fit(X_feature_train, y_train)
# Argument order is (y_true, y_pred): rows = true class, columns = predicted class.
print(confusion_matrix(y_test, clf_xgb.predict(X_feature_test)))

In [None]:
import matplotlib.pyplot as plt
# roc_auc_score was used below without ever being imported (NameError)
from sklearn.metrics import roc_curve, roc_auc_score

# Encode the string labels as integers once and reuse them for both metrics
y_test_encoded = LabelEncoder().fit_transform(y_test)

# The *_rf names are reused from the RandomForest cell; here they hold the XGBoost results.
y_pred_rf = clf_xgb.predict_proba(X_feature_test)
fpr_rf, tpr_rf, _ = roc_curve(y_test_encoded, y_pred_rf[:, 1])

auc = roc_auc_score(y_test_encoded, y_pred_rf[:, 1])

plt.figure(0)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.plot(fpr_rf, tpr_rf, label='XG AUC {}'.format(np.round(auc, 3)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

**My hunch is that it is overfitting now; we need to add a regularizer :)**