In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path is relative to the notebook's working directory
csvfile = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
# Load the raw customer table and preview its first rows
df = pd.read_csv(csvfile)
df.head()

In [None]:
# Column names, non-null counts and dtypes of the freshly loaded frame
df.info()

In [None]:
# Summary of TotalCharges as loaded, i.e. before the pd.to_numeric conversion further down
df['TotalCharges'].describe()

In [None]:
# Look at the row labelled 488 before TotalCharges is converted to numeric
df.loc[488]

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns entries that cannot be parsed into NaN
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [None]:
# Same row again, now after the pd.to_numeric(..., errors='coerce') conversion
df.loc[488]

In [None]:
# List the column labels
df.columns

In [None]:
# Discard every row that still contains a missing value, then show the result
df = df.dropna()
df

In [None]:
# (rows, columns) remaining after the dropna step
df.shape

In [None]:
# Remove the customerID column.
# Rebinding with errors='ignore' keeps this cell re-runnable: the previous in-place drop
# raised KeyError on a second execution because the column was already gone.
df = df.drop(columns='customerID', errors='ignore')


In [None]:
# Display the frame after the customerID column was dropped
df

In [None]:
# Start from every column label; the numeric columns are removed from this list in the next step
categ_feats = df.columns.tolist()
categ_feats

In [None]:
# Columns that are numeric and must not be one-hot encoded
list_to_remove =['TotalCharges','MonthlyCharges','tenure']
# Filter while preserving the original column order. A set difference yields a different
# order from run to run (string hashing is randomised), which made the encoded column order
# non-reproducible. The Churn column stays in the list on purpose: later cells rely on the
# Churn_No / Churn_Yes columns produced by encoding it.
categ_feats = [col for col in categ_feats if col not in list_to_remove]
categ_feats

In [None]:
# One-hot encode every column named in categ_feats; each is replaced by <column>_<value> indicator columns
# NOTE(review): df is overwritten here, so re-running only this cell fails because the source columns no longer exist
df = pd.get_dummies(df ,columns=categ_feats)
df

In [None]:
# Display the encoded frame (same object the previous cell already displayed)
df

In [None]:
# dtypes after encoding; the numeric_feats selection further down keys off these dtypes
df.info()

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Names of the columns whose dtype is int64 or float64
numeric_feats = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_feats

In [None]:
# Separate frame holding only the numeric columns, in numeric_feats order
df_numeric_feats = df[numeric_feats].copy()
df_numeric_feats.head()

In [None]:
# Everything that is not a numeric column, i.e. the one-hot indicator columns
df_categ_feats = df[[col for col in df.columns if col not in numeric_feats]]
df_categ_feats.head()

In [None]:
# Rescale every numeric column with MinMaxScaler (default feature range) and wrap the
# resulting array back into a DataFrame that carries the original column names and row labels.
# NOTE(review): the scaler is fitted on all rows, before the train/test split further down,
# so test-set minima/maxima influence the scaling - consider fitting on the training rows only.
min_max_scaler = preprocessing.MinMaxScaler()
normalized_numeric_feats = pd.DataFrame(
    min_max_scaler.fit_transform(df_numeric_feats),
    columns=numeric_feats,
    index=df_categ_feats.index,
)

In [None]:
# Display the rescaled numeric columns
normalized_numeric_feats

In [None]:
# Put the indicator columns and the rescaled numeric columns side by side (both share df's row labels)
df_numeric_norm = df_categ_feats.join(normalized_numeric_feats)
df_numeric_norm.head()

In [None]:
# df2 is a second name bound to the same DataFrame object as df_numeric_norm (no copy is made)
df2 = df_numeric_norm

In [None]:
# Column labels of the modelling frame; Churn_No / Churn_Yes are split off in the next cell
df2.columns

In [None]:
# Features are all columns except the two one-hot target columns; the label is Churn_Yes.
# 80/20 split with a fixed seed. NOTE(review): no stratify= argument is passed.
target_cols = ['Churn_No', 'Churn_Yes']
X_normalized = df2.drop(columns=target_cols)
y = df2['Churn_Yes']
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=21
)

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# k-NN on the normalized features: 5-fold grid search over n_neighbors = 1..29
param_grid = dict(n_neighbors=np.arange(1, 30))
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X_train, y_train)

print('k-NN best n_neighbors:', knn_cv.best_params_, '\n')


In [None]:
# Predict the held-out rows with the fitted k-NN grid search
y_pred_knn_test = knn_cv.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix , classification_report
# Confusion matrix as a labelled table: rows are the true classes, columns the predicted ones
knn_cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred_knn_test),
    index=['actual: no churn', 'actual: churn'],
    columns=['pred: no churn', 'pred: churn'],
)
print('k-NN test set confusion matrix:')
print(knn_cm, '\n')

# Per-class precision / recall / F1 on the held-out rows
print('k-NN test set classification report:')
print(classification_report(y_test, y_pred_knn_test))



In [None]:
# Logistic Regression classifier (L1 regularization)
from sklearn.linear_model import LogisticRegression
# 5-fold grid search over the inverse regularisation strength C = 0.5, 1.0, ..., 4.5
param_grid_L1 = dict(C=np.arange(.5, 5, .5))
# liblinear is passed explicitly as the solver for the L1 penalty
logreg_L1 = LogisticRegression(penalty='l1', solver="liblinear")
logreg_L1_cv = GridSearchCV(estimator=logreg_L1, param_grid=param_grid_L1, cv=5)

logreg_L1_cv.fit(X_train, y_train)
print('Lasso Reg best C value', logreg_L1_cv.best_params_ ,'\n')

In [None]:
# Predict the held-out rows with the fitted L1 logistic-regression grid search
y_pred_L1_test = logreg_L1_cv.predict(X_test)

# Confusion matrix as a labelled table: rows are the true classes, columns the predicted ones
lasso_cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred_L1_test),
    index=['actual: no churn', 'actual: churn'],
    columns=['pred: no churn', 'pred: churn'],
)
print('Lasso Reg test set confusion matrix:')
print(lasso_cm, '\n')
print('Lasso Reg test set classification report:')
print(classification_report(y_test, y_pred_L1_test))

In [None]:
# Random Forest classifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


# Full search space, kept for reference; it is NOT what gets searched below.
# 'auto' is no longer an accepted max_features value in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so it is replaced by 'log2'.
param_grid_rf = {'n_estimators': np.arange(10, 2000, 10),
                 'max_features': ['sqrt', 'log2'],
                 'max_depth': np.arange(10, 200, 10),
                 'criterion': ['gini', 'entropy'],
                 'bootstrap': [True, False]}
# A single fixed configuration that is evaluated instead of sampling param_grid_rf.
param_good ={'n_estimators': 50, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'gini', 'bootstrap': True}

# RandomizedSearchCV needs a list (or a distribution) per parameter; the bare scalars in
# param_good are rejected when the search runs. Wrap each value in a one-element list so
# exactly this configuration is cross-validated.
param_good_grid = {name: [value] for name, value in param_good.items()}

# Seed both the forest and the sampler so repeated runs give the same score.
rf = RandomForestClassifier(random_state=21)
# n_iter=1 because the wrapped grid contains a single candidate.
# To run a real randomized search, pass param_grid_rf and a larger n_iter instead.
rf_random_grid = RandomizedSearchCV(rf, param_good_grid, n_iter=1, cv=5, random_state=21)
rf_random_grid.fit(X_train, y_train)

# Predict the held-out rows with the refitted forest
y_pred_rf_test = rf_random_grid.predict(X_test)



In [None]:

# Chosen parameters and their mean cross-validated score
rf_cv_score = rf_random_grid.best_score_.round(3)
print('Tuned Random Forest Params:', rf_random_grid.best_params_, '\n')
print('Tuned Random Forest score is {}.'.format(rf_cv_score), '\n')

# Confusion matrix as a labelled table: rows are the true classes, columns the predicted ones
rf_cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred_rf_test),
    index=['actual: no churn', 'actual: churn'],
    columns=['pred: no churn', 'pred: churn'],
)
print('Tuned Random Forest test set confusion matrix:')
print(rf_cm, '\n')
print('Tuned Random Forest test set classification report:')
print(classification_report(y_test, y_pred_rf_test), '\n')



In [None]:

# using oversampling and undersampling to address data imbalance

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Maps each estimator class to the short tag used in the global names that oversampling() / undersampling() create;
# the classes are instantiated with their default arguments (key()) inside those functions
classifiers_dict = {RandomForestClassifier:'rf', LogisticRegression:'lr', KNeighborsClassifier:'knn'}

# function that builds pipeline for each classifier using RandomOverSampler
def oversampling(classifiers_dict):
    """Fit one over-sampled pipeline per classifier and publish the results as globals.

    Parameters
    ----------
    classifiers_dict : dict
        Maps an estimator class (instantiated here with default arguments) to the
        short tag used in the generated names.

    Side effects
    ------------
    Reads ``X_train``, ``y_train`` and ``X_test`` from module scope and, for every
    tag, creates the module-level names ``over_<tag>_pipeline``, ``over_<tag>_model``,
    ``y_pred_over_<tag>_test``, ``X_over_<tag>_resample`` and ``y_over_<tag>_resample``.
    Returns nothing.
    """
    for key, value in classifiers_dict.items():
        prefix = 'over_' + value

        pipeline = make_pipeline_imb(RandomOverSampler(random_state=4), key())
        globals()[prefix + '_pipeline'] = pipeline
        # fit() returns the fitted pipeline, which is what gets stored as the model
        globals()[prefix + '_model'] = pipeline.fit(X_train, y_train)

        globals()['y_pred_' + prefix + '_test'] = globals()[prefix + '_model'].predict(X_test)

        # fit_sample has been removed from imbalanced-learn; fit_resample is the supported name.
        # The sampler is seeded like the one inside the pipeline so the stored data is reproducible.
        X_resampled, y_resampled = RandomOverSampler(random_state=4).fit_resample(X_train, y_train)
        globals()['X_' + prefix + '_resample'] = X_resampled
        # The y key previously lacked the classifier tag, so every iteration overwrote
        # the same 'y_over__resample' global.
        globals()['y_' + prefix + '_resample'] = y_resampled
        # Old (untagged) name kept so code written against it keeps working.
        globals()['y_over__resample'] = y_resampled

# function that builds pipeline for each classifier using RandomUnderSampler
def undersampling(classifiers_dict):
    """Fit one under-sampled pipeline per classifier and publish the results as globals.

    Parameters
    ----------
    classifiers_dict : dict
        Maps an estimator class (instantiated here with default arguments) to the
        short tag used in the generated names.

    Side effects
    ------------
    Reads ``X_train``, ``y_train`` and ``X_test`` from module scope and, for every
    tag, creates the module-level names ``under_<tag>_pipeline``, ``under_<tag>_model``,
    ``y_pred_under_<tag>_test``, ``X_under_<tag>_resample`` and ``y_under_<tag>_resample``.
    Returns nothing.
    """
    for key, value in classifiers_dict.items():
        prefix = 'under_' + value

        pipeline = make_pipeline_imb(RandomUnderSampler(random_state=4), key())
        globals()[prefix + '_pipeline'] = pipeline
        # fit() returns the fitted pipeline, which is what gets stored as the model
        globals()[prefix + '_model'] = pipeline.fit(X_train, y_train)

        globals()['y_pred_' + prefix + '_test'] = globals()[prefix + '_model'].predict(X_test)

        # fit_sample has been removed from imbalanced-learn; fit_resample is the supported name.
        # The sampler is seeded like the one inside the pipeline so the stored data is reproducible.
        X_resampled, y_resampled = RandomUnderSampler(random_state=4).fit_resample(X_train, y_train)
        globals()['X_' + prefix + '_resample'] = X_resampled
        # The y key previously lacked the classifier tag, so every iteration overwrote
        # the same 'y_under__resample' global.
        globals()['y_' + prefix + '_resample'] = y_resampled
        # Old (untagged) name kept so code written against it keeps working.
        globals()['y_under__resample'] = y_resampled

# Run both resampling experiments; each call stores its pipelines, predictions and resampled data in module-level globals
oversampling(classifiers_dict)
undersampling(classifiers_dict)