In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found beneath the input directory
# (walk order is preserved), then print them one per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Imports

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from category_encoders.one_hot import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, truncnorm, randint
import xgboost as xgb

Data Ingestion

In [None]:
dataset = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

Dataset Exploration

In [None]:
dataset.describe()

In [None]:
dataset.info()

In [None]:
dataset.shape

In [None]:
dataset.isnull().sum()

In [None]:
dataset.head()

Split train-test

In [None]:
# Separate the predictors from the `Churn` label, then hold out 10% of the rows.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across "Restart & Run All"; stratifying on the label keeps the
# class ratio of `Churn` the same in the train and test partitions.
features = dataset.drop(columns=['Churn']).copy(deep=True)
target = dataset['Churn'].copy(deep=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=42, stratify=target
)
# Re-attach the label so each partition can also be explored as a single frame.
trainset = pd.concat([X_train, y_train], axis=1)
testset = pd.concat([X_test, y_test], axis=1)
testset.head()

In [None]:
X_test.shape

In [None]:
X_train.shape

In [None]:
sns.countplot(x='Churn', data=trainset)

In [None]:
# 0/1 encoding of the target variable: 'No' -> 0, every other value -> 1.
churn_is_not_no = trainset['Churn'] != 'No'
trainset['Churn_encoded'] = churn_is_not_no.astype(int)

In [None]:
# Remove the raw string label column by rebinding to a frame without it.
trainset = trainset.drop(columns='Churn')

Explore multi-valued categorical columns (more levels than just "Yes"/"No")

In [None]:
trainset.columns

In [None]:
# Categorical columns that take more than two distinct values.
multi_cat_col = [
    'MultipleLines',
    'InternetService',
    'Contract',
    'PaymentMethod',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
# One count plot per multi-valued categorical column, with bars split by the
# encoded churn flag. The shared plot options are gathered once.
catplot_options = dict(kind='count', hue='Churn_encoded', data=trainset, aspect=1.6, height=6)
for column_name in multi_cat_col:
    sns.catplot(x=column_name, **catplot_options)

In [None]:
trainset['MultipleLines'].value_counts()

In [None]:
trainset['InternetService'].value_counts()

In [None]:
trainset['Contract'].value_counts()

In [None]:
trainset['PaymentMethod'].value_counts()

Process binary ("Yes"/"No") categorical columns

In [None]:
trainset.head()

In [None]:
binary_features = ["Partner", "Dependents", "PhoneService", "PaperlessBilling"]

In [None]:
def compute_binary_features(df, columns=None):
    """Encode 'No'/'Yes' columns of ``df`` as integer 0/1, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; it is mutated and also returned.
    columns : iterable of str, optional
        Columns to encode. Defaults to the module-level ``binary_features``
        list, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column cast to ``int``.

    Raises
    ------
    ValueError
        If a listed column holds a value that is neither 'No'/'Yes' nor
        convertible to ``int`` (raised by ``astype(int)``).
    """
    if columns is None:
        columns = binary_features
    yes_no_codes = {'No': 0, 'Yes': 1}
    for col in columns:
        # Assign the converted Series back to the frame. The earlier form,
        # `df[col].replace(..., inplace=True)`, mutated a temporary Series:
        # pandas warns about it and, under copy-on-write, `df` is left untouched.
        # Values outside the mapping pass through unchanged, so calling this on
        # an already-encoded frame is a no-op, as it was with `replace`.
        df[col] = df[col].map(lambda value: yes_no_codes.get(value, value)).astype(int)
    return df

In [None]:
# Encode the binary columns in both training frames, then preview the result.
for frame in (trainset, X_train):
    compute_binary_features(frame)
trainset.head()

In [None]:
# Encode gender as 0/1 in both training frames. The converted Series is assigned
# back to its column: `frame['gender'].replace(..., inplace=True)` operates on a
# temporary Series, which pandas warns about and which no longer updates the
# frame once copy-on-write is active. Values outside the mapping pass through
# unchanged, so re-running this cell leaves already-encoded data as it is; the
# explicit cast guarantees a numeric column regardless of pandas version.
gender_codes = {'Female': 0, 'Male': 1}
X_train['gender'] = X_train['gender'].map(lambda g: gender_codes.get(g, g)).astype(int)
trainset['gender'] = trainset['gender'].map(lambda g: gender_codes.get(g, g)).astype(int)

In [None]:
# Drop the rows whose TotalCharges string contains a space (presumably blank
# placeholders; such strings would not convert to float), then cast the column.
# `.loc[...]` followed by `.assign(...)` builds a fresh frame, so no column is
# written into a filtered slice (the pattern behind SettingWithCopyWarning).
has_blank_total = trainset['TotalCharges'].str.contains(' ')
trainset = trainset.loc[~has_blank_total].assign(
    TotalCharges=lambda frame: frame['TotalCharges'].astype('float')
)

In [None]:
# Apply the same TotalCharges cleanup to X_train: drop rows whose string
# contains a space, then cast to float. `.loc[...]` + `.assign(...)` returns a
# fresh frame, so no column is written into a filtered slice (the pattern
# behind SettingWithCopyWarning).
has_blank_total = X_train['TotalCharges'].str.contains(' ')
X_train = X_train.loc[~has_blank_total].assign(
    TotalCharges=lambda frame: frame['TotalCharges'].astype('float')
)

In [None]:
# Drop test rows whose TotalCharges string contains a space, then cast the
# column to float. `.loc[...]` + `.assign(...)` returns a fresh frame, so no
# column is written into a filtered slice (the pattern behind
# SettingWithCopyWarning).
has_blank_total = testset['TotalCharges'].str.contains(' ')
testset = testset.loc[~has_blank_total].assign(
    TotalCharges=lambda frame: frame['TotalCharges'].astype('float')
)
# Re-derive the label and the predictors so both match the filtered rows.
y_test = testset['Churn']
X_test = testset.drop(columns='Churn')

In [None]:
testset.head()

In [None]:
# Encode gender as 0/1 in the test predictors. The converted Series is assigned
# back to its column: `X_test['gender'].replace(..., inplace=True)` operates on
# a temporary Series, which pandas warns about and which no longer updates the
# frame once copy-on-write is active. Values outside the mapping pass through
# unchanged, so re-running this cell leaves already-encoded data as it is.
gender_codes = {'Female': 0, 'Male': 1}
X_test['gender'] = X_test['gender'].map(lambda g: gender_codes.get(g, g)).astype(int)
# Encode the remaining binary columns; the returned frame is the cell output.
compute_binary_features(X_test)

In [None]:
# Keep only the labels that contain no space, then encode them as a NumPy
# array: 'No' -> 0, every other value -> 1.
labels_without_space = y_test[~y_test.str.contains(' ')]
y_test = np.where(labels_without_space == 'No', 0, 1)

Rebuild `y_train` from `trainset`, since some rows were dropped from it during cleaning

In [None]:
y_train = trainset['Churn_encoded']

In [None]:
# Names of the numeric columns of trainset, as a plain list.
total_numerical_features = trainset.select_dtypes(include=np.number).columns.tolist()
# Sub-frame made of every non-numeric column (a DataFrame, not a list of names).
categorical_features = trainset.select_dtypes(exclude=np.number)

In [None]:
trainset['TotalCharges'].value_counts().unique()

Explore binary features

In [None]:
# Numeric columns with fewer than three distinct values are treated as binary
# flags and plotted against the encoded churn label.
# The list gets its own name: rebinding `binary_features` here would overwrite
# the column list that compute_binary_features() reads, so re-running an earlier
# cell afterwards would operate on the wrong columns.
binary_numeric_features = [
    feat for feat in total_numerical_features if len(trainset[feat].unique()) < 3
]
for feat in binary_numeric_features:
    sns.catplot(x=feat, kind='count', hue='Churn_encoded', data=trainset, aspect=1.6, height=6)

Explore numerical features

In [None]:
trainset['tenure'].value_counts()

In [None]:
trainset['MonthlyCharges'].value_counts()

In [None]:
trainset.head()

In [None]:
sns.distplot(trainset['tenure'])

In [None]:
sns.distplot(trainset['MonthlyCharges'])

In [None]:
sns.distplot(trainset['TotalCharges'])

In [None]:
trainset['Charges'] = trainset['TotalCharges'] - trainset['tenure'] * trainset['MonthlyCharges']

In [None]:
sns.distplot(trainset['Charges'])

In [None]:
continuous_numerical_features = [feat for feat in total_numerical_features if len(trainset[feat].unique()) > 3]

In [None]:
# Annotated correlation heatmap of the continuous numeric features.
f, ax = plt.subplots(figsize=(10, 8))
corr = trainset[continuous_numerical_features].corr()
# `np.bool` was only an alias of the builtin and has been removed from NumPy
# (1.24+), where it raises AttributeError; the builtin `bool` works on every
# version. The mask is all False, i.e. no cell of the matrix is hidden.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240, 10, as_cmap=True),
            square=True, annot=True, ax=ax)

In [None]:
# Horizontal box plots of tenure, one box per encoded churn class, drawn on an
# explicitly created Axes.
_, tenure_ax = plt.subplots(figsize=(12, 5))
tenure_ax.set_title("Box plot for tenure")
sns.boxplot(y="Churn_encoded", x="tenure", data=trainset, orient="h", palette='magma', ax=tenure_ax)

In [None]:
# Horizontal box plots of MonthlyCharges, one box per encoded churn class, drawn
# on an explicitly created Axes.
_, monthly_ax = plt.subplots(figsize=(12, 5))
monthly_ax.set_title("Box plot for Monthly Charges")
sns.boxplot(y="Churn_encoded", x="MonthlyCharges", data=trainset, orient="h", palette='magma', ax=monthly_ax)

In [None]:
# Horizontal box plots of TotalCharges, one box per encoded churn class, drawn
# on an explicitly created Axes.
_, total_ax = plt.subplots(figsize=(12, 5))
total_ax.set_title("Box plot for Total Charges")
sns.boxplot(y="Churn_encoded", x="TotalCharges", data=trainset, orient="h", palette='magma', ax=total_ax)

Encode all categorical features

In [None]:
# Exclude customerID (presumably a per-row identifier, not a category) from the
# frame of columns to encode, by rebinding to a frame without it.
categorical_features = categorical_features.drop(columns='customerID')
categorical_features

In [None]:
# One-hot encoder for the remaining categorical columns.
# `cols` is meant to be a list of column names, so pass the names of the
# categorical sub-frame rather than the DataFrame object itself.
ohe = OneHotEncoder(
    verbose=0,
    cols=list(categorical_features.columns),
    drop_invariant=False,
    return_df=True,
    use_cat_names=True,
)

In [None]:
trainset.head()

In [None]:
ohe.fit(X_train)

In [None]:
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)

In [None]:
X_test.shape

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
#X_train.drop(columns='customerID', inplace=True)
#X_test.drop(columns='customerID', inplace=True)

In [None]:
# Model inputs: every encoded column except customerID.
# The column is dropped by name instead of slicing off position 0, so the result
# stays correct if the column order ever changes, and a KeyError is raised
# (rather than a real feature being silently discarded) if it is absent.
features = X_train.columns.drop('customerID')
features

In [None]:
X_test.head()

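_Unused scratch snippet — this cell is not executed. It appears to be left over from a submission template (note the `Survived` output column and the `testset.customerId` attribute, which does not match the `customerID` column) and would need adapting before use:_

```python
from sklearn.ensemble import RandomForestClassifier

testset = X_test.copy(deep=True)
X = pd.get_dummies(X_train[features])
X_test = pd.get_dummies(X_test[features])
#columns, drop first

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1) 
model.fit(X,y_train) 
predictions = model.predict(X_test)

output = pd.DataFrame({'customerID': testset.customerId, 'Survived': predictions}) 
output.to_csv('my_submission.csv', index=False) 
print('Your submission was successfully saved!')
```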


In [None]:
X_train = X_train[features]
X_test = X_test[features]

In [None]:
# Candidate values for the randomized random-forest search.
n_estimators_1 = [300]
# Tree depths 1..20 inclusive.
max_depth_1 = list(range(1, 21))
min_samples_split_1 = [2, 5, 10]
min_samples_leaf_1 = [1, 2, 4]
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so it duplicated an existing option; 'log2' gives the
# search a genuinely different second choice and keeps the grid the same size.
max_features_1 = ['sqrt', 'log2']
# Single-valued entry passed through the search so each forest uses n_jobs=-1.
n_jobs = [-1]

In [None]:
# Parameter distributions for the randomized search, keyed by the estimator's
# parameter names.
model_params_1 = dict(
    n_estimators=n_estimators_1,
    max_depth=max_depth_1,
    min_samples_split=min_samples_split_1,
    min_samples_leaf=min_samples_leaf_1,
    max_features=max_features_1,
    n_jobs=n_jobs,
)

In [None]:
rf_model_1 = RandomForestClassifier(random_state=42, class_weight="balanced")

In [None]:
clf_1 = RandomizedSearchCV(rf_model_1, model_params_1, n_iter=50, cv=3, random_state=1)
tuned_model_1 = clf_1.fit(X_train, y_train)

In [None]:
from pprint import pprint
pprint(tuned_model_1.best_estimator_.get_params())

In [None]:
y_pred_test_rf_tuned_1 = tuned_model_1.predict(X_test)
y_pred_train_rf_tuned_1 = tuned_model_1.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred_train_rf_tuned_1))

In [None]:
confusion_matrix(y_train, y_pred_train_rf_tuned_1)

In [None]:
print(classification_report(y_test, y_pred_test_rf_tuned_1))

In [None]:
confusion_matrix(y_test, y_pred_test_rf_tuned_1)

In [None]:
# Candidate values for the grid search: three forest sizes and depths 1..20,
# with the remaining parameters each pinned to a single value.
n_estimators = [300, 295, 305]
max_depth = list(range(1, 21))
min_samples_split = [5]
min_samples_leaf = [1]
max_features = ['sqrt']

In [None]:
# Parameter grid for the grid search, keyed by the estimator's parameter names.
# `n_jobs` is the list defined alongside the first search space.
model_params_2 = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
    n_jobs=n_jobs,
)

In [None]:
rf_model_2 = RandomForestClassifier(random_state=42, class_weight="balanced")

In [None]:
clf_2 = GridSearchCV(rf_model_2, model_params_2, cv=3)
tuned_model_2 = clf_2.fit(X_train, y_train)

In [None]:
from pprint import pprint
pprint(tuned_model_2.best_estimator_.get_params())

In [None]:
y_pred_test_rf_tuned_2 = tuned_model_2.predict(X_test)
y_pred_train_rf_tuned_2 = tuned_model_2.predict(X_train)

In [None]:
print(classification_report(y_train, y_pred_train_rf_tuned_2))

In [None]:
print(classification_report(y_test, y_pred_test_rf_tuned_2))

XGB Model with no hyperparameter tuning

In [None]:
xgb_model_1 = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

In [None]:
xgb_model_1.fit(X_train, y_train)

In [None]:
y_pred_train_xgb_1 = xgb_model_1.predict(X_train)
y_pred_test_xgb_1 = xgb_model_1.predict(X_test)

In [None]:
print(classification_report(y_train, y_pred_train_xgb_1))

In [None]:
print(classification_report(y_test, y_pred_test_xgb_1))