# Model Training notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

In [2]:
df = pd.read_csv('data\Telco_customer_churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df = df.drop(columns=["customerID"])

In [5]:
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
df['TotalCharges'] = df['TotalCharges'].astype('Float64')


In [6]:

df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})

  df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})


In [7]:
# identifying the categorical columns
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Categorical Features:', categorical_features)

Categorical Features: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [8]:
# initialize a dictionary to save the encoders
encoders = {}

# apply label encoding and store the encoders
for column in categorical_features:
  label_encoder = LabelEncoder()
  df[column] = label_encoder.fit_transform(df[column])
  encoders[column] = label_encoder


# save the encoders to a pickle file
with open('label_encoder.pkl', 'wb') as f:
  pickle.dump(encoders, f)


In [9]:
df.head(2)
df.dropna(inplace=True)

In [10]:
## splitting the data into features and target variable
X = df.drop('Churn', axis=1)
y = df['Churn']

In [11]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7032, dtype: int64

In [12]:
# standard scaling the data mothnly charges, total charges, and tenure

In [13]:
# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
print(y_train.shape)

(5625,)


In [15]:
y_train.value_counts()

Churn
0    4130
1    1495
Name: count, dtype: int64

In [16]:
## SMOTE (SYNTHETIC MINORITY OVERSAMPLING TECHNIQUE) for handling class imbalance
smote = SMOTE(random_state=42) 


In [17]:
x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [18]:
print(y_train_smote.shape)

(8260,)


In [19]:
print(y_train_smote.value_counts())

Churn
1    4130
0    4130
Name: count, dtype: int64


In [20]:
# dictionary of models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [21]:
# dictionary to store the cross validation results
cv_scores = {}

# perform 5-fold cross validation for each model
for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  scores = cross_val_score(model, x_train_smote, y_train_smote, cv=5, scoring="accuracy")
  cv_scores[model_name] = scores
  print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
  print("-"*70)

Training Decision Tree with default parameters
Decision Tree cross-validation accuracy: 0.78
----------------------------------------------------------------------
Training Random Forest with default parameters
Random Forest cross-validation accuracy: 0.84
----------------------------------------------------------------------
Training XGBoost with default parameters
XGBoost cross-validation accuracy: 0.84
----------------------------------------------------------------------


In [22]:
cv_scores

{'Decision Tree': array([0.68825666, 0.72033898, 0.82324455, 0.83898305, 0.83716707]),
 'Random Forest': array([0.72760291, 0.79297821, 0.89164649, 0.89709443, 0.90072639]),
 'XGBoost': array([0.70762712, 0.77966102, 0.88922518, 0.89830508, 0.90072639])}

In [23]:
# Random Forest gives the highest accuracy compared to other models with default parameters
rfc = RandomForestClassifier(random_state=42)

In [24]:
rfc.fit(x_train_smote, y_train_smote)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
print(y_test.value_counts())

Churn
0    1033
1     374
Name: count, dtype: int64


In [26]:
# evaluate on test data
y_test_pred = rfc.predict(X_test)

print("Accuracy Score:\n", accuracy_score(y_test, y_test_pred))
print("Confsuion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Accuracy Score:
 0.7654584221748401
Confsuion Matrix:
 [[863 170]
 [160 214]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      1033
           1       0.56      0.57      0.56       374

    accuracy                           0.77      1407
   macro avg       0.70      0.70      0.70      1407
weighted avg       0.77      0.77      0.77      1407

