# Model Training notebook

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

In [53]:
df = pd.read_csv('data\Telco_customer_churn.csv')

In [69]:
df.head()
df.dropna(inplace=True)
df.isnull().sum()   

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [70]:
df = df.drop(columns=["customerID"])

KeyError: "['customerID'] not found in axis"

In [71]:
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
df['TotalCharges'] = df['TotalCharges'].astype('float64')


In [72]:

df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})

In [73]:
# identifying the categorical columns
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Categorical Features:', categorical_features)

Categorical Features: []


In [74]:

encoder = {}

for col in categorical_features:
    ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    df[[col]] = ord_enc.fit_transform(df[[col]])   # pass as 2D array
    encoder[col] = ord_enc   # store encoder for each column

# Save encoders dictionary
with open('ordinal_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)


In [75]:
df.head(2)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   float64
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   float64
 3   Dependents        7032 non-null   float64
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   float64
 6   MultipleLines     7032 non-null   float64
 7   InternetService   7032 non-null   float64
 8   OnlineSecurity    7032 non-null   float64
 9   OnlineBackup      7032 non-null   float64
 10  DeviceProtection  7032 non-null   float64
 11  TechSupport       7032 non-null   float64
 12  StreamingTV       7032 non-null   float64
 13  StreamingMovies   7032 non-null   float64
 14  Contract          7032 non-null   float64
 15  PaperlessBilling  7032 non-null   float64
 16  PaymentMethod     7032 non-null   float64
 17  

In [76]:
## splitting the data into features and target variable
X = df.drop('Churn', axis=1)
y = df['Churn']

In [77]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7032, dtype: int64

In [78]:
# standard scaling the data mothnly charges, total charges, and tenure
df.dropna(inplace=True)

In [79]:
# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
print(y_train.shape)



(5625,)


In [81]:
y_train.value_counts()

Churn
0    4130
1    1495
Name: count, dtype: int64

In [82]:
## SMOTE (SYNTHETIC MINORITY OVERSAMPLING TECHNIQUE) for handling class imbalance
smote = SMOTE(random_state=42) 


In [83]:
x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [84]:
print(y_train_smote.shape)

(8260,)


In [85]:
print(y_train_smote.value_counts())

Churn
1    4130
0    4130
Name: count, dtype: int64


In [86]:
# dictionary of models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [87]:
# dictionary to store the cross validation results
cv_scores = {}

# perform 5-fold cross validation for each model
for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  scores = cross_val_score(model, x_train_smote, y_train_smote, cv=5, scoring="accuracy")
  cv_scores[model_name] = scores
  print(f"{model_name} cross-validation accuracy: {np.mean(scores):.2f}")
  print("-"*70)

Training Decision Tree with default parameters
Decision Tree cross-validation accuracy: 0.79
----------------------------------------------------------------------
Training Random Forest with default parameters
Random Forest cross-validation accuracy: 0.85
----------------------------------------------------------------------
Training XGBoost with default parameters
XGBoost cross-validation accuracy: 0.84
----------------------------------------------------------------------


In [None]:
cv_scores

{'Decision Tree': array([0.65133172, 0.718523  , 0.8468523 , 0.87530266, 0.86924939]),
 'Random Forest': array([0.68523002, 0.76997579, 0.91949153, 0.92736077, 0.92675545]),
 'XGBoost': array([0.66707022, 0.7590799 , 0.90556901, 0.91464891, 0.93401937])}

In [None]:
# Random Forest gives the highest accuracy compared to other models with default parameters
rfc = RandomForestClassifier(random_state=42)

In [None]:
rfc.fit(x_train_smote, y_train_smote)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
print(y_test.value_counts())

Churn
0    1033
1     374
Name: count, dtype: int64


In [None]:
# evaluate on test data
y_test_pred = rfc.predict(X_test)

print("Accuracy Score:\n", accuracy_score(y_test, y_test_pred))
print("Confsuion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Accuracy Score:
 0.7796730632551528
Confsuion Matrix:
 [[909 124]
 [186 188]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85      1033
           1       0.60      0.50      0.55       374

    accuracy                           0.78      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407

