In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import cross_val_score

In [3]:
df=pd.read_csv('../data/cleaned.csv')
df

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,gender_Male
0,0,0,1,0,-1,0,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2,2,0,0,0,-1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,4,0,0,0,-1,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,0,1,1,0,1,1,0,0,0,...,0,1,0,1,1,0,0,0,1,1
7039,7039,0,1,1,1,1,1,1,2,0,...,0,1,0,1,1,0,1,0,0,0
7040,7040,0,1,1,0,0,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0
7041,7041,1,1,0,-1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1


In [4]:
df=df.drop(columns=['Unnamed: 0'], axis=1)

In [5]:
x = df.drop('Churn', axis=1)
y = df['Churn']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)

In [7]:
x_train_smote, y_train_smote=smote.fit_resample(x_train, y_train)

In [8]:
y_train_smote.value_counts()

Churn
0    4138
1    4138
Name: count, dtype: int64

MODEL TRAINING

In [9]:
 models={
     "Decision Tree": DecisionTreeClassifier(random_state=42),
     "Random Forest": RandomForestClassifier(random_state=42),
     "XGBoost": XGBClassifier(random_state=42)

 }

In [10]:
for name, model in models.items():
    scores = cross_val_score(model, x, y, cv=5, scoring='accuracy')
    print(f"{name} - Mean CV Accuracy: {scores.mean():.4f} | Individual Scores: {scores}")

Decision Tree - Mean CV Accuracy: 0.7287 | Individual Scores: [0.73811214 0.72888573 0.71682044 0.72088068 0.73863636]
Random Forest - Mean CV Accuracy: 0.7761 | Individual Scores: [0.79276082 0.77714691 0.7523066  0.77840909 0.77982955]
XGBoost - Mean CV Accuracy: 0.7789 | Individual Scores: [0.78637331 0.7707594  0.76437189 0.77840909 0.79474432]


In [11]:
scores

array([0.78637331, 0.7707594 , 0.76437189, 0.77840909, 0.79474432])

In [12]:
model=RandomForestClassifier(random_state=42)
model.fit(x_train_smote, y_train_smote)

In [13]:
y_pred=model.predict(x_test)

In [14]:
print('accuracy score', accuracy_score(y_pred, y_test))
print('confusion matrix', confusion_matrix(y_pred, y_test))
print('classification report', classification_report(y_pred, y_test))

accuracy score 0.7530163236337828
confusion matrix [[830 142]
 [206 231]]
classification report               precision    recall  f1-score   support

           0       0.80      0.85      0.83       972
           1       0.62      0.53      0.57       437

    accuracy                           0.75      1409
   macro avg       0.71      0.69      0.70      1409
weighted avg       0.74      0.75      0.75      1409



In [16]:
import pickle

with open("random_forest_churn_model.pkl", "wb") as file:
    pickle.dump(model, file)

print("Random Forest model saved as random_forest_churn_model.pkl")


Random Forest model saved as random_forest_churn_model.pkl
