In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../../../datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

X = df.drop('Churn', axis=1)
y = df['Churn']

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Missing Values

In [5]:
X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors='coerce')
X['TotalCharges'].fillna(X['TotalCharges'].median(), inplace=True)

# Create new features

In [6]:
X['AvgMonthlySpend'] = X['TotalCharges'] / (X['tenure'] + 1)
X['tenure_group'] = np.where(X['tenure'] <= 12, 'Short', np.where(X['tenure'] <= 36, 'Medium', 'Long'))

# Binning

In [7]:
X['tenure_group'] = np.where(X['tenure'] <= 12, 'Short', np.where(
    X['tenure'] <= 36, 'Medium', 'Long'))

# Encoding

In [8]:
X['gender_enc'] = X['gender'].map({'Male': 0, 'Female': 1})

X['PhoneService_enc'] = X['PhoneService'].map({'No': 0, 'Yes': 1})

X['Contract_enc'] = X['Contract'].map({
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2
})
X['MultipleLines_enc'] = X['MultipleLines'].map({
    'Yes': 1,
    'No': 0,
    'No phone service': 0})

X['InternetService_enc'] = X['InternetService'].map({
    'No': 0,
    'DSL': 1,
    'Fiber optic': 2})

X['OnlineSecurity_enc'] = X['OnlineSecurity'].map({
    'Yes': 1,
    'No': 0,
    'No internet service': 0})

X['TechSupport_enc'] = X['TechSupport'].map({
    'Yes': 1,
    'No': 0,
    'No internet service': 0})

X['PaymentMethod_enc'] = X['PaymentMethod'].map({
    'Electronic check': 0,
    'Mailed check': 0,
    'Bank transfer (automatic)': 1,
    'Credit card (automatic)': 1})

X['PaperlessBilling_enc'] = X['PaperlessBilling'].map({'No': 0, 'Yes': 1})
X['tenure_group_enc'] = X['tenure_group'].map({
    'Short': 0,
    'Medium': 1,
    'Long': 2})

In [9]:
X.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlySpend',
       'tenure_group', 'gender_enc', 'PhoneService_enc', 'Contract_enc',
       'MultipleLines_enc', 'InternetService_enc', 'OnlineSecurity_enc',
       'TechSupport_enc', 'PaymentMethod_enc', 'PaperlessBilling_enc',
       'tenure_group_enc'],
      dtype='object')

In [10]:
selected_columns = ['gender_enc','SeniorCitizen','tenure_group_enc','PhoneService_enc',
                   'Contract_enc','MultipleLines_enc','OnlineSecurity_enc','TechSupport_enc',
                   'PaymentMethod_enc','PaperlessBilling_enc','InternetService_enc', 'AvgMonthlySpend','TotalCharges']
X = X[selected_columns]

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

# Feature Scaling

In [12]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Building

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [14]:
params = {'max_depth': range(1,6),'min_samples_leaf': range(2,8),'max_features': range(2,8),'min_samples_split': range(2,6)}
grid_cv = GridSearchCV(DecisionTreeClassifier(), param_grid = params, cv= 5,verbose=1)
grid_cv.fit(X_train,y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


0,1,2
,estimator,DecisionTreeClassifier()
,param_grid,"{'max_depth': range(1, 6), 'max_features': range(2, 8), 'min_samples_leaf': range(2, 8), 'min_samples_split': range(2, 6)}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,6
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
grid_cv.best_params_

{'max_depth': 5,
 'max_features': 6,
 'min_samples_leaf': 2,
 'min_samples_split': 5}

In [16]:
params_1 = {'max_depth': [3, 5, 10],'min_samples_leaf': [1, 2, 5],'n_estimators': [50, 100]}
grid_cv1 = GridSearchCV(RandomForestClassifier(random_state=1),param_grid=params_1,cv=5,n_jobs=-1,verbose=1)
grid_cv1.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


0,1,2
,estimator,RandomForestC...andom_state=1)
,param_grid,"{'max_depth': [3, 5, ...], 'min_samples_leaf': [1, 2, ...], 'n_estimators': [50, 100]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
grid_cv1.best_params_

{'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 50}

In [18]:
models = {
    'LogReg': LogisticRegression(),
    'SVC_Lin': SVC(kernel='linear'),
    'SVC_poly2': SVC(kernel='poly', degree=2),
    'SVC_poly3': SVC(kernel='poly', degree=3),
    'SVC_rbf': SVC(kernel='rbf'),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'DecT': DecisionTreeClassifier( random_state=1,max_depth=4,max_features=5,min_samples_leaf=3,min_samples_split=5),
    'RanFor': RandomForestClassifier(random_state=1,max_depth=10, min_samples_leaf=5,n_estimators=50)}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    score = model.score(X_train_scaled, y_train)
    print(name, score)

LogReg 0.7974252177205604
SVC_Lin 0.7960999621355547
SVC_poly2 0.7951533510034078
SVC_poly3 0.8015903067020068
SVC_rbf 0.8042408178720182
KNN 0.8653918970087088
DecT 0.7589928057553957
RanFor 0.8373722074971601


In [19]:
best_model = RandomForestClassifier(random_state=1,
    max_depth=10,
    min_samples_leaf=2,
    n_estimators=100)
best_model.fit(X_train_scaled, y_train)
y_test_pred = best_model.predict(X_test_scaled)

metrics = {
    "accuracy": accuracy_score(y_test, y_test_pred),
    "precision": precision_score(y_test, y_test_pred, pos_label="Yes"),
    "recall": recall_score(y_test, y_test_pred, pos_label="Yes"),
    "f1_score": f1_score(y_test, y_test_pred, pos_label="Yes"),
    "confusion_matrix": confusion_matrix(y_test, y_test_pred).tolist()
}

print("Evaluation Metrics:")
for k, v in metrics.items():
    print(k, ":", v)

Evaluation Metrics:
accuracy : 0.8126064735945485
precision : 0.7069486404833837
recall : 0.5010706638115632
f1_score : 0.5864661654135338
confusion_matrix : [[1197, 97], [233, 234]]


In [20]:
import pickle
scaler.fit(X)
X_scaled = scaler.transform(X)

final_model = RandomForestClassifier(
    random_state=1,
    max_depth=10,
    min_samples_leaf=2,
    n_estimators=100
)

final_model.fit(X_scaled, y)
pickle.dump(final_model, open("final_model.pkl", "wb"))
pickle.dump(scaler, open("scaler.pkl", "wb"))
pickle.dump(metrics, open("metrics.pkl", "wb"))

In [21]:
sample_input = {
    'gender_enc': 1,
    'SeniorCitizen': 0,
    'tenure_group_enc': 1,
    'PhoneService_enc': 1,
    'Contract_enc': 0,
    'MultipleLines_enc': 0,
    'OnlineSecurity_enc': 1,
    'TechSupport_enc': 0,
    'PaymentMethod_enc': 0,
    'PaperlessBilling_enc': 1,
    'InternetService_enc': 2,
    'AvgMonthlySpend': 1200 / (22 + 1),
    'TotalCharges': 1200
}

Q = pd.DataFrame([sample_input])
Q_scaled = scaler.transform(Q)

prediction = final_model.predict(Q_scaled)[0]
print("Prediction:", prediction)

Prediction: No


In [22]:
X.isna().sum().sort_values(ascending=False)


gender_enc              0
SeniorCitizen           0
tenure_group_enc        0
PhoneService_enc        0
Contract_enc            0
MultipleLines_enc       0
OnlineSecurity_enc      0
TechSupport_enc         0
PaymentMethod_enc       0
PaperlessBilling_enc    0
InternetService_enc     0
AvgMonthlySpend         0
TotalCharges            0
dtype: int64

In [None]:
df = pd.read_csv('../../../datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv')

X = df.drop('Churn', axis=1)
y = df['Churn']
X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors='coerce')
X['TotalCharges'].fillna(X['TotalCharges'].median(), inplace=True)
X['AvgMonthlySpend'] = X['TotalCharges'] / (X['tenure'] + 1)
X['tenure_group'] = np.where(X['tenure'] <= 12, 'Short', np.where(X['tenure'] <= 36, 'Medium', 'Long'))
X['tenure_group'] = np.where(X['tenure'] <= 12, 'Short', np.where(
    X['tenure'] <= 36, 'Medium', 'Long'))
X['gender_enc'] = X['gender'].map({'Male': 0, 'Female': 1})

X['PhoneService_enc'] = X['PhoneService'].map({'No': 0, 'Yes': 1})

X['Contract_enc'] = X['Contract'].map({
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2
})
X['MultipleLines_enc'] = X['MultipleLines'].map({
    'Yes': 1,
    'No': 0,
    'No phone service': 0})

X['InternetService_enc'] = X['InternetService'].map({
    'No': 0,
    'DSL': 1,
    'Fiber optic': 2})

X['OnlineSecurity_enc'] = X['OnlineSecurity'].map({
    'Yes': 1,
    'No': 0,
    'No internet service': 0})

X['TechSupport_enc'] = X['TechSupport'].map({
    'Yes': 1,
    'No': 0,
    'No internet service': 0})

X['PaymentMethod_enc'] = X['PaymentMethod'].map({
    'Electronic check': 0,
    'Mailed check': 0,
    'Bank transfer (automatic)': 1,
    'Credit card (automatic)': 1})

X['PaperlessBilling_enc'] = X['PaperlessBilling'].map({'No': 0, 'Yes': 1})
X['tenure_group_enc'] = X['tenure_group'].map({
    'Short': 0,
    'Medium': 1,
    'Long': 2})
selected_columns = ['gender_enc','SeniorCitizen','tenure_group_enc','PhoneService_enc',
                   'Contract_enc','MultipleLines_enc','OnlineSecurity_enc','TechSupport_enc',
                   'PaymentMethod_enc','PaperlessBilling_enc','InternetService_enc', 'AvgMonthlySpend','TotalCharges']
X = X[selected_columns]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {
    'LogReg': LogisticRegression(),
    'SVC_Lin': SVC(kernel='linear'),
    'SVC_poly2': SVC(kernel='poly', degree=2),
    'SVC_poly3': SVC(kernel='poly', degree=3),
    'SVC_rbf': SVC(kernel='rbf'),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'DecT': DecisionTreeClassifier( random_state=1,max_depth=4,max_features=5,min_samples_leaf=3,min_samples_split=5),
    'RanFor': RandomForestClassifier(random_state=1,max_depth=10, min_samples_leaf=5,n_estimators=50)}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    score = model.score(X_train_scaled, y_train)
    print(name, score)
    best_model = RandomForestClassifier(random_state=1,
    max_depth=10,
    min_samples_leaf=2,
    n_estimators=100)
best_model.fit(X_train_scaled, y_train)
print("Test accuracy:", best_model.score(X_test_scaled, y_test))
import pickle
scaler.fit(X)
X_scaled = scaler.transform(X)

final_model = RandomForestClassifier(
    random_state=1,
    max_depth=10,
    min_samples_leaf=2,
    n_estimators=100
)

final_model.fit(X_scaled, y)
pickle.dump(final_model, open("final_model.pkl", "wb"))
pickle.dump(scaler, open("scaler.pkl", "wb"))
sample_input = {
    'gender_enc': 1,
    'SeniorCitizen': 0,
    'tenure_group_enc': 1,
    'PhoneService_enc': 1,
    'Contract_enc': 0,
    'MultipleLines_enc': 0,
    'OnlineSecurity_enc': 1,
    'TechSupport_enc': 0,
    'PaymentMethod_enc': 0,
    'PaperlessBilling_enc': 1,
    'InternetService_enc': 2,
    'AvgMonthlySpend': 1200 / (22 + 1),
    'TotalCharges': 1200
}

Q = pd.DataFrame([sample_input])
Q_scaled = scaler.transform(Q)

prediction = final_model.predict(Q_scaled)[0]
print("Prediction:", prediction)