In [1]:
# import necessary library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
import pickle
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)

In [2]:
# load data
data = pd.read_csv("Telco_Customer_Churn.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [3]:
# data backup
df = data.copy()
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# duplicates
df[df.duplicated()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [5]:
# dropping irrelevant columns
df.drop(columns='customerID',axis=1,inplace=True)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Change datatype of "total CHarges"
df['TotalCharges']= pd.to_numeric(df['TotalCharges'], errors ='coerce')
df['TotalCharges'].dtype

dtype('float64')

In [7]:
# null values
df.dropna(inplace=True)
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [8]:
# outliers
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


In [9]:
# numeric and non-numeric columns
non_numeric_columns = df.select_dtypes(include= ['object']).columns.to_list()
numeric_columns = df.select_dtypes(exclude= ['object']).columns.to_list()
print("numeric columns: ",numeric_columns)
print("non_numeric columns: ",non_numeric_columns)

numeric columns:  ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
non_numeric columns:  ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']


In [10]:
# data encoding
label = LabelEncoder()
df['Churn'] = label.fit_transform(df['Churn'])
df['Churn'].value_counts()

Churn
0    5163
1    1869
Name: count, dtype: int64

In [11]:
# setting feature and target
x = df.drop(columns= ['Churn'])
y = df['Churn']
non_numeric_columns.remove('Churn')

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(handle_unknown= 'ignore', sparse_output= False), non_numeric_columns)
])

# split dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)

# data preprocessing
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.fit_transform(x_test)

In [12]:
# model details
model_dict = {
    'logistic regression': LogisticRegression(max_iter= 1000),
    'SVC': SVC(probability= True),
    'random forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder= False, eval_matrics= 'logloss')
}

# hyperparameter details
search_space = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'n_estimator': [100, 200],
    'max_depth': [None, 10],
    'learning_rate': [0.5, 1]
}

# filter hyperparameter
def filter_hyperparameter(model, space):
    valid_key = model.get_params()
    param_grid = {k:v for k,v in space.items() if k in valid_key}
    return param_grid

In [13]:
# model training with grid search
result = []
i = 0
for name, model in model_dict.items():
    print(f'Tuning for {name}')
    param_grid = filter_hyperparameter(model, search_space)
    grid = GridSearchCV(estimator= model, param_grid= param_grid, cv= 5, scoring= 'accuracy', n_jobs= -1)
    grid.fit(x_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(x_test)
    report = metrics.classification_report(y_test, y_pred, output_dict= True)
    result.append({
        'model_name': name,
        'best_parameter': grid.best_params_,
        'accuracy': round(metrics.accuracy_score(y_test, y_pred), 4),
        'F1_score': round(report['weighted avg']['f1-score'], 4)
    })
    i+=1

Tuning for logistic regression
Tuning for SVC
Tuning for random forest
Tuning for AdaBoost
Tuning for XGBoost


In [14]:
# finding best model
model_df = pd.DataFrame(result)
model_df

Unnamed: 0,model_name,best_parameter,accuracy,F1_score
0,logistic regression,{'C': 10},0.7846,0.7787
1,SVC,"{'C': 1, 'kernel': 'linear'}",0.7946,0.7889
2,random forest,{'max_depth': 10},0.7925,0.7829
3,AdaBoost,{'learning_rate': 1},0.7868,0.779
4,XGBoost,"{'learning_rate': 0.5, 'max_depth': None}",0.7555,0.7491


In [15]:
# best model
model_df.sort_values(by= 'accuracy', ascending= False, inplace= True)
best_model = model_df.iloc[0]['model_name']
best_parameter = model_df.iloc[0]['best_parameter']
print("best model is:", best_model)
print("best parameter is:", best_parameter)

best model is: SVC
best parameter is: {'C': 1, 'kernel': 'linear'}


In [16]:
# final model
final_model = model_dict[best_model].set_params(**best_parameter)

# retrain model
x_processed = preprocessor.transform(x)
final_model.fit(x_processed, y)

In [17]:
# pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', final_model)
])

# save model
with open("model.pkl", 'wb') as f:
    pickle.dump(pipeline, f)