In [None]:
import pandas as pd 
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score


In [None]:
# Load the Telco customer churn dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Telco-customer-churn.csv")

In [None]:
# Preview only the first few rows (head() defaults to 5). Dumping hundreds of
# rows bloats the notebook without adding information.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,0230-WEQUW,Male,0,1,0,66,0,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,1,Bank transfer (automatic),56.60,3789.2,0
218,2040-LDIWQ,Male,0,1,1,65,1,Yes,DSL,No,...,Yes,Yes,Yes,Yes,Two year,1,Bank transfer (automatic),84.20,5324.5,0
219,6496-JDSSB,Female,0,0,0,8,1,No,Fiber optic,No,...,No,No,Yes,No,Month-to-month,1,Bank transfer (automatic),80.00,624.6,0
220,9408-SSNVZ,Female,0,0,0,4,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.15,268.35,1


In [None]:
# Per-column overview: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [None]:
# Count fully duplicated rows before removing them. The count is stored and
# placed last in the cell so the notebook actually displays it: a bare
# expression that is not the final statement of a cell is silently discarded.
n_duplicate_rows = int(df.duplicated().sum())

# Rebind instead of using inplace=True. The first occurrence of each duplicate
# is kept and the original index labels are preserved.
df = df.drop_duplicates()

n_duplicate_rows


In [None]:
# SeniorCitizen is loaded as an int64 0/1 flag; treat it as a categorical
# variable rather than as a numeric quantity.
df['SeniorCitizen'] = df['SeniorCitizen'].astype('category')

# Show the dtypes last so the cell displays them (a bare expression that is not
# the final statement of a cell produces no output) and the conversion above
# can be verified at a glance.
df.dtypes


In [None]:
# List the distinct values of every string (object) column to spot
# inconsistent labels before encoding.
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    print(name, df[name].unique())


customerID ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender ['Female' 'Male']
Partner ['Yes' 'No']
Dependents ['No' 'Yes']
PhoneService ['No' 'Yes']
MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month' 'One year' 'Two year']
PaperlessBilling ['Yes' 'No']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
TotalCharges ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn ['No' 'Yes']


In [None]:
# Remove leading/trailing whitespace from the column labels ...
df.columns = df.columns.str.strip()

# ... and from the values of every string (object) column.
# NOTE(review): TotalCharges is still object dtype at this point (its values are
# numeric strings) -- confirm it is converted to a number before modelling.
text_columns = df.select_dtypes(include='object').columns
for name in text_columns:
    df[name] = df[name].str.strip()


In [None]:
# Columns whose only values are 'Yes' / 'No'.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

# Encode Yes/No as 1/0. The identity entries (1 -> 1, 0 -> 0) make this cell
# safe to re-run: Series.map turns every value that is missing from the dict
# into NaN, so without them a second execution would replace the already
# encoded columns with NaN.
yes_no_to_int = {'Yes': 1, 'No': 0, 1: 1, 0: 0}

for col in binary_cols:
    df[col] = df[col].map(yes_no_to_int)


In [None]:
# Preview the first rows after the Yes/No columns were mapped to 1/0.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [None]:
# Categorical columns with more than two levels; each is expanded into
# indicator (dummy) columns below.
# NOTE(review): customerID, gender and TotalCharges are not in this list and are
# left untouched by this cell -- confirm they are handled before modelling.
multi_cat_cols = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# drop_first=True omits the indicator of the first level of each column.
df = pd.get_dummies(data=df, columns=multi_cat_cols, drop_first=True)


In [None]:
# Inspect the last rows to check the indicator columns created by get_dummies.
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
7038,6840-RESVB,Male,0,1,1,24,1,1,84.8,1990.5,...,True,False,True,False,True,True,False,False,False,True
7039,2234-XADUH,Female,0,1,1,72,1,1,103.2,7362.9,...,False,False,True,False,True,True,False,True,False,False
7040,4801-JZAZL,Female,0,1,1,11,0,1,29.6,346.45,...,False,False,False,False,False,False,False,False,True,False
7041,8361-LTMKD,Male,1,1,0,4,1,1,74.4,306.6,...,False,False,False,False,False,False,False,False,False,True
7042,3186-AJIEK,Male,0,0,0,66,1,1,105.65,6844.5,...,True,False,True,False,True,False,True,False,False,False


In [None]:
# (rows, columns) of the frame after one-hot encoding.
df.shape

(7043, 32)

In [None]:
# Spot-check 12 random rows. A fixed random_state draws the same rows on every
# run, so the displayed output is reproducible after Restart & Run All.
df.sample(12, random_state=42)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2759,1194-SPVSP,Male,0,0,0,1,1,0,19.65,19.65,...,False,True,False,True,False,False,False,False,False,False
4420,1544-JJMYL,Male,0,0,0,27,1,0,69.35,1927.3,...,True,False,True,False,False,True,False,True,False,False
1773,2007-QVGAW,Female,0,1,1,68,1,1,19.35,1292.65,...,False,True,False,True,False,False,True,False,False,False
3525,5889-JTMUL,Female,1,1,0,50,1,1,95.05,4888.7,...,False,False,True,False,False,False,False,False,True,False
1123,0074-HDKDG,Male,0,1,1,25,1,1,61.6,1611.0,...,False,False,False,False,False,True,False,False,False,False
996,6641-XRPSU,Female,0,0,0,34,1,1,70.0,2416.1,...,False,False,False,False,False,False,False,True,False,False
3037,8512-WIWYV,Male,0,0,0,32,1,0,20.35,707.5,...,False,True,False,True,False,True,False,True,False,False
4335,9208-OLGAQ,Female,1,0,0,18,1,1,84.95,1443.65,...,False,False,False,False,True,False,False,False,True,False
2389,6161-ERDGD,Male,0,1,1,71,1,0,85.45,6300.85,...,True,False,True,False,True,True,False,False,True,False
1048,2829-HYVZP,Male,0,0,0,29,1,1,19.8,572.2,...,False,True,False,True,False,True,False,False,False,True
