In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# data updates 
df['Churn'] = df['Churn'].map({'Yes':1,'No':0})


df['InternetType']=df['InternetService']
df['InternetService'] = df['InternetService'].map({'Fiber optic':'Yes','DSL':'Yes','No':'No'})

product_features = [
    'PhoneService', 'InternetService', 
    'OnlineBackup', 'OnlineSecurity', 
    'DeviceProtection', 'TechSupport', 
    'StreamingTV', 'StreamingMovies'
]

# for feature in product_features:
#     df[feature].replace({'Yes':1,'No':0},inplace=True)

In [3]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,InternetType
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,Yes,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,DSL
1,5575-GNVDE,Male,0,No,No,34,Yes,No,Yes,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,0,DSL
2,3668-QPYBK,Male,0,No,No,2,Yes,No,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,DSL
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,Yes,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0,DSL
4,9237-HQITU,Female,0,No,No,2,Yes,No,Yes,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1,Fiber optic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,0,DSL
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Yes,No,...,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,0,Fiber optic
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,Yes,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0,DSL
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Yes,No,...,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,1,Fiber optic


In [4]:
from sklearn.model_selection import StratifiedKFold,train_test_split

In [5]:
X = df.drop(columns=['Churn','customerID'])
Y = df['Churn']

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=.2,stratify=df[['Contract','Churn']])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [8]:
cat_features = ['gender','SeniorCitizen','Partner','Dependents','MultipleLines','PaperlessBilling','PaymentMethod','Contract']
cat_features.extend(product_features)

In [9]:
num_features = ['MonthlyCharges']
ord_features = ['tenure']

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

In [11]:
column_transformer = ColumnTransformer([
    ('num',StandardScaler(),num_features),
    ('cat',OneHotEncoder(),cat_features),
    ('ord',OrdinalEncoder(),ord_features)
])

In [12]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
sm_pipeline = make_pipeline(column_transformer,
                            SMOTE(n_jobs=-1,random_state=43),
                            RandomForestClassifier(n_jobs=-1)
                           )

In [15]:
fit = sm_pipeline.fit(X_train,y_train)

In [16]:
y_pred = fit.predict(X_test)

In [21]:
from sklearn.metrics import classification_report
from sklearn import metrics

In [18]:
pd.DataFrame(classification_report(y_test,y_pred,output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.835629,0.609375,0.784244,0.722502,0.775573
recall,0.879227,0.52139,0.784244,0.700309,0.784244
f1-score,0.856874,0.56196,0.784244,0.709417,0.778593
support,1035.0,374.0,0.784244,1409.0,1409.0


In [19]:
def pipeline_cv(splits, X, Y, pipeline):
    
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    
    reports = []
    for train, test in kfold.split(X, Y):
        fit = pipeline.fit(X.iloc[train], Y.iloc[train])
        prediction = fit.predict(X.iloc[test])
        
        reports.append(
            pd.DataFrame(
                metrics.classification_report(
                    Y.iloc[test],prediction,output_dict=True
                )
            )
        )

    df_concat = pd.concat([x for x in reports])

    by_row_index = df_concat.groupby(df_concat.index)
    df_means = by_row_index.mean()

    return df_means


In [22]:
pipeline_cv(3,X,Y,sm_pipeline)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
f1-score,0.853988,0.549723,0.779497,0.701855,0.773245
precision,0.831419,0.600271,0.779497,0.715845,0.770079
recall,0.877849,0.507223,0.779497,0.692536,0.779497
support,1724.666667,623.0,0.779497,2347.666667,2347.666667
