In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
import pickle

def data_preprocessing(dataset:pd.DataFrame):
    '''
    Clean a raw churn dataset and return the cleaned copy (the input is not modified).

    The dataset is expected to contain these columns (column names are lower-cased
    and spaces replaced by underscores before they are looked up):
        'gender':               ['Female' 'Male']
        'SeniorCitizen':        [0 1]
        'Partner':              ['Yes' 'No']
        'Dependents':           ['No' 'Yes']
        'tenure':               int
        'PhoneService':         ['No' 'Yes']
        'MultipleLines':        ['No phone service' 'No' 'Yes']
        'InternetService':      ['DSL' 'Fiber optic' 'No']
        'OnlineSecurity':       ['No' 'Yes' 'No internet service']
        'OnlineBackup':         ['Yes' 'No' 'No internet service']
        'DeviceProtection':     ['No' 'Yes' 'No internet service']
        'TechSupport':          ['No' 'Yes' 'No internet service']
        'StreamingTV':          ['No' 'Yes' 'No internet service']
        'StreamingMovies':      ['No' 'Yes' 'No internet service']
        'Contract':             ['Month-to-month' 'One year' 'Two year']
        'PaperlessBilling':     ['Yes' 'No']
        'PaymentMethod':        ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
        'MonthlyCharges':       float
        'TotalCharges':         float
        'Churn':                ['No' 'Yes']   (optional)

    Steps:
        1. lower-case column names and text values, replacing spaces with '_';
        2. drop the 'customerid' column when present;
        3. coerce 'tenure', 'monthlycharges' and 'totalcharges' to numbers,
           replacing unparsable or missing values with 0;
        4. encode 'churn' (when present) as 1 for 'yes' and 0 otherwise;
        5. drop duplicated rows and renumber the index 0..n-1.

    Returns:
        pd.DataFrame: the cleaned frame with a contiguous RangeIndex.

    Raises:
        ValueError: if 'tenure', 'monthlycharges' or 'totalcharges' is missing.
    '''
    df = dataset.copy()

    # Prepare string variables
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    # Treat both object columns and pandas string-dtype columns as text, so the
    # value normalisation does not depend on how the frame was loaded.
    text_columns = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_')

    # drop customerid
    if 'customerid' in df.columns:
        del df['customerid']

    # Prepare numeric columns type (checked in this order so the first missing
    # column is the one reported).
    required_numeric = (
        ('tenure', 'Tenure'),
        ('monthlycharges', 'MonthlyCharges'),
        ('totalcharges', 'TotalCharges'),
    )
    for col, label in required_numeric:
        if col not in df.columns:
            raise ValueError(f"!!! {label} column didn't exist !!!")
        # Unparsable entries (e.g. blank strings) become NaN and are then set to 0.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Make dependent variable numeric
    if 'churn' in df.columns:
        df['churn'] = (df['churn'] == 'yes').astype(int)

    # Drop duplicates and renumber the rows: leaving gaps in the index makes any
    # later index-aligned assignment against a fresh 0..n-1 frame misalign rows.
    df = df.drop_duplicates().reset_index(drop=True)

    return df

def FeatureEngineering(dataset:pd.DataFrame, categorical_variables:list, numerical_variables:list, target_variable:str, fit:bool=True):
    '''
    One-hot encode the categorical columns, standardise every feature and
    return the result with the target column attached.

    Parameters:
        dataset: preprocessed frame containing the listed feature columns and,
            optionally, the target column.
        categorical_variables: names of the columns to one-hot encode.
        numerical_variables: names of the numeric columns to pass through.
        target_variable: target column name; it is lower-cased and spaces are
            replaced by '_' before it is looked up in `dataset`.
        fit: when True (default) fit a new encoder and scaler on `dataset` and
            save them to 'OneHotEncoder.pkl' and 'scaler.pkl' in the working
            directory. When False, load those two files and only transform, so
            new data is encoded and scaled exactly like the data they were fitted on.

    Returns:
        pd.DataFrame indexed 0..n-1 with the scaled feature columns, followed by
        the target column when `dataset` contains it.
    '''
    categorical_important = list(categorical_variables)
    numerical_important = list(numerical_variables)
    target_name = target_variable.lower().replace(' ', '_')

    features = dataset[numerical_important + categorical_important]

    # Dummy variables
    if fit:
        # sparse_threshold=0 forces a dense array so it can be wrapped in a DataFrame.
        encoder = make_column_transformer((OneHotEncoder(), categorical_important),
                                          remainder='passthrough',
                                          sparse_threshold=0,
                                          verbose_feature_names_out=False)
        encoded = encoder.fit_transform(features)
        # Persist the fitted transformer itself (not the transformed array) so it
        # can be reused later.
        with open('OneHotEncoder.pkl', 'wb') as fh:
            pickle.dump(encoder, fh)
    else:
        # Only load pickles written by a trusted earlier fit: unpickling
        # untrusted files can execute arbitrary code.
        with open('OneHotEncoder.pkl', 'rb') as fh:
            encoder = pickle.load(fh)
        encoded = encoder.transform(features)
    df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

    # Feature scaling
    if fit:
        scaler = StandardScaler()
        scaler.fit(df)
        with open('scaler.pkl', 'wb') as fh:
            pickle.dump(scaler, fh)
    else:
        with open('scaler.pkl', 'rb') as fh:
            scaler = pickle.load(fh)
    df = pd.DataFrame(scaler.transform(df), columns=df.columns)

    # add target variable
    # Assign by position: `df` has a fresh 0..n-1 index, so aligning on the index
    # of `dataset` would shift labels (or produce NaN) whenever that index has gaps.
    if target_name in dataset.columns:
        df[target_name] = dataset[target_name].to_numpy()
    return df

In [2]:
# Read Churn_prediction.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Churn_prediction.csv')

<h2>Split dataset</h2>

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=666)

# Discard the shuffled original row labels so both frames are indexed 0..n-1.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


In [4]:
# Feature subsets used by every model below.
categorical_important = ['contract', 'onlinesecurity', 'techsupport', 'internetservice']
numerical_important = ['tenure', 'monthlycharges', 'totalcharges']

# Clean the training split, then encode and scale the selected features.
df_train = FeatureEngineering(
    data_preprocessing(df_train),
    categorical_variables=categorical_important,
    numerical_variables=numerical_important,
    target_variable='Churn',
)

# Separate the label from the feature matrix.
y_train = df_train.pop('churn')

df_train.head().T

Unnamed: 0,0,1,2,3,4
contract_month-to-month,0.908986,0.908986,0.908986,0.908986,0.908986
contract_one_year,-0.518023,-0.518023,-0.518023,-0.518023,-0.518023
contract_two_year,-0.563275,-0.563275,-0.563275,-0.563275,-0.563275
onlinesecurity_no,1.004269,1.004269,1.004269,1.004269,1.004269
onlinesecurity_no_internet_service,-0.52298,-0.52298,-0.52298,-0.52298,-0.52298
onlinesecurity_yes,-0.63501,-0.63501,-0.63501,-0.63501,-0.63501
techsupport_no,1.019357,1.019357,-0.981011,1.019357,1.019357
techsupport_no_internet_service,-0.52298,-0.52298,-0.52298,-0.52298,-0.52298
techsupport_yes,-0.646585,-0.646585,1.546587,-0.646585,-0.646585
internetservice_dsl,1.380985,-0.724121,1.380985,-0.724121,-0.724121


<h2>1) Logistic Regression</h2>

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings and a fixed seed.
clf_LogisticRegression = LogisticRegression(random_state=0)

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_LogisticRegression,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())


0.8367262420328858


<h2>2) K Nearest Neighbors</h2>

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings.
clf_KNN = KNeighborsClassifier()

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_KNN,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())


0.7796812937086126


<h2>3) Naive Bayes</h2>

In [7]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
clf_NB = GaussianNB()

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_NB,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())


0.8254343639998684


<h2>4) Support Vector Machine for Classification (SVC)</h2>

In [8]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
clf_SVC = SVC(kernel='linear', random_state=6)

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_SVC,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())
# Mean test score recorded for each kernel tried:
#   sigmoid   0.7217249049598338
#   poly      0.8158020900887415
#   rbf       0.7720218063211098
#   linear    0.8092300188309788


0.8092300188309788


<h2>5) Decision Tree Classification</h2>

In [9]:
# NOTE(review): `c` is not used anywhere in the visible cells; this looks like an
# accidental editor auto-import - confirm and remove.
from calendar import c
from sklearn.tree import DecisionTreeClassifier

# Decision tree split on entropy, with a fixed seed.
clf_DT = DecisionTreeClassifier(criterion='entropy', random_state=6)

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_DT,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())
# Mean test score recorded for each criterion tried:
#   gini      0.666598842172868
#   entropy   0.6673704556237601

0.6673704556237601


<h2>6) Random Forest</h2>

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 entropy-based trees, with a fixed seed.
clf_RF = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=6)

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_RF,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())
# Mean test score recorded for each criterion tried:
#   gini       0.7940777923624925
#   entropy    0.7955202892868366

0.7955202892868366


<h2>7) XGBoost</h2>

In [11]:
from xgboost import XGBClassifier

# XGBoost classifier with default settings.
clf_xgboost = XGBClassifier()

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_xgboost,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())

0.8171957479568427


<h2>8) Gradient Boosting model</h2>

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 20 trees of depth 3 with learning rate 0.1.
# NOTE(review): no random_state is passed here, unlike the seeded models in the
# other cells - confirm whether that is intentional.
clf_GB = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_depth=3)

# Six-fold cross-validation on the training split, scored by ROC AUC.
scores = cross_validate(
    clf_GB,
    df_train,
    y_train,
    cv=6,
    scoring='roc_auc',
    return_train_score=True,
)
print(scores['test_score'].mean())

0.8405381208558099


<h2>TEST</h2>

In [13]:
# NOTE(review): this call fits a new encoder and scaler on the test split, so the
# test features are encoded/scaled with statistics learned from the test data
# itself, and 'OneHotEncoder.pkl' / 'scaler.pkl' are overwritten. The test split
# should be transformed with the objects fitted on the training split.
df_test = data_preprocessing(df_test)
df_test = FeatureEngineering(df_test,
                              categorical_variables=categorical_important,
                              numerical_variables=numerical_important,
                              target_variable='churn')
y_test = df_test['churn']
del df_test['churn']

# model
clf_LogisticRegression = LogisticRegression(random_state=0)
clf_LogisticRegression.fit(df_train, y_train)

# clf_GB = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_depth=3)
# clf_GB.fit(df_train, y_train)

# Hard class labels for the test split.
pred = clf_LogisticRegression.predict(df_test)

from sklearn.metrics import roc_auc_score
# Score with the positive-class probability rather than the hard labels, so the
# test metric is computed the same way as the 'roc_auc' cross-validation scores.
pred_proba = clf_LogisticRegression.predict_proba(df_test)[:, 1]
print(roc_auc_score(y_test, pred_proba))
# Previously recorded value: 0.6610603450428183
# NOTE(review): presumably computed from the hard labels in `pred` - confirm.