> <h2>EDA</h2>

<h2>import data</h2>

In [13]:
import pandas as pd
# Initial load of the raw churn CSV.
# NOTE(review): the same file is read again right after data_preprocessing is
# defined, so one of the two reads is redundant.
df = pd.read_csv('Churn_prediction.csv')

def data_preprocessing(dataset:pd.DataFrame):
    '''
    Clean a raw churn dataset and return the cleaned copy.

    The input frame is not modified. Steps, in order:
        1. Column names are lower-cased and spaces become underscores.
        2. Every object-dtype column gets the same treatment on its values.
        3. The 'customerid' column is removed when present.
        4. 'tenure', 'monthlycharges' and 'totalcharges' are coerced to
           numeric; values that cannot be parsed become 0.
        5. 'churn' (optional) is mapped to 1 for 'yes' and 0 otherwise.
        6. Duplicate rows are dropped (the original index labels are kept).

    Expected columns (names as in the raw file, before lower-casing):
        'gender':               ['Female' 'Male']
        'SeniorCitizen':        [0 1]
        'Partner':              ['Yes' 'No']
        'Dependents':           ['No' 'Yes']
        'tenure':               int
        'PhoneService':         ['No' 'Yes']
        'MultipleLines':        ['No phone service' 'No' 'Yes']
        'InternetService':      ['DSL' 'Fiber optic' 'No']
        'OnlineSecurity':       ['No' 'Yes' 'No internet service']
        'OnlineBackup':         ['Yes' 'No' 'No internet service']
        'DeviceProtection':     ['No' 'Yes' 'No internet service']
        'TechSupport':          ['No' 'Yes' 'No internet service']
        'StreamingTV':          ['No' 'Yes' 'No internet service']
        'StreamingMovies':      ['No' 'Yes' 'No internet service']
        'Contract':             ['Month-to-month' 'One year' 'Two year']
        'PaperlessBilling':     ['Yes' 'No']
        'PaymentMethod':        ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
        'MonthlyCharges':       float
        'TotalCharges':         float
        'Churn':                ['No' 'Yes']   (optional)

    Raises:
        Exception: if 'tenure', 'monthlycharges' or 'totalcharges' is missing
                   after the column names have been normalised.
    '''
    def _snake(text_holder):
        # Works for both an Index and a Series: lower-case, spaces -> underscores.
        return text_holder.str.lower().str.replace(' ', '_')

    frame = dataset.copy()

    # Normalise the header first so every later lookup can use lower-case names.
    frame.columns = _snake(frame.columns)

    # Normalise the values of every object-dtype (string) column.
    is_object = frame.dtypes == 'O'
    for name in is_object[is_object].index.tolist():
        frame[name] = _snake(frame[name])

    # The customer identifier carries no predictive information.
    if 'customerid' in frame.columns:
        frame = frame.drop(columns=['customerid'])

    # Required numeric columns, checked in this order; each one is paired with
    # the error raised when it is missing.
    required_numeric = (
        ('tenure', "!!! Tenure column didn't exist !!!"),
        ('monthlycharges', "!!! MonthlyCharges column didn't exist !!!"),
        ('totalcharges', "!!! TotalCharges column didn't exist !!!"),
    )
    for name, missing_message in required_numeric:
        if name not in frame.columns:
            raise Exception(missing_message)
        # Unparseable entries become NaN and are then replaced by 0.
        frame[name] = pd.to_numeric(frame[name], errors='coerce').fillna(0)

    # Encode the target as 0/1 when it is available.
    if 'churn' in frame.columns:
        frame['churn'] = frame['churn'].eq('yes').astype(int)

    # Rows that are identical once the id column is gone are collapsed.
    frame = frame.drop_duplicates()

    return frame

# Load the raw CSV and replace it with its cleaned version
# (lower-cased names/values, numeric columns coerced, churn as 0/1, duplicates dropped).
df = pd.read_csv('Churn_prediction.csv')
df = data_preprocessing(df)

<h2>Split dataset to training set and test set</h2>

In [14]:
from sklearn.model_selection import train_test_split
# Separate the predictors from the target, then hold out 20% of the rows
# for testing. The seed is fixed so the split is reproducible.
features = df.drop(columns='churn')
target = df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=666,
)

In [None]:
# from sklearn.model_selection import train_test_split
# df_train, df_test = train_test_split(df, test_size=0.2, random_state=666)

# print("df_train shape: {0}".format(df_train.shape))
# print("df_test shape: {0}".format(df_test.shape))

# y_train = df_train['churn'].values
# y_test = df_test['churn'].values


<p>Because the proportions of churned and non-churned customers are not the same (the classes are imbalanced), accuracy is not a good choice for model evaluation, so we use other evaluation metrics.</p>

In [17]:
# Share of each class in the training target (normalize=True returns fractions instead of counts).
y_train.value_counts(normalize=True)

0    0.730947
1    0.269053
Name: churn, dtype: float64

In [18]:
# Overall churn rate in the training target (churn is encoded 0/1, so the mean is the rate).
# The per-group risk tables further down compare each group's rate against this value.
global_mean = y_train.mean()
round(global_mean, 3)

0.269

<h2>Feature importance</h2>

In [20]:
# Columns treated as categorical features. 'seniorcitizen' is stored as 0/1
# but is handled as a category here.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
# Columns treated as numeric features.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [24]:
# Number of distinct values of each categorical feature in the training split.
X_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

<p>Which groups of observations have a higher risk of churn?</p>

In [25]:
# Training features with the target attached (aligned on the row index),
# so churn can be aggregated per category value.
Xy_train = X_train.assign(churn=y_train)

# For every categorical feature show, per value:
#   mean - churn rate inside the group
#   diff - group rate minus the global churn rate
#   risk - group rate divided by the global churn rate (>1 means riskier than average)
for col in categorical:
    group_rate = Xy_train.groupby(by=col)['churn'].mean()
    df_group = pd.DataFrame({
        'mean': group_rate,
        'diff': group_rate - global_mean,
        'risk': group_rate / global_mean,
    })
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.272467,0.003414,1.01269
male,0.265675,-0.003378,0.987445


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237953,-0.0311,0.884411
1,0.426566,0.157513,1.585436


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.332876,0.063824,1.237216
yes,0.200074,-0.068979,0.743624


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.318517,0.049464,1.183845
yes,0.153067,-0.115985,0.568912


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.239209,-0.029844,0.889077
yes,0.272332,0.003279,1.012188


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.252495,-0.016557,0.938461
no_phone_service,0.239209,-0.029844,0.889077
yes,0.295117,0.026064,1.096873


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191688,-0.077364,0.712456
fiber_optic,0.421982,0.152929,1.568397
no,0.075125,-0.193927,0.279221


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.422691,0.153639,1.571036
no_internet_service,0.075125,-0.193927,0.279221
yes,0.147783,-0.121269,0.549272


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.405218,0.136165,1.506092
no_internet_service,0.075125,-0.193927,0.279221
yes,0.217303,-0.05175,0.807659


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.392554,0.123501,1.459021
no_internet_service,0.075125,-0.193927,0.279221
yes,0.231638,-0.037414,0.860941


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416697,0.147644,1.548754
no_internet_service,0.075125,-0.193927,0.279221
yes,0.158574,-0.110479,0.589379


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.344329,0.075276,1.279783
no_internet_service,0.075125,-0.193927,0.279221
yes,0.298866,0.029814,1.110809


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.3397,0.070647,1.262577
no_internet_service,0.075125,-0.193927,0.279221
yes,0.30374,0.034688,1.128925


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.430839,0.161786,1.601318
one_year,0.119211,-0.149842,0.443077
two_year,0.030814,-0.238238,0.114529


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.166593,-0.102459,0.619185
yes,0.338821,0.069768,1.25931


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.173948,-0.095104,0.646521
credit_card_(automatic),0.158367,-0.110685,0.588611
electronic_check,0.4584,0.189347,1.703754
mailed_check,0.186909,-0.082144,0.694691


<h2>Mutual information</h2>
<p>Mutual information (MI) is a concept from information theory: it tells us how much we can learn about one variable if we know the value of another.</p>
<a href='https://en.wikipedia.org/wiki/Mutual_information'>https://en.wikipedia.org/wiki/Mutual_information</a>
<p>We find that contract, onlinesecurity, techsupport and internetservice are the most important categorical independent variables.</p>

In [26]:
from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """Return the mutual information between one feature column and the training target `y_train`."""
    score = mutual_info_score(series, y_train)
    return score

# Score every categorical feature, then rank from most to least informative.
mi_scores = X_train[categorical].apply(calculate_mi)
mi_ranked = mi_scores.sort_values(ascending=False)
df_mi = mi_ranked.to_frame(name='more_important')

print(df_mi)
# Hand-picked shortlist: the four features at the top of the ranking printed above.
categorical_important = ['contract', 'onlinesecurity', 'techsupport', 'internetservice']

                  more_important
contract                0.097340
onlinesecurity          0.065429
techsupport             0.061188
internetservice         0.055655
onlinebackup            0.047400
paymentmethod           0.044806
deviceprotection        0.043037
streamingtv             0.032241
streamingmovies         0.031892
paperlessbilling        0.018954
dependents              0.015690
seniorcitizen           0.011574
partner                 0.011330
multiplelines           0.001280
phoneservice            0.000254
gender                  0.000029


<h2>Correlation between numerical variables and the dependent variable (churn)</h2>
<p>None of them is important.</p>

In [28]:
# Column-wise correlation of each numeric feature with the 0/1 training target.
print(X_train[numerical].corrwith(y_train).to_frame('correlation'))
# All three numeric columns are kept as model inputs regardless of the correlations printed above.
numerical_important = ['tenure', 'monthlycharges', 'totalcharges']

                correlation
tenure            -0.349202
monthlycharges     0.195308
totalcharges      -0.196418


In [29]:
# Average of each numeric feature, split by churn outcome (0 = stayed, 1 = churned).
Xy_train.groupby(by='churn')[numerical].mean()


Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.53106,61.5262,2557.191194
1,18.2409,74.77366,1553.076737


In [30]:
# Keep only the shortlisted columns: categorical ones first, then the numeric ones.
# NOTE: this rebinds X_train, so the columns dropped here are no longer available from it.
X_train = X_train[categorical_important+numerical_important]
X_train

Unnamed: 0,contract,onlinesecurity,techsupport,internetservice,tenure,monthlycharges,totalcharges
3448,month-to-month,no,yes,dsl,4,61.45,229.55
3697,month-to-month,yes,no,fiber_optic,32,89.60,2901.80
2747,month-to-month,no_internet_service,no_internet_service,no,7,19.40,168.65
4420,one_year,yes,yes,dsl,27,69.35,1927.30
1022,month-to-month,no,no,fiber_optic,5,84.70,392.50
...,...,...,...,...,...,...,...
70,month-to-month,no,yes,dsl,1,49.05,49.05
2884,two_year,yes,yes,dsl,72,90.15,6716.45
1955,month-to-month,no,no,fiber_optic,49,95.40,4613.95
1926,month-to-month,no,no,fiber_optic,49,99.80,4872.45


<h2>One Hot Encoding</h2>

In [31]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import pickle

# One-hot encode the shortlisted categorical columns; every other column is
# passed through unchanged and appended after the encoded ones.
OHE = make_column_transformer((OneHotEncoder(), categorical_important ),
                                        remainder='passthrough',
                                        verbose_feature_names_out=False)

ohe = OHE.fit_transform(X_train)

# Persist the *fitted transformer* (not the transformed training matrix), so
# that the same encoding can later be applied to unseen data. The `with`
# block guarantees the file is flushed and closed.
with open('OneHotEncoder.pkl', 'wb') as encoder_file:
    pickle.dump(OHE, encoder_file)

# save OHE.get_feature_names_out() to use in test process
feature_names_out = OHE.get_feature_names_out()
with open("OHE_feature_names_out.txt", "w") as output:
    for item in feature_names_out:
        output.write('%s\n' %item)

# To reload the fitted transformer later (only unpickle files you created yourself):
# with open('OneHotEncoder.pkl', 'rb') as encoder_file:
#     OHE = pickle.load(encoder_file)

# The new frame gets a fresh 0..n-1 index; row order is unchanged.
X_train = pd.DataFrame(ohe, columns=feature_names_out)

X_train

Unnamed: 0,contract_month-to-month,contract_one_year,contract_two_year,onlinesecurity_no,onlinesecurity_no_internet_service,onlinesecurity_yes,techsupport_no,techsupport_no_internet_service,techsupport_yes,internetservice_dsl,internetservice_fiber_optic,internetservice_no,tenure,monthlycharges,totalcharges
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,61.45,229.55
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,32.0,89.60,2901.80
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,7.0,19.40,168.65
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,27.0,69.35,1927.30
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,84.70,392.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5611,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,49.05,49.05
5612,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,72.0,90.15,6716.45
5613,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,49.0,95.40,4613.95
5614,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,49.0,99.80,4872.45


<h2>Feature scaling</h2>

In [32]:
from sklearn.preprocessing import StandardScaler
import pickle

# Fit the scaler on the encoded training matrix only.
scaler = StandardScaler()
scaler.fit(X_train)

# Persist the fitted scaler; the `with` block closes the file handle
# (a bare open() inside pickle.dump leaves it to the garbage collector).
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Standardised training features, with the same column names as the input.
df_train = scaler.transform(X_train)
df_train =pd.DataFrame(df_train, columns=scaler.feature_names_in_)

# To reload the fitted scaler later (only unpickle files you created yourself):
# with open('scaler.pkl', 'rb') as scaler_file:
#     scaler = pickle.load(scaler_file)


<h2>Create a function for Feature Engineering</h2>

In [42]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
import pickle

def FeatureEngineering(dataset:pd.DataFrame, categorical_variables:list, numerical_variables:list, isTrain:bool):
    """
    One-hot encode the categorical columns and standardise all resulting columns.

    dataset:                pd.DataFrame that contains every column listed in
                            `categorical_variables` and `numerical_variables`
                            (other columns are ignored). It is not modified.
    categorical_variables:  list of important categorical columns that we want to use in our model
    numerical_variables:    list of important numerical columns that we want to use in our model
    isTrain:    if you want to train a model, set it 'True': the encoder and
                the scaler are fitted on `dataset` and saved to
                'OneHotEncoder.pkl', 'OHE_feature_names_out.txt' and 'scaler.pkl'
                in the current directory (existing files are overwritten).
                if you want to use for test or deployment set it 'False': the
                previously saved encoder and scaler are loaded and applied to
                `dataset` without re-fitting.

    Returns a new DataFrame with a fresh 0..n-1 index: the one-hot columns
    first, then the numerical columns, all standardised.

    Raises TypeError when isTrain is False and 'OneHotEncoder.pkl' does not
    contain a fitted transformer (for example a file written by an older
    version of this function, which stored the encoded training matrix).
    """
    categorical_important = list(categorical_variables)
    numerical_important = list(numerical_variables)

    df = dataset[numerical_important + categorical_important].copy()

    # Dummy variables
    if isTrain:
        OHE = make_column_transformer((OneHotEncoder(), categorical_important ),
                                            remainder='passthrough',
                                            verbose_feature_names_out=False)
        encoded = OHE.fit_transform(df)

        # Save the fitted transformer itself so it can encode new data later.
        with open('OneHotEncoder.pkl', 'wb') as file:
            pickle.dump(OHE, file)

        feature_names = OHE.get_feature_names_out()

        # we Save OHE.get_feature_names_out() to use in test process or deployment in future
        with open("OHE_feature_names_out.txt", "w") as file:
            for item in feature_names:
                file.write('%s\n' %item)
    else:
        # SECURITY: pickle can execute arbitrary code on load; only load files
        # that were produced by a trusted run of this function.
        with open('OneHotEncoder.pkl', 'rb') as file:
            OHE = pickle.load(file)
        if not hasattr(OHE, 'transform'):
            raise TypeError("'OneHotEncoder.pkl' does not contain a fitted transformer; "
                            "re-run FeatureEngineering with isTrain=True to regenerate it")

        # Apply the encoding learned at training time to the incoming data.
        encoded = OHE.transform(df)
        feature_names = OHE.get_feature_names_out()

    # The transformer may return a sparse matrix; DataFrame needs a dense array.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    df = pd.DataFrame(encoded, columns=feature_names)

    # Feature scaling
    if isTrain:
        scaler = StandardScaler()
        scaler.fit(df)
        with open('scaler.pkl', 'wb') as file:
            pickle.dump(scaler, file)
    else:
        # SECURITY: same caveat as above, trusted files only.
        with open('scaler.pkl', 'rb') as file:
            scaler = pickle.load(file)

    df = pd.DataFrame(scaler.transform(df), columns=scaler.feature_names_in_)

    return df

In [44]:
import pandas as pd
# End-to-end run: load the raw CSV, clean it, then encode and scale the shortlisted features.
df = pd.read_csv('Churn_prediction.csv')

# Feature shortlists passed to FeatureEngineering.
categorical_important = ['contract', 'onlinesecurity', 'techsupport', 'internetservice']
numerical_important = ['tenure', 'monthlycharges', 'totalcharges']

df = data_preprocessing(df)
# NOTE(review): isTrain=True fits the encoder and the scaler on the whole
# preprocessed dataset (no train/test split is applied in this cell) and
# overwrites 'OneHotEncoder.pkl', 'OHE_feature_names_out.txt' and 'scaler.pkl'.
# The returned frame holds only the selected features, so 'churn' is no longer
# in `df` after this call.
df = FeatureEngineering(df,
                        categorical_variables=categorical_important,
                        numerical_variables=numerical_important,
                        isTrain=True)
df

Unnamed: 0,contract_month-to-month,contract_one_year,contract_two_year,onlinesecurity_no,onlinesecurity_no_internet_service,onlinesecurity_yes,techsupport_no,techsupport_no_internet_service,techsupport_yes,internetservice_dsl,internetservice_fiber_optic,internetservice_no,tenure,monthlycharges,totalcharges
0,0.906762,-0.515268,-0.564137,1.005857,-0.523889,-0.635326,1.013046,-0.523889,-0.640850,1.379289,-0.886600,-0.523889,-1.282728,-1.164135,-0.995686
1,-1.102826,1.940738,-0.564137,-0.994177,-0.523889,1.573996,1.013046,-0.523889,-0.640850,1.379289,-0.886600,-0.523889,0.062387,-0.262811,-0.175262
2,0.906762,-0.515268,-0.564137,-0.994177,-0.523889,1.573996,1.013046,-0.523889,-0.640850,1.379289,-0.886600,-0.523889,-1.241967,-0.365914,-0.961142
3,-1.102826,1.940738,-0.564137,-0.994177,-0.523889,1.573996,-0.987122,-0.523889,1.560427,1.379289,-0.886600,-0.523889,0.510759,-0.750058,-0.196769
4,0.906762,-0.515268,-0.564137,1.005857,-0.523889,-0.635326,1.013046,-0.523889,-0.640850,-0.725011,1.127904,-0.523889,-1.241967,0.194503,-0.941951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7016,-1.102826,1.940738,-0.564137,-0.994177,-0.523889,1.573996,-0.987122,-0.523889,1.560427,1.379289,-0.886600,-0.523889,-0.345224,0.663458,-0.130704
7017,-1.102826,1.940738,-0.564137,1.005857,-0.523889,-0.635326,1.013046,-0.523889,-0.640850,-0.725011,1.127904,-0.523889,1.611307,1.275428,2.239445
7018,0.906762,-0.515268,-0.564137,-0.994177,-0.523889,1.573996,1.013046,-0.523889,-0.640850,1.379289,-0.886600,-0.523889,-0.875118,-1.172450,-0.856011
7019,0.906762,-0.515268,-0.564137,1.005857,-0.523889,-0.635326,1.013046,-0.523889,-0.640850,-0.725011,1.127904,-0.523889,-1.160445,0.317562,-0.873592
