<h2 style='text-align:center'>Feature Engineering</h2>

<hr>

Import libraries

In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split


We created the `preprocess` function in the DataCleaning step; it is repeated here so that this notebook can run on its own.

In [18]:
#data_preprocessing

def preprocess(dataset:pd.DataFrame) -> pd.DataFrame :
    '''
    Return a cleaned copy of the raw churn table; the input frame is left untouched.

    Steps, in order:
        1. Column names are lower-cased and spaces are replaced by underscores.
        2. The values of every object-dtype column get the same treatment.
        3. 'customerid' is removed when present.
        4. 'tenure', 'monthlycharges' and 'totalcharges' are converted with
           pd.to_numeric(errors='coerce'); anything that cannot be parsed becomes 0.
        5. 'churn', when present, becomes 1 for 'yes' and 0 for every other value.

    Columns the dataset is expected to contain (original spelling):
        'gender':               ['Female' 'Male']
        'SeniorCitizen':        [0 1]
        'Partner':              ['Yes' 'No']
        'Dependents':           ['No' 'Yes']
        'tenure':               int
        'PhoneService':         ['No' 'Yes']
        'MultipleLines':        ['No phone service' 'No' 'Yes']
        'InternetService':      ['DSL' 'Fiber optic' 'No']
        'OnlineSecurity':       ['No' 'Yes' 'No internet service']
        'OnlineBackup':         ['Yes' 'No' 'No internet service']
        'DeviceProtection':     ['No' 'Yes' 'No internet service']
        'TechSupport':          ['No' 'Yes' 'No internet service']
        'StreamingTV':          ['No' 'Yes' 'No internet service']
        'StreamingMovies':      ['No' 'Yes' 'No internet service']
        'Contract':             ['Month-to-month' 'One year' 'Two year']
        'PaperlessBilling':     ['Yes' 'No']
        'PaymentMethod':        ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
        'MonthlyCharges':       float
        'TotalCharges':         float
        'Churn':                ['No' 'Yes']   (optional)

    Raises:
        Exception: when 'tenure', 'monthlycharges' or 'totalcharges' is missing
                   after the column names have been normalised.
    '''
    cleaned = dataset.copy()

    # Normalise the header first so every later lookup can rely on lower-case names.
    cleaned.columns = cleaned.columns.str.lower().str.replace(' ','_')

    # Apply the same normalisation to the values of each object-dtype column.
    object_columns = cleaned.dtypes[cleaned.dtypes=='O'].index
    for name in list(object_columns):
        cleaned[name] = cleaned[name].str.lower().str.replace(' ','_')

    # The customer identifier carries no signal; drop it when it is there.
    if 'customerid' in cleaned.columns:
        cleaned = cleaned.drop(columns=['customerid'])

    # Required numeric columns, checked in this order, each with the message
    # raised when the column is absent.
    missing_column_errors = {
        'tenure': "!!! Tenure column didn't exist !!!",
        'monthlycharges': "!!! MonthlyCharges column didn't exist !!!",
        'totalcharges': "!!! TotalCharges column didn't exist !!!",
    }
    for name, message in missing_column_errors.items():
        if name not in cleaned.columns:
            raise Exception(message)
        # Unparseable entries turn into NaN and are then replaced by 0.
        cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce').fillna(0)

    # Encode the target as 0/1 when the column is available.
    if 'churn' in cleaned.columns:
        cleaned['churn'] = cleaned['churn'].eq('yes').astype(int)

    return cleaned

In [19]:
# Load the raw churn table and clean it in a single step.
df = preprocess(pd.read_csv('Churn_prediction.csv'))

# Feature shortlists referenced by the cells below.
categorical_important = [
    'contract',
    'onlinesecurity',
    'techsupport',
    'internetservice',
]
numerical_important = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

<hr>

<h2>Split dataset to <u>training set</u> and <u>test set</u></h2>

<p style='color:red; text-align:center'>It is very important to split the dataset before scaling or encoding it,<br>
because the scaling and encoding parameters must be learned from the training set only and then applied unchanged to the test set.</p>

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
is_feature = df.columns != 'churn'
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, is_feature],
    df['churn'],
    random_state=1,
    test_size=0.2,
)

<hr>

In [22]:
# Keep only the shortlisted categorical and numerical features in the training set.
selected_columns = categorical_important + numerical_important
X_train = X_train[selected_columns]

Unnamed: 0,contract,onlinesecurity,techsupport,internetservice,tenure,monthlycharges,totalcharges
1814,two_year,no_internet_service,no_internet_service,no,12,19.70,258.35
5946,one_year,yes,yes,dsl,42,73.90,3160.55
3881,two_year,yes,yes,dsl,71,65.15,4681.75
2389,one_year,yes,yes,dsl,71,85.45,6300.85
3676,one_year,yes,yes,dsl,30,70.40,2044.75
...,...,...,...,...,...,...,...
905,month-to-month,no,no,fiber_optic,9,100.50,918.60
5192,two_year,no_internet_service,no_internet_service,no,60,19.95,1189.90
3980,month-to-month,no,no,fiber_optic,28,105.70,2979.50
235,month-to-month,no,no,dsl,2,54.40,114.10


<hr>

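Get the working directory path so that the fitted parameters can be saved for later use.<br>
The mean and standard deviation used for standardization, and the category values used for one-hot encoding, are saved to CSV files there so that exactly the same transformation can be applied to the test set or in deployment.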

In [32]:
import os

# Current working directory; the saved parameters live in a folder below it.
path = os.getcwd()

# Folder that holds the saved encoding / scaling parameters.
# os.path.join picks the separator of the host OS; a hard-coded '\\' only
# produces a sub-folder on Windows.
assets_path = os.path.join(path, 'assets')

# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the folder and creating it.
os.makedirs(assets_path, exist_ok=True)


<hr>

<h2>OneHotEncoding</h2>


In [None]:
# We just use this function for training dataset 
def ohe(dataset:pd.DataFrame, categorical_columns:list, info_path:str=None):
    """
    One-hot encode the given columns and remember which categories were seen.

    dataset: frame to encode; it is copied, never modified.
    categorical_columns: names of the columns to encode (a single name given
        as a plain string is accepted as well).
    info_path: CSV file that receives one row per encoded column
        ('col', 'uniques'), where 'uniques' is the comma-joined list of the
        categories in order of first appearance. Defaults to 'ohe_info.csv'
        inside the notebook-level `assets_path` folder.

    Returns a new DataFrame in which every encoded column is replaced by one
    0/1 indicator column named '<column>_<category>' per category.

    NOTE: categories are stored comma-separated, so a category that itself
    contains a comma cannot be restored correctly from the file.
    """
    if isinstance(categorical_columns, str):
        # Iterating a bare string would walk over its characters.
        categorical_columns = [categorical_columns]
    if info_path is None:
        # `assets_path` is defined by an earlier notebook cell.
        info_path = os.path.join(assets_path, 'ohe_info.csv')

    df = dataset.copy()
    info_records = []
    for col in categorical_columns:
        list_Uniques = df[col].unique()
        info_records.append({'col': col, 'uniques': ','.join(map(str, list_Uniques))})
        for name in list_Uniques:
            # Vectorised comparison instead of a per-row Python lambda.
            df[str(col)+'_'+str(name)] = (df[col] == name).astype('int64')
        del df[col]

    # Build the info table once instead of growing a DataFrame row by row.
    df_info = pd.DataFrame(info_records, columns=['col', 'uniques'])
    df_info.to_csv(info_path, index=False)
    return df

# NOTE(review): this encodes the whole `df`, not the `X_train` split created
# earlier, although the notebook states that the split must happen before
# encoding — confirm which frame is intended.
# NOTE(review): `ohe` removes the categorical columns and the result is bound
# to `df` again, so running this cell a second time raises a KeyError.
df=ohe(df, categorical_important)

In [None]:
# We use code below for test dataset.

# df_info = pd.read_csv(assets_path+'\\ohe_info.csv')
# for i in range(len(df_info)):
    # col = df_info.iloc[i,:].col
    # list_Uniques = df_info.iloc[i,:].uniques.split(',')
    # for name in list_Uniques:
        # df[str(col)+'_'+str(name)] =  df[col].apply(lambda x : 1 if x==name else 0)
    # del df[col]

<hr>

<h2>Feature scaling</h2>

In [None]:
# use for training dataset
scale_info = pd.DataFrame(columns=['col','mean','std'])
for col in numerical_important:
    mean_col = df[col].mean()
    std_col = df[col].std()
    df[col] = (df[col]-mean_col)/std_col
    scale_info.loc[len(scale_info)] = [col, mean_col, std_col]

scale_info.to_csv(assets_path+'\\scale_info.csv', index=False)

In [None]:
# use for test dataset

# scale_info = pd.read_csv(assets_path+'\\scale_info.csv')

# for i in range(len(scale_info)):
#     col = scale_info.loc[i,'col']
#     mean_col = scale_info.loc[i,'mean']
#     std_col = scale_info.loc[i,'std']
#     df[col] = (df[col]-mean_col)/std_col

<hr>

Package all of the feature-engineering steps into a single function that can be used for both the training set and the test set.

In [None]:
def FeatureEngineering(dataset:pd.DataFrame,
                       categorical_variables:list,
                       numerical_variables:list,
                       isTrain:bool,
                       scaleColumns:list,
                       assets_dir:str=None):
    """
    One-hot encode and standardise a dataset. The fitted parameters are saved
    when training and re-used for test / deployment data.

    dataset: pd.DataFrame containing at least the listed columns; it is not modified.
    categorical_variables: list of important categorical columns that we want to use in our model
    numerical_variables: list of important numerical columns that we want to use in our model
    isTrain:    if you want to train a model, set it 'True'
                (categories, means and standard deviations are computed and saved)
                if you want to use for test or deployment set it 'False'
                (the saved categories, means and standard deviations are loaded and applied)
    scaleColumns: numerical columns to standardise when isTrain is True.
                When isTrain is False the columns listed in the saved
                scale_info.csv are standardised instead.
    assets_dir: folder that holds 'ohe_info.csv' and 'scale_info.csv'.
                Defaults to an 'assets' folder inside the current working
                directory; it is created when missing.

    Returns:
        A new DataFrame with the numerical columns first, followed by one 0/1
        indicator column named '<column>_<category>' per stored category.
        Categories that were not seen during training get 0 in every indicator.

    Raises:
        KeyError: when a requested column is not present in `dataset`.
        FileNotFoundError: when isTrain is False and the saved CSV files are missing.

    NOTE: categories are stored comma-separated, so a category that itself
    contains a comma cannot be restored correctly from the file.
    """
    categorical_important = list(categorical_variables)
    numerical_important = list(numerical_variables)
    # .copy() gives an independent frame, so the assignments below neither
    # touch the caller's data nor trigger chained-assignment warnings.
    df = dataset[numerical_important + categorical_important].copy()

    # we create a folder to save our data there
    if assets_dir is None:
        assets_dir = os.path.join(os.getcwd(), 'assets')
    os.makedirs(assets_dir, exist_ok=True)
    # os.path.join picks the separator of the host OS (the hard-coded '\\'
    # only worked on Windows).
    ohe_path = os.path.join(assets_dir, 'ohe_info.csv')
    scale_path = os.path.join(assets_dir, 'scale_info.csv')

    # Dummy variables
    if isTrain:
        # Categories are recorded by their string form because that is what
        # comes back from the CSV file at test time.
        categories = {
            col: list(df[col].map(str).unique())
            for col in categorical_important
        }
        ohe_records = [
            {'col': col, 'uniques': ','.join(values)}
            for col, values in categories.items()
        ]
        pd.DataFrame(ohe_records, columns=['col', 'uniques']).to_csv(ohe_path, index=False)
    else:
        # dtype=str / keep_default_na=False: keep every token exactly as it was
        # written (no conversion of e.g. '1' to an int or 'NA' to NaN).
        df_info = pd.read_csv(ohe_path, dtype=str, keep_default_na=False)
        categories = {
            col: uniques.split(',')
            for col, uniques in zip(df_info['col'], df_info['uniques'])
        }

    for col, values in categories.items():
        # Compare string forms so that training and test data are encoded by
        # the same rule, including non-string categories.
        labels = df[col].map(str)
        for name in values:
            df[str(col)+'_'+name] = (labels == name).astype('int64')
        del df[col]

    # Feature scaling
    if isTrain:
        scale_records = []
        for col in scaleColumns:
            mean_col = df[col].mean()
            std_col = df[col].std()
            if not std_col > 0:
                # Constant (or single-row) column: dividing by 0 / NaN would
                # fill the column with NaN, so leave the centred values as is.
                std_col = 1.0
            df[col] = (df[col]-mean_col)/std_col
            scale_records.append({'col': col, 'mean': mean_col, 'std': std_col})

        scale_info = pd.DataFrame(scale_records, columns=['col','mean','std'])
        scale_info.to_csv(scale_path, index=False)
    else:
        scale_info = pd.read_csv(scale_path)
        for col, mean_col, std_col in zip(scale_info['col'], scale_info['mean'], scale_info['std']):
            df[col] = (df[col]-mean_col)/std_col
    return df