import data

In [37]:
import pandas as pd
# Load the churn dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Churn_prediction.csv')

<hr>

In [38]:
# Preview the first rows, transposed so that each column is shown as a row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


<hr>

In [39]:
# Summarize column names, non-null counts and dtypes to spot missing or mis-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Very interesting

- We have 21 columns and 7043 observations
- The dataset doesn't have missing values
- Some features are categorical
- Our Target variable is 'Churn' and it's categorical 

<hr>

The 'TotalCharges' column should be numeric, but it was loaded as a non-numeric column, so I convert it to numeric. Values that cannot be parsed as numbers become NaN and are then replaced with 0.

In [40]:
# Parse TotalCharges as numbers; unparseable entries become NaN and are then set to 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

<i>tenure</i>, <i>MonthlyCharges</i> and <i>TotalCharges</i> are numeric columns.
<br>It's important for us to import them as numeric columns.

<hr>

customerID is unique for each observation, so it does not help us build an ML model.
<br>So I remove it.

In [41]:
# Drop the identifier column. errors='ignore' keeps this cell safe to re-run:
# a plain `del` raises KeyError once the column has already been removed.
df = df.drop(columns=['customerID'], errors='ignore')

<hr>

We put our data into a standard format (both column names and string values):
- convert to lowercase
- replace spaces with underscores (_)

In [42]:
# Normalize the column names: lowercase, spaces replaced by underscores.
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(' ','_')

# Collect the object-dtype (string) columns and normalize their values the same way.
object_mask = df.dtypes=='O'
string_columns = list(df.dtypes[object_mask].index)
for col in string_columns:
    lowered_values = df[col].str.lower()
    df[col] = lowered_values.str.replace(' ','_')

<hr>

Understand the categorical features by listing the distinct values of each string column.

In [50]:
# Show the distinct values of every column collected in string_columns.
for column_name in string_columns:
    distinct_values = set(df[column_name])
    print('{0} : {1}'.format(column_name, distinct_values))

gender : {'male', 'female'}
partner : {'no', 'yes'}
dependents : {'no', 'yes'}
phoneservice : {'no', 'yes'}
multiplelines : {'no_phone_service', 'no', 'yes'}
internetservice : {'no', 'fiber_optic', 'dsl'}
onlinesecurity : {'no_internet_service', 'no', 'yes'}
onlinebackup : {'no_internet_service', 'no', 'yes'}
deviceprotection : {'no_internet_service', 'no', 'yes'}
techsupport : {'no_internet_service', 'no', 'yes'}
streamingtv : {'no_internet_service', 'no', 'yes'}
streamingmovies : {'no_internet_service', 'no', 'yes'}
contract : {'one_year', 'two_year', 'month-to-month'}
paperlessbilling : {'no', 'yes'}
paymentmethod : {'mailed_check', 'bank_transfer_(automatic)', 'electronic_check', 'credit_card_(automatic)'}
churn : {0, 1}


<hr>

We will use ML algorithms to predict churn, so I encode the target as 1 for churn and 0 for no churn. This lets models that classify with a sigmoid function use it later.

In [44]:
# Encode the target as 1 (churn) / 0 (no churn).
# Only do so while the column still holds the 'yes'/'no' labels: re-running the
# comparison on an already-encoded numeric column would turn every value into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

<hr>

See the result

In [45]:
# Look at the first rows again (transposed) to check the cleaned-up frame.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no
onlinebackup,yes,no,yes,no,no


<hr>

I package all of the steps above into a function so they can be reused in the future.

In [51]:
#data_preprocessing

def preprocess(dataset:pd.DataFrame) -> pd.DataFrame :
    '''
    Return a cleaned copy of the churn dataset; the input frame is not modified.

    Steps applied to the copy:
        1. Column names are lowercased and spaces are replaced by underscores.
        2. The 'customerid' column is dropped when present.
        3. Values of string columns are lowercased and spaces are replaced
           by underscores.
        4. 'tenure', 'monthlycharges' and 'totalcharges' are converted to
           numeric; values that cannot be parsed become 0.
        5. 'churn', when present and not already numeric, is encoded as
           1 for 'yes' and 0 for anything else.

    The function is safe to call on its own output: an already-encoded
    numeric 'churn' column is left untouched.

    dataset contain this columns:
        'gender':               ['Female' 'Male']
        'SeniorCitizen':        [0 1]
        'Partner':              ['Yes' 'No']
        'Dependents':           ['No' 'Yes']
        'tenure':               int
        'PhoneService':         ['No' 'Yes']
        'MultipleLines':        ['No phone service' 'No' 'Yes']
        'InternetService':      ['DSL' 'Fiber optic' 'No']
        'OnlineSecurity':       ['No' 'Yes' 'No internet service']
        'OnlineBackup':         ['Yes' 'No' 'No internet service']
        'DeviceProtection':     ['No' 'Yes' 'No internet service']
        'TechSupport':          ['No' 'Yes' 'No internet service']
        'StreamingTV':          ['No' 'Yes' 'No internet service']
        'StreamingMovies':      ['No' 'Yes' 'No internet service']
        'Contract':             ['Month-to-month' 'One year' 'Two year']
        'PaperlessBilling':     ['Yes' 'No']
        'PaymentMethod':        ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
        'MonthlyCharges':       float
        'TotalCharges':         float
        'Churn':                ['No' 'Yes']   // Not necessary

    Raises:
        ValueError: if the tenure, MonthlyCharges or TotalCharges column is
            missing (ValueError is a subclass of Exception, so callers that
            catch Exception keep working).
    '''
    # (normalized column name, name used in the error message), checked in this order.
    required_numeric = (
        ('tenure', 'Tenure'),
        ('monthlycharges', 'MonthlyCharges'),
        ('totalcharges', 'TotalCharges'),
    )

    df = dataset.copy()

    # Normalize column names first so every later lookup can use the lowercase form.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

    # Drop the identifier before touching string values: no point normalizing
    # a column that is about to be discarded.
    if 'customerid' in df.columns:
        df = df.drop(columns='customerid')

    # Normalize string values. Besides object columns, also accept pandas'
    # dedicated string dtype so the labels are normalized either way.
    for col in list(df.columns):
        values = df[col]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            df[col] = values.str.lower().str.replace(' ', '_', regex=False)

    # Force the numeric columns to a numeric dtype; unparseable entries
    # (e.g. blank strings) are coerced to NaN and then replaced with 0.
    for col, display_name in required_numeric:
        if col not in df.columns:
            raise ValueError("!!! {0} column didn't exist !!!".format(display_name))
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Encode the target only while it still holds labels. Comparing an
    # already-encoded 0/1 column against 'yes' would silently zero it out.
    if 'churn' in df.columns and not pd.api.types.is_numeric_dtype(df['churn']):
        df['churn'] = (df['churn'] == 'yes').astype(int)

    return df

Now we are ready to go to the next step...