> <h3>Import data</h3>

In [101]:
import pandas as pd
df = pd.read_csv('Churn_prediction.csv')

In [102]:
df.shape

(7043, 21)

-- because we have a lot of independent variable, I display the transpose  of data frame 

In [103]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [104]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

> <h3>Data preprocessing</h3>

-- Here we have some problems:
- We dont need customerID column           
- TotalCharges should be a numeric variable
- All the string values or column name should be lower case or replace whitespace with _
- Make dependent variable to number


In [105]:
del df['customerID']

In [106]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [107]:
df.columns = df.columns.str.lower().str.replace(' ','_')

string_columns = list(df.dtypes[df.dtypes=='O'].index)
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ','_')

In [108]:
df.churn = (df.churn == 'yes').astype(int)

specify categorical and numerical columns

In [109]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [110]:
df.head().T

Unnamed: 0,0,1,2,3,4
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no
onlinebackup,yes,no,yes,no,no


remove duplicate observations

In [111]:
print('We have {0} duplicate observation'.format(df.duplicated().sum()))
# drop them
df.drop_duplicates(inplace=True)
print('Now we have {0} duplicate observation'.format(df.duplicated().sum()))

We have 22 duplicate observation
Now we have 0 duplicate observation


Check missing values

In [112]:
df.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

> <h3>Finally we create a function for data preprocessing </h3>

In [113]:
def data_preprocessing(dataset):
    df = dataset.copy()
    columns = ['customerID','gender','SeniorCitizen',
                'Partner','Dependents','tenure','PhoneService',
                'MultipleLines','InternetService','OnlineSecurity',
                'OnlineBackup','DeviceProtection','TechSupport',
                'StreamingTV','StreamingMovies','Contract',
                'PaperlessBilling','PaymentMethod',
                'MonthlyCharges','TotalCharges','Churn']
    # create columns not exist
    notImported=list(set(columns) - set(df.columns))
    if len(notImported):
        print("{0}, column(s) don't(doesn't) exist".format(notImported))
        print('!!! please enter this column first !!!')

    else:
            
        # make TotalCharges numeric                  
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
        df['TotalCharges'] = df['TotalCharges'].fillna(0)
        # prepare string variables
        df.columns = df.columns.str.lower().str.replace(' ','_')
        string_columns = list(df.dtypes[df.dtypes=='O'].index)
        for col in string_columns:
            df[col] = df[col].str.lower().str.replace(' ','_')
        # make dependent variable numeric
        df.churn = (df.churn == 'yes').astype(int)
        # drop duplicates
        df.drop_duplicates(inplace=True)
    
        return df


In [114]:
df = pd.read_csv('Churn_prediction.csv')
# df.drop('Churn', axis=1,inplace=True)
data_preprocessing(df).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7033,7034,7035,7036,7037,7038,7039,7040,7041,7042
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu,9305-cdskc,1452-kiovk,6713-okomc,7892-pookp,6388-tabgu,...,9767-fflem,0639-tsiqw,8456-qdavc,7750-eyxwz,2569-wgero,6840-resvb,2234-xaduh,4801-jzazl,8361-ltmkd,3186-ajiek
gender,female,male,male,male,female,female,male,female,female,male,...,male,female,male,female,female,male,female,female,male,male
seniorcitizen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
partner,yes,no,no,no,no,no,no,no,yes,no,...,no,no,no,no,no,yes,yes,yes,yes,no
dependents,no,no,no,no,no,no,yes,no,no,yes,...,no,no,no,no,no,yes,yes,yes,no,no
tenure,1,34,2,45,2,8,22,10,28,62,...,38,67,19,12,72,24,72,11,4,66
phoneservice,no,yes,yes,no,yes,yes,yes,no,yes,yes,...,yes,yes,yes,no,yes,yes,yes,no,yes,yes
multiplelines,no_phone_service,no,no,no_phone_service,no,yes,yes,no_phone_service,yes,no,...,no,yes,no,no_phone_service,no,yes,yes,no_phone_service,yes,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic,fiber_optic,fiber_optic,dsl,fiber_optic,dsl,...,fiber_optic,fiber_optic,fiber_optic,dsl,no,dsl,fiber_optic,dsl,fiber_optic,fiber_optic
onlinesecurity,no,yes,yes,yes,no,no,no,yes,no,yes,...,no,yes,no,no,no_internet_service,yes,no,yes,no,yes
