Do these exercises in a notebook called modeling.ipynb first, then transfer the final functions to the model.py file.

## Using the Titanic dataset

1. Use the function defined in acquire.py to load the Titanic data.

2. Use the function defined in prepare.py to prepare the titanic data.

3. Encode the categorical columns on train dataset.
   
   Create dummy variables of the categorical columns and concatenate them onto the dataframe. 
   
   Remove the columns they are replacing.
   
   Repeat on validate and test.

4. Create a function named preprocess_titanic that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

In [1]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import our acquire module
import acquire
import prepare

from sklearn.model_selection import train_test_split
from scipy import stats

>1. Use the function defined in acquire.py to load the Titanic data.

In [2]:
df=acquire.get_titanic_data()
df.head(3)

this file exists, reading csv


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [3]:
df.shape

(891, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 97.5+ KB


> 2. Use the function defined in prepare.py to prepare the Titanic data.

In [5]:
# Drop unnecessary columns: 'passenger_id' is only an identifier, 'embarked'
# and 'class' repeat the information in 'embark_town' and 'pclass', and 'deck'
# is mostly null (203 of 891 non-null).
# NOTE(review): the exercise asks for the prepare.py function here; these
# manual steps presumably mirror prepare.prep_titanic — confirm.
df = df.drop(columns=['passenger_id','embarked','deck', 'class'])
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,Southampton,1


In [6]:
df.age.isnull().sum()

177

In [7]:
# Fraction of rows with a missing 'age'. Computed from the data rather than a
# hard-coded null count so the cell stays correct if the dataset changes.
float(df.age.isnull().mean())

0.19865319865319866

In [8]:
# Impute missing 'age' values with the column mean.
# Nearly 20 percent of the rows are being filled in, so be careful with any
# conclusion that leans on 'age'.
# NOTE(review): the mean is computed on the full dataset before the
# train/validate/test split, so validate/test rows influence the imputed value
# (data leakage). Consider computing the mean on train only — TODO confirm.

df['age']=df['age'].fillna(df.age.mean())


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embark_town  889 non-null    object 
 8   alone        891 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 69.6+ KB


In [10]:
df.embark_town.isnull().sum()

2

In [11]:
# Another way of checking for null values: dropna=False makes value_counts
# report NaN as its own row.
df.embark_town.value_counts(dropna=False)

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
NaN              2
Name: count, dtype: int64

In [12]:
# let's impute for 'embark_town'
df.embark_town.value_counts()

embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

In [13]:
df.embark_town.mode()

0    Southampton
Name: embark_town, dtype: object

In [14]:
df.embark_town.mode()[0]

'Southampton'

In [15]:
df['embark_town'] = df['embark_town'].fillna(df.embark_town.mode()[0])

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embark_town  891 non-null    object 
 8   alone        891 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 69.6+ KB


## split

In [17]:
train,validate,test=prepare.splitting_data(df,'survived')

In [18]:
print(f'shape of titanic DB: {df.shape}')
print('-----------------------------')
print(f'shape of train: {train.shape}')
print(f'shape of validate: {validate.shape}')
print(f'shape of test: {test.shape}')

shape of titanic DB: (891, 9)
-----------------------------
shape of train: (534, 9)
shape of validate: (178, 9)
shape of test: (179, 9)


## Encoding
> Q3) Encode the categorical columns on train dataset.

Create dummy variables of the categorical columns and concatenate them onto the dataframe.

Remove the columns they are replacing.

Repeat on validate and test.

In [19]:
# preprocessing from here on out:
# stuff we need to do for the modeling:

In [20]:
# two flavors of encoding:
# ordinal, or one-hot encoding:
# ordinal: numbers that are discrete and have an order: pclass: 1, 2, 3
# nominal: numbers that represent categories that do not have an order
# nominal is our use-case for one-hot encoding

In [21]:
train.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
776,0,3,male,29.699118,0,0,7.75,Queenstown,1
829,1,1,female,62.0,0,0,80.0,Southampton,1
215,1,1,female,31.0,1,0,113.275,Cherbourg,0


In [22]:


# One-hot encode the nominal categorical columns of the train split.
dummy_train = pd.get_dummies(train[['sex','embark_town']],dummy_na=False, drop_first=True).astype(int) 

# One-hot encoding is used because these columns are nominal (no order).
# drop_first=True drops the first level of each column, so a column with n
# categories produces n-1 dummy columns; the dropped level is implied when all
# of that column's dummies are 0. .astype(int) stores the dummies as integers.
dummy_train.head(3)

Unnamed: 0,sex_male,embark_town_Queenstown,embark_town_Southampton
776,1,1,0
829,0,0,1
215,0,0,0


In [23]:
# Concatenate the dummy_df dataframe above with the original train and verify.

train = pd.concat([train, dummy_train], axis=1)
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,male,29.699118,0,0,7.75,Queenstown,1,1,1,0
829,1,1,female,62.0,0,0,80.0,Southampton,1,0,0,1
215,1,1,female,31.0,1,0,113.275,Cherbourg,0,0,0,0
258,1,1,female,35.0,0,0,512.3292,Cherbourg,1,0,0,0
129,0,3,male,45.0,0,0,6.975,Southampton,1,1,0,1


In [24]:
# Drop string values that have been replaced with encoded values.

train = train.drop(columns=['sex', 'embark_town'])
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,29.699118,0,0,7.75,1,1,1,0
829,1,1,62.0,0,0,80.0,1,0,0,1
215,1,1,31.0,1,0,113.275,0,0,0,0
258,1,1,35.0,0,0,512.3292,1,0,0,0
129,0,3,45.0,0,0,6.975,1,1,0,1


In [25]:
# Repeat on validate and test.

# Encode the categorical columns of the validate and test dataframes.
# drop_first expects a single bool; a list such as [True, True] is only
# accepted because a non-empty list happens to be truthy.
dummy_validate = pd.get_dummies(validate[['sex','embark_town']], dummy_na=False, drop_first=True).astype(int)
dummy_test = pd.get_dummies(test[['sex','embark_town']], dummy_na=False, drop_first=True).astype(int)

# Concatenate the dummy columns onto the original validate and test dataframes.
validate = pd.concat([validate, dummy_validate], axis=1)
test = pd.concat([test, dummy_test], axis=1)

# Drop the string columns that have been replaced with encoded values.
validate = validate.drop(columns=['sex', 'embark_town'])
test = test.drop(columns=['sex', 'embark_town'])

# NOTE(review): each split is encoded independently, so a category that is
# absent from one split would produce a different set of dummy columns there.
# Verify that train, validate and test end up with identical columns.


In [26]:
validate.head(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
541,0,3,9.0,4,2,31.275,0,0,0,1
204,1,3,18.0,0,0,8.05,1,1,0,1
108,0,3,38.0,0,0,7.8958,1,1,0,1


In [27]:
test.head(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
763,1,1,36.0,1,2,120.0,0,0,0,1
112,0,3,22.0,0,0,8.05,1,1,0,1
230,1,1,35.0,1,0,83.475,0,0,0,1


> 4. Create a function named preprocess_titanic that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

In [28]:


def preprocess_titanic(df):
    """
    One-hot encode the categorical columns of a Titanic dataframe.

    Dummy variables are created for 'sex' and 'embark_town' (the first level
    of each is dropped), concatenated onto the dataframe, and the two original
    string columns are removed. The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        A Titanic dataframe (for example the train, validate or test split)
        that contains the 'sex' and 'embark_town' columns.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with integer dummy columns in place of 'sex' and
        'embark_town'; all other columns and the index are unchanged.

    Raises
    ------
    KeyError
        If 'sex' or 'embark_town' is missing from ``df``.
    """
    categorical_cols = ['sex', 'embark_town']

    # drop_first takes a single bool that applies to every encoded column.
    dummy_df = pd.get_dummies(
        df[categorical_cols], dummy_na=False, drop_first=True
    ).astype(int)

    # Attach the dummies, then drop the string columns they replace.
    return pd.concat([df, dummy_df], axis=1).drop(columns=categorical_cols)



In [29]:
fresh_titanic=acquire.get_titanic_data()

this file exists, reading csv


In [30]:
fresh_titanic=prepare.prep_titanic(fresh_titanic)

In [31]:
# Split the freshly acquired and prepared dataframe (not the earlier `df`), so
# this check actually runs the acquire -> prepare -> split steps end to end.
fresh_train, fresh_validate, fresh_test = prepare.splitting_data(fresh_titanic, 'survived')

In [32]:
preprocess_titanic(fresh_train).head(3)

                                    

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,29.699118,0,0,7.75,1,1,1,0
829,1,1,62.0,0,0,80.0,1,0,0,1
215,1,1,31.0,1,0,113.275,0,0,0,0


In [33]:
preprocess_titanic(fresh_validate).head(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
541,0,3,9.0,4,2,31.275,0,0,0,1
204,1,3,18.0,0,0,8.05,1,1,0,1
108,0,3,38.0,0,0,7.8958,1,1,0,1


## Using the Telco dataset

1. Use the function defined in acquire.py to load the Telco data.

2. Use the function defined in prepare.py to prepare the Telco data.

3. Encode the categorical columns on train.

        a. Encode at least one column using .replace
        
        b. Encode at least one column using .map
        
        c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.

4. Repeat the same steps on validate and test.

5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.

In [34]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import our acquire module
import acquire
import prepare

from sklearn.model_selection import train_test_split
from scipy import stats

In [35]:
telco=acquire.get_telco_data()

this file exists, reading csv


In [36]:
telco.head(3)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check


In [37]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_support 

In [38]:
telco=prepare.prep_telco(telco)
telco.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check


In [39]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            7043 non-null   object 
 1   gender                 7043 non-null   object 
 2   senior_citizen         7043 non-null   int64  
 3   partner                7043 non-null   object 
 4   dependents             7043 non-null   object 
 5   tenure                 7043 non-null   int64  
 6   phone_service          7043 non-null   object 
 7   multiple_lines         7043 non-null   object 
 8   online_security        7043 non-null   object 
 9   online_backup          7043 non-null   object 
 10  device_protection      7043 non-null   object 
 11  tech_support           7043 non-null   object 
 12  streaming_tv           7043 non-null   object 
 13  streaming_movies       7043 non-null   object 
 14  paperless_billing      7043 non-null   object 
 15  monthly_c

## split

In [40]:
train_telco,validate_telco,test_telco=prepare.splitting_data(telco,'churn')

In [41]:
# Report the size of the full telco dataframe and of each split.
print(f'shape of telco DB: {telco.shape}')
print('-----------------------------')
print(f'shape of train_telco: {train_telco.shape}')
print(f'shape of validate_telco: {validate_telco.shape}')
print(f'shape of test_telco: {test_telco.shape}')

shape of titanic DB: (7043, 21)
-----------------------------
shape of train_telco: (4225, 21)
shape of validate_telco: (1409, 21)
shape of test_telco: (1409, 21)


## Encoding

> 3. Encode the categorical columns on train.

a. Encode at least one column using .replace

b. Encode at least one column using .map

c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.

In [42]:
# a.Encode at least one column using .replace

train_telco.loc[:, 'is_female'] = train_telco['gender'].replace({'Female': 1, 'Male': 0})



In [43]:
train_telco.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,is_female
2865,4083-BFNYK,Female,1,Yes,No,38,Yes,Yes,Yes,Yes,...,No,Yes,No,95.0,3591.25,No,One year,Fiber optic,Credit card (automatic),1
4107,5804-LEPIM,Female,1,No,No,2,Yes,No,No,No,...,No,No,Yes,70.35,139.05,Yes,Month-to-month,Fiber optic,Electronic check,1
3453,4895-TMWIR,Male,1,Yes,No,11,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No,19.95,214.75,Yes,Month-to-month,,Mailed check,0


In [44]:
# b.Encode at least one column using .map

train_telco.loc[:, 'has_partner'] = train_telco['partner'].map({'Yes': 1, 'No': 0})



In [45]:
train_telco.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,is_female,has_partner
2865,4083-BFNYK,Female,1,Yes,No,38,Yes,Yes,Yes,Yes,...,Yes,No,95.0,3591.25,No,One year,Fiber optic,Credit card (automatic),1,1
4107,5804-LEPIM,Female,1,No,No,2,Yes,No,No,No,...,No,Yes,70.35,139.05,Yes,Month-to-month,Fiber optic,Electronic check,1,0
3453,4895-TMWIR,Male,1,Yes,No,11,Yes,No,No internet service,No internet service,...,No internet service,No,19.95,214.75,Yes,Month-to-month,,Mailed check,0,1


In [46]:
train_telco.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 2865 to 5354
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            4225 non-null   object 
 1   gender                 4225 non-null   object 
 2   senior_citizen         4225 non-null   int64  
 3   partner                4225 non-null   object 
 4   dependents             4225 non-null   object 
 5   tenure                 4225 non-null   int64  
 6   phone_service          4225 non-null   object 
 7   multiple_lines         4225 non-null   object 
 8   online_security        4225 non-null   object 
 9   online_backup          4225 non-null   object 
 10  device_protection      4225 non-null   object 
 11  tech_support           4225 non-null   object 
 12  streaming_tv           4225 non-null   object 
 13  streaming_movies       4225 non-null   object 
 14  paperless_billing      4225 non-null   object 
 15  monthl

In [47]:
# c. Encode the rest of the columns by creating dummy variables and
#    concatenating them onto the dataframe.

encoded_cols=['dependents','phone_service','multiple_lines','online_security','online_backup','device_protection','tech_support','streaming_tv','streaming_movies','paperless_billing','churn','contract_type','internet_service_type','payment_type']

# One-hot encode the remaining categorical columns of the train split.
# drop_first expects a single bool; a list such as [True, True] is only
# accepted because a non-empty list happens to be truthy.
dummy_train = pd.get_dummies(train_telco[encoded_cols], dummy_na=False, drop_first=True).astype(int)
# Concatenate the dummy columns onto the train dataframe.
train_telco = pd.concat([train_telco,dummy_train], axis=1)
# Drop the string columns that have been replaced with encoded values.
# NOTE(review): 'gender' and 'partner' are not in encoded_cols, so their string
# versions stay in the dataframe next to 'is_female' and 'has_partner'.
train_telco = train_telco.drop(columns=encoded_cols)


    

In [48]:
train_telco.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,tenure,monthly_charges,total_charges,is_female,has_partner,dependents_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_NaN,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
2865,4083-BFNYK,Female,1,Yes,38,95.0,3591.25,1,1,0,...,1,0,0,1,0,1,0,1,0,0
4107,5804-LEPIM,Female,1,No,2,70.35,139.05,1,0,0,...,0,1,1,0,0,1,0,0,1,0
3453,4895-TMWIR,Male,1,Yes,11,19.95,214.75,0,1,0,...,0,0,1,0,0,0,1,0,0,1


> 5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.

In [49]:
def preprocess_telco(df):
    """
    Encode the categorical columns of a prepared telco dataframe.

    Three encodings are applied:
      * 'gender' is encoded into a new 'is_female' column with ``.replace``
        (Female -> 1, Male -> 0).
      * 'partner' is encoded into a new 'has_partner' column with ``.map``
        (Yes -> 1, No -> 0).
      * The remaining categorical columns are one-hot encoded (first level of
        each dropped), concatenated onto the dataframe, and the original
        string columns are removed.

    The input dataframe is not modified; a new dataframe is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        A telco dataframe (for example the train, validate or test split)
        containing 'gender', 'partner' and every column in ``encoded_cols``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with 'is_female', 'has_partner' and the integer dummy
        columns added. 'gender' and 'partner' themselves are kept, as are any
        other columns not listed in ``encoded_cols``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # Work on a copy so the caller's dataframe is not mutated (the splits are
    # reused by other cells and should stay as they were handed in).
    df = df.copy()

    # Encode by using replace(); values other than 'Female'/'Male' pass through.
    df['is_female'] = df['gender'].replace({'Female': 1, 'Male': 0})

    # Encode by using map(); values other than 'Yes'/'No' become NaN.
    df['has_partner'] = df['partner'].map({'Yes': 1, 'No': 0})

    # Encode by using get_dummies()
    encoded_cols=['dependents','phone_service','multiple_lines','online_security','online_backup','device_protection','tech_support','streaming_tv','streaming_movies','paperless_billing','churn','contract_type','internet_service_type','payment_type']

    # drop_first takes a single bool that applies to every encoded column.
    dummy_df = pd.get_dummies(df[encoded_cols], dummy_na=False, drop_first=True).astype(int)

    # Attach the dummies, then drop the string columns they replace.
    df = pd.concat([df, dummy_df], axis=1)
    return df.drop(columns=encoded_cols)


In [50]:


'''

def preprocess_telco(train_df, val_df, test_df):
    
    preprocess_telco will take in three pandas dataframes
    of our telco data, expected as cleaned versions of this 
    telco data set (see documentation on acquire.py and prepare.py)
    
    output:
    encoded, ML(machine learning)-ready versions of our clean data, with 
    columns sex and embark_town encoded in the one-hot fashion
    return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
    
    # with a looping structure:
    # go through the three dfs, set the index to customer id
    for df in [train_df, val_df, test_df]:
        df = df.set_index('customer_id')
        
    # initialize an empty list to see what needs to be encoded:
    encoding_vars = []
    # loop through the columns to fill encoded_vars with appropriate
    # datatype field names
    for col in train_df.columns:
        if train_df[col].dtype == 'O':
            encoding_vars.append(col)
            encoding_vars.remove('customer_id')
    # initialize an empty list to hold our encoded dataframes:
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(df[encoding_vars],drop_first=True).astype(int)
        encoded_dfs.append(pd.concat([df,df_encoded_cats],axis=1).drop(columns=encoding_vars))
    
    
    return encoded_dfs





'''







"\n\ndef preprocess_telco(train_df, val_df, test_df):\n    \n    preprocess_telco will take in three pandas dataframes\n    of our telco data, expected as cleaned versions of this \n    telco data set (see documentation on acquire.py and prepare.py)\n    \n    output:\n    encoded, ML(machine learning)-ready versions of our clean data, with \n    columns sex and embark_town encoded in the one-hot fashion\n    return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)\n    \n    # with a looping structure:\n    # go through the three dfs, set the index to customer id\n    for df in [train_df, val_df, test_df]:\n        df = df.set_index('customer_id')\n        \n    # initialize an empty list to see what needs to be encoded:\n    encoding_vars = []\n    # loop through the columns to fill encoded_vars with appropriate\n    # datatype field names\n    for col in train_df.columns:\n        if train_df[col].dtype == 'O':\n            encoding_vars.append(col)\n            encoding_vars.remove('cust

In [51]:
fresh_telco=acquire.get_telco_data()

this file exists, reading csv


In [52]:
fresh_telco=prepare.prep_telco(fresh_telco)

In [53]:
fr_train_telco,fr_validate_telco,fr_test_telco=prepare.splitting_data(fresh_telco,'churn')

In [54]:
tlco_train_encoded = preprocess_telco(fr_train_telco)
tlco_train_encoded.head(3)

Unnamed: 0,customer_id,gender,senior_citizen,partner,tenure,monthly_charges,total_charges,is_female,has_partner,dependents_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_NaN,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
2865,4083-BFNYK,Female,1,Yes,38,95.0,3591.25,1,1,0,...,1,0,0,1,0,1,0,1,0,0
4107,5804-LEPIM,Female,1,No,2,70.35,139.05,1,0,0,...,0,1,1,0,0,1,0,0,1,0
3453,4895-TMWIR,Male,1,Yes,11,19.95,214.75,0,1,0,...,0,0,1,0,0,0,1,0,0,1


In [55]:
import model

In [57]:
model.preprocess_telco(fr_validate_telco)

Unnamed: 0,customer_id,gender,senior_citizen,partner,tenure,monthly_charges,total_charges,is_female,has_partner,dependents_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_NaN,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
328,0486-LGCCH,Male,0,Yes,11,19.65,225.75,0,1,1,...,0,0,0,0,1,0,1,0,0,1
5983,8436-BJUMM,Male,0,Yes,26,83.75,2070.60,0,1,1,...,0,1,1,0,0,1,0,0,1,0
5551,7816-VGHTO,Female,0,Yes,6,40.55,217.50,1,1,1,...,0,0,0,0,1,0,0,0,0,1
6614,9398-MMQTO,Male,0,No,3,74.45,221.10,0,0,0,...,0,0,0,0,0,1,0,1,0,0
6283,8885-QSQBX,Female,0,No,1,49.55,49.55,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6894-LFHLY,Male,1,No,1,75.75,75.75,0,0,0,...,0,1,1,0,0,1,0,0,1,0
10,0017-DINOC,Male,0,No,54,45.20,2460.55,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4861,6851-WEFYX,Male,1,Yes,35,100.80,3437.50,0,1,0,...,1,1,0,0,0,1,0,0,1,0
2452,3511-BFTJW,Male,0,Yes,72,38.50,2763.00,0,1,1,...,0,0,0,0,1,0,0,1,0,0
