# Exercises

## Do these exercises in a notebook called `modeling.ipynb` first, then transfer the final functions to the model.py file.

## This work should all be saved in your local `classification-exercises` repo. Add, commit, and push your changes.

In [101]:
import pandas as pd
import numpy as np

import acquire
import prepare

### <ins>**Using the Titanic dataset**</ins>

**1. Use the function defined in `acquire.py` to load the Titanic data.**

**2. Use the function defined in `prepare.py` to prepare the titanic data.**

In [2]:
titanic_train,\
titanic_validate,\
titanic_test = prepare.split_data(
    prepare.prep_titanic(
    acquire.get_titanic_data()),'survived')

File exists - reading CSV file


In [3]:
titanic_train.shape, titanic_validate.shape, titanic_test.shape

((534, 8), (178, 8), (179, 8))

In [4]:
titanic_train

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
580,1,2,female,1,1,30.0000,Southampton,0
140,0,3,female,0,2,15.2458,Cherbourg,0
747,1,2,female,0,0,13.0000,Southampton,1
615,1,2,female,1,2,65.0000,Southampton,0
132,0,3,female,1,0,14.5000,Southampton,0
...,...,...,...,...,...,...,...,...
461,0,3,male,0,0,8.0500,Southampton,1
344,0,2,male,0,0,13.0000,Southampton,1
513,1,1,female,1,0,59.4000,Cherbourg,0
467,0,1,male,0,0,26.5500,Southampton,1


**3. Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.**

In [None]:
titanic_train.pclass = titanic_train.pclass.astype(int)

In [None]:
titanic_train_encoded_cats = pd.get_dummies(titanic_train[['embark_town', 'sex']],
              drop_first=True).astype(int)

In [None]:
titanic_preprocessed = pd.concat(
    [titanic_train,
    titanic_train_encoded_cats],
    axis=1).drop(columns=['sex', 'embark_town'])

In [None]:
titanic_preprocessed

**4. Create a function named `preprocess_titanic` that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.**

In [5]:
def preprocess_titanic(train_df, validate_df, test_df):
    encoded_df = []
    for df in [train_df, validate_df, test_df]:
        df.pclass = df.pclass.astype(int)
        df_encoded_columns = pd.get_dummies(df[['embark_town', 'sex']],
              drop_first=True).astype(int)
        encoded_df.append(pd.concat([df, df_encoded_columns],
                                    axis=1).drop(columns=['sex', 'embark_town']))
    return encoded_df

In [6]:
titanic_train_prepocessed, \
titanic_validate_preprocessed, \
titanic_test_preprocessed = preprocess_titanic(titanic_train,titanic_validate,titanic_test)

In [8]:
titanic_train_prepocessed

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
580,1,2,1,1,30.0000,0,0,1,0
140,0,3,0,2,15.2458,0,0,0,0
747,1,2,0,0,13.0000,1,0,1,0
615,1,2,1,2,65.0000,0,0,1,0
132,0,3,1,0,14.5000,0,0,1,0
...,...,...,...,...,...,...,...,...,...
461,0,3,0,0,8.0500,1,0,1,1
344,0,2,0,0,13.0000,1,0,1,1
513,1,1,1,0,59.4000,0,0,0,0
467,0,1,0,0,26.5500,1,0,1,1


### <ins>**Using the Telco dataset**</ins>

**1. Use the function defined in acquire.py to load the Telco data.**

**2. Use the function defined in prepare.py to prepare the Telco data.**

In [135]:
telco_train,\
telco_validate,\
telco_test = prepare.split_data(
    prepare.prep_telco(
        acquire.get_telco_data()),'churn')

File exists - reading CSV file


In [136]:
telco_train.shape, telco_validate.shape, telco_test.shape

((4225, 20), (1409, 20), (1409, 20))

In [137]:
telco_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 7169-YWAMK to 5480-HPRRX
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 4225 non-null   object 
 1   senior_citizen         4225 non-null   int64  
 2   partner                4225 non-null   object 
 3   dependents             4225 non-null   object 
 4   tenure                 4225 non-null   int64  
 5   phone_service          4225 non-null   object 
 6   multiple_lines         4225 non-null   object 
 7   online_security        4225 non-null   object 
 8   online_backup          4225 non-null   object 
 9   device_protection      4225 non-null   object 
 10  tech_support           4225 non-null   object 
 11  streaming_tv           4225 non-null   object 
 12  streaming_movies       4225 non-null   object 
 13  paperless_billing      4225 non-null   object 
 14  monthly_charges        4225 non-null   float64

**3. Encode the categorical columns on train.**

> **a. Encode at least one column using .replace**

> **b. Encode at least one column using .map**

> **c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.**

In [127]:
for col in telco_train.columns:
    if telco_train[col].nunique() < 10:
        print(telco_train[col].unique().tolist().sort())
        telco_train[col].unique().tolist().sort()

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [117]:
yes_no_columns = []
yes_no_nointernet_columns = []

for col in telco_train:
    if telco_train[col].unique().tolist() == ['Yes','No']:
        yes_no_columns.append(col)
    elif telco_train[col].unique().tolist() == ['Yes', 'No', 'No internet service']:
        yes_no_nointernet_columns.append(col)
        
print(yes_no_columns)
print(yes_no_nointernet_columns)

['partner', 'dependents', 'phone_service']
['online_security', 'streaming_tv']


In [143]:
yes_no_columns = []

for col in telco_train[yes_no_columns]:
    telco_train.loc[:,col] = np.where(telco_train[col] == 'Yes', 1, 0)

In [144]:
yes_no_columns = ['partner', 'dependents', 'phone_service','paperless_billing','churn','multiple_lines','online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']

for col in telco_train[multi_answer_columns]:
    telco_train.loc[:,col] = np.where(telco_train[col] == 'Yes', 1, 0)

In [145]:
df_encoded_columns = pd.get_dummies(telco_train[['gender','contract_type','internet_service_type','payment_type']],drop_first=True).astype(int)
telco_train_preprocessed = pd.concat([telco_train, df_encoded_columns],
                                    axis=1).drop(columns=['contract_type','internet_service_type','payment_type'])

In [59]:
telco_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 7169-YWAMK to 5480-HPRRX
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 4225 non-null   object 
 1   senior_citizen         4225 non-null   int64  
 2   partner                4225 non-null   object 
 3   dependents             4225 non-null   object 
 4   tenure                 4225 non-null   int64  
 5   phone_service          4225 non-null   object 
 6   multiple_lines         4225 non-null   object 
 7   online_security        4225 non-null   object 
 8   online_backup          4225 non-null   object 
 9   device_protection      4225 non-null   object 
 10  tech_support           4225 non-null   object 
 11  streaming_tv           4225 non-null   object 
 12  streaming_movies       4225 non-null   object 
 13  paperless_billing      4225 non-null   object 
 14  monthly_charges        4225 non-null   float64

In [None]:
telco_train.info()

In [None]:
train.info()

**4. Repeat the same steps on validate and test.**

**5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.**

In [79]:
def preprocess_telco(train_df, validate_df, test_df):
    encoded_df = []
    for df in [train_df,validate_df,test_df]:
        df.loc[:,'internet_service'] = np.where(df['online_security'] == 'No internet service', 0, 1)
        df.insert(7, 'internet_service', df.pop('internet_service')) 
        
        yes_no_columns = ['partner', 'dependents', 'phone_service','paperless_billing','churn','multiple_lines','online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']

        for col in df[yes_no_columns]:
            df.loc[:,col] = np.where(df[col] == 'Yes', 1, 0)
        
        df_encoded_columns = pd.get_dummies(telco_train[['gender','contract_type','internet_service_type','payment_type']],drop_first=True).astype(int)
        encoded_df.append(pd.concat([df, df_encoded_columns],
                                             axis=1).drop(columns=['gender','contract_type','internet_service_type','payment_type']))
         
    return encoded_df

In [86]:
def preprocess_telco(train_df, val_df, test_df):

    encoding_vars = []
    for col in train_df.columns:
        if train_df[col].dtype == 'O':
            encoding_vars.append(col)

    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(
            df[encoding_vars],
              drop_first=True).astype(int)
        encoded_dfs.append(pd.concat(
            [df,
            df_encoded_cats],
            axis=1).drop(columns=encoding_vars))
    return encoded_dfs

In [87]:
telco_train_preprocessed,\
telco_validate_preprocessed,\
telco_test_preprocessed = preprocess_telco(telco_train,telco_validate,telco_test)

In [88]:
telco_train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 7169-YWAMK to 5480-HPRRX
Data columns (total 31 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   senior_citizen                             4225 non-null   int64  
 1   tenure                                     4225 non-null   int64  
 2   monthly_charges                            4225 non-null   float64
 3   total_charges                              4225 non-null   float64
 4   gender_Male                                4225 non-null   int64  
 5   partner_Yes                                4225 non-null   int64  
 6   dependents_Yes                             4225 non-null   int64  
 7   phone_service_Yes                          4225 non-null   int64  
 8   multiple_lines_No phone service            4225 non-null   int64  
 9   multiple_lines_Yes                         4225 non-null   int64  
 10  online_securit