# Exercises

## Do these exercises in a notebook called `modeling.ipynb` first, then transfer the final functions to the model.py file.

## This work should all be saved in your local `classification-exercises` repo. Add, commit, and push your changes.

In [1]:
import pandas as pd
import numpy as np

import acquire
import prepare

### <ins>**Using the Titanic dataset**</ins>

**1. Use the function defined in `acquire.py` to load the Titanic data.**

**2. Use the function defined in `prepare.py` to prepare the titanic data.**

In [2]:
titanic_train,\
titanic_validate,\
titanic_test = prepare.split_data(
    prepare.prep_titanic(
    acquire.get_titanic_data()),'survived')

File exists - reading CSV file


In [3]:
titanic_train.shape, titanic_validate.shape, titanic_test.shape

((534, 8), (178, 8), (179, 8))

In [4]:
titanic_train

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
580,1,2,female,1,1,30.0000,Southampton,0
140,0,3,female,0,2,15.2458,Cherbourg,0
747,1,2,female,0,0,13.0000,Southampton,1
615,1,2,female,1,2,65.0000,Southampton,0
132,0,3,female,1,0,14.5000,Southampton,0
...,...,...,...,...,...,...,...,...
461,0,3,male,0,0,8.0500,Southampton,1
344,0,2,male,0,0,13.0000,Southampton,1
513,1,1,female,1,0,59.4000,Cherbourg,0
467,0,1,male,0,0,26.5500,Southampton,1


**3. Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.**

In [5]:
titanic_train.pclass = titanic_train.pclass.astype(int)

In [6]:
titanic_train_encoded_cats = pd.get_dummies(titanic_train[['embark_town', 'sex']],
              drop_first=True).astype(int)

In [7]:
titanic_preprocessed = pd.concat(
    [titanic_train,
    titanic_train_encoded_cats],
    axis=1).drop(columns=['sex', 'embark_town'])

In [10]:
titanic_preprocessed

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
580,1,2,1,1,30.0000,0,0,1,0
140,0,3,0,2,15.2458,0,0,0,0
747,1,2,0,0,13.0000,1,0,1,0
615,1,2,1,2,65.0000,0,0,1,0
132,0,3,1,0,14.5000,0,0,1,0
...,...,...,...,...,...,...,...,...,...
461,0,3,0,0,8.0500,1,0,1,1
344,0,2,0,0,13.0000,1,0,1,1
513,1,1,1,0,59.4000,0,0,0,0
467,0,1,0,0,26.5500,1,0,1,1


**4. Create a function named `preprocess_titanic` that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.**

In [11]:
def preprocess_titanic(train_df, validate_df, test_df):
    encoded_df = []
    for df in [train_df, validate_df, test_df]:
        df.pclass = df.pclass.astype(int)
        df_encoded_columns = pd.get_dummies(df[['embark_town', 'sex']],
              drop_first=True).astype(int)
        encoded_df.append(pd.concat([df, df_encoded_columns],
                                    axis=1).drop(columns=['sex', 'embark_town']))
    return encoded_df

In [12]:
titanic_train_prepocessed, \
titanic_validate_preprocessed, \
titanic_test_preprocessed = preprocess_titanic(titanic_train,titanic_validate,titanic_test)

In [13]:
titanic_train_prepocessed

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
580,1,2,1,1,30.0000,0,0,1,0
140,0,3,0,2,15.2458,0,0,0,0
747,1,2,0,0,13.0000,1,0,1,0
615,1,2,1,2,65.0000,0,0,1,0
132,0,3,1,0,14.5000,0,0,1,0
...,...,...,...,...,...,...,...,...,...
461,0,3,0,0,8.0500,1,0,1,1
344,0,2,0,0,13.0000,1,0,1,1
513,1,1,1,0,59.4000,0,0,0,0
467,0,1,0,0,26.5500,1,0,1,1


### <ins>**Using the Telco dataset**</ins>

**1. Use the function defined in acquire.py to load the Telco data.**

**2. Use the function defined in prepare.py to prepare the Telco data.**

In [14]:
telco_train,\
telco_validate,\
telco_test = prepare.split_data(
    prepare.prep_telco(
        acquire.get_telco_data()),'churn')

File exists - reading CSV file


In [15]:
telco_train.shape, telco_validate.shape, telco_test.shape

((4225, 20), (1409, 20), (1409, 20))

In [16]:
telco_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 7169-YWAMK to 5480-HPRRX
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 4225 non-null   object 
 1   senior_citizen         4225 non-null   int64  
 2   partner                4225 non-null   object 
 3   dependents             4225 non-null   object 
 4   tenure                 4225 non-null   int64  
 5   phone_service          4225 non-null   object 
 6   multiple_lines         4225 non-null   object 
 7   online_security        4225 non-null   object 
 8   online_backup          4225 non-null   object 
 9   device_protection      4225 non-null   object 
 10  tech_support           4225 non-null   object 
 11  streaming_tv           4225 non-null   object 
 12  streaming_movies       4225 non-null   object 
 13  paperless_billing      4225 non-null   object 
 14  monthly_charges        4225 non-null   float64

**3. Encode the categorical columns on train.**

> **a. Encode at least one column using .replace**

> **b. Encode at least one column using .map**

> **c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.**

In [27]:
def preprocess_telco(train, validate, test):
    encoded_df = []
    
    for df in [train, validate, test]:
        yes_no_columns = ['multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']

        for col in yes_no_columns:
            df[col] = np.where(df[col] == 'Yes', 'Yes', 'No')
            
        encoded_columns = df.columns[df.nunique() < 10].tolist()
        
        df_encoded_columns = pd.get_dummies(df[encoded_columns], drop_first=True).astype(int)
        df = pd.concat([df, df_encoded_columns], axis=1).drop(columns=encoded_columns)
        encoded_df.append(df)
    
    return encoded_df

In [28]:
telco_train_encoded,\
telco_validate_encoded,\
telco_test_encoded = preprocess_telco(telco_train,telco_validate,telco_test)

In [29]:
telco_test_encoded

Unnamed: 0_level_0,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_Yes,online_security_Yes,online_backup_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_No internet service,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4706-AXVKM,11,84.80,906.85,0,0,0,1,1,0,0,...,1,1,1,0,0,1,0,1,0,0
9693-XMUOB,59,50.25,2997.45,1,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,1,0
8946-BFWSG,63,25.25,1573.05,1,1,1,1,1,0,0,...,0,0,0,0,1,0,1,0,0,1
9402-CXWPL,70,98.90,6838.60,0,0,0,1,1,0,1,...,1,0,0,1,0,1,0,0,1,0
6646-JPPHA,14,78.85,1043.80,0,0,0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5879-SESNB,11,75.25,888.65,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,1,0
5351-QESIO,1,24.20,24.20,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2410-CIYFZ,2,20.40,42.90,1,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3247-ZVOUO,10,85.55,851.75,1,1,0,1,1,0,0,...,1,0,1,0,0,1,0,0,1,0


**4. Repeat the same steps on validate and test.**

In [30]:
telco_validate_encoded

Unnamed: 0_level_0,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_Yes,online_security_Yes,online_backup_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_No internet service,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6060-DRTNL,5,84.85,415.55,0,0,0,1,0,0,0,...,0,1,1,0,0,1,0,0,0,1
7435-ZNUYY,6,20.60,116.60,1,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,1
9675-ICXCT,72,80.80,5728.55,1,1,1,1,0,1,0,...,1,1,0,0,1,0,0,0,1,0
8634-MPHTR,47,100.05,4871.05,1,1,0,1,1,0,0,...,1,1,1,0,0,1,0,0,1,0
7314-OXENN,2,82.00,184.65,1,0,0,1,1,0,1,...,0,1,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0742-NXBGR,1,82.30,82.30,0,0,0,1,0,0,1,...,0,1,1,0,0,1,0,0,1,0
6918-UMQCG,5,80.20,384.25,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5498-TXHLF,34,87.45,2874.15,0,1,1,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0
3338-CVVEH,12,94.55,1173.55,1,0,0,1,1,0,0,...,1,0,0,0,0,1,0,0,1,0


In [31]:
telco_test_encoded

Unnamed: 0_level_0,tenure,monthly_charges,total_charges,gender_Male,partner_Yes,dependents_Yes,phone_service_Yes,multiple_lines_Yes,online_security_Yes,online_backup_Yes,...,streaming_movies_Yes,paperless_billing_Yes,churn_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_No internet service,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4706-AXVKM,11,84.80,906.85,0,0,0,1,1,0,0,...,1,1,1,0,0,1,0,1,0,0
9693-XMUOB,59,50.25,2997.45,1,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,1,0
8946-BFWSG,63,25.25,1573.05,1,1,1,1,1,0,0,...,0,0,0,0,1,0,1,0,0,1
9402-CXWPL,70,98.90,6838.60,0,0,0,1,1,0,1,...,1,0,0,1,0,1,0,0,1,0
6646-JPPHA,14,78.85,1043.80,0,0,0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5879-SESNB,11,75.25,888.65,0,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,1,0
5351-QESIO,1,24.20,24.20,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2410-CIYFZ,2,20.40,42.90,1,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3247-ZVOUO,10,85.55,851.75,1,1,0,1,1,0,0,...,1,0,1,0,0,1,0,0,1,0


**5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.**

In [32]:
def preprocess_telco(train, validate, test):
    encoded_df = []
    
    for df in [train, validate, test]:
        yes_no_columns = ['multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']

        for col in yes_no_columns:
            df[col] = np.where(df[col] == 'Yes', 'Yes', 'No')
            
        encoded_columns = df.columns[df.nunique() < 10].tolist()
        
        df_encoded_columns = pd.get_dummies(df[encoded_columns], drop_first=True).astype(int)
        df = pd.concat([df, df_encoded_columns], axis=1).drop(columns=encoded_columns)
        encoded_df.append(df)
    
    return encoded_df