# Exercises

## Do these exercises in a notebook called `modeling.ipynb` first, then transfer the final functions to the `model.py` file.

## This work should all be saved in your local `classification-exercises` repo. Add, commit, and push your changes.

In [6]:
import pandas as pd
import numpy as np

import acquire
import prepare

### <ins>**Using the Titanic dataset**</ins>

**1. Use the function defined in `acquire.py` to load the Titanic data.**

**2. Use the function defined in `prepare.py` to prepare the Titanic data.**

In [10]:
# Acquire the Titanic data, run it through prep_titanic, then hand the result
# to split_data (with 'survived' as its second argument) and unpack the three
# frames it returns.
(titanic_train,
 titanic_validate,
 titanic_test) = prepare.split_data(
    prepare.prep_titanic(acquire.get_titanic_data()),
    'survived',
)

File does not exist - creating CSV file


In [11]:
titanic_train.shape, titanic_validate.shape, titanic_test.shape

((534, 8), (178, 8), (179, 8))

In [12]:
titanic_train

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
580,1,2,female,1,1,30.0000,Southampton,0
140,0,3,female,0,2,15.2458,Cherbourg,0
747,1,2,female,0,0,13.0000,Southampton,1
615,1,2,female,1,2,65.0000,Southampton,0
132,0,3,female,1,0,14.5000,Southampton,0
...,...,...,...,...,...,...,...,...
461,0,3,male,0,0,8.0500,Southampton,1
344,0,2,male,0,0,13.0000,Southampton,1
513,1,1,female,1,0,59.4000,Cherbourg,0
467,0,1,male,0,0,26.5500,Southampton,1


**3. Encode the categorical columns on the train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.**

In [None]:
# Add an `is_female` column by mapping `sex`: 'male' -> 0, 'female' -> 1.
# NOTE(review): `df_titanic` is not assigned in any cell visible in this
# notebook (the splits above are named `titanic_train`, `titanic_validate`,
# `titanic_test`) — confirm where it comes from.
df_titanic.loc[:,'is_female'] = df_titanic.sex.map({'male': 0, 'female': 1})

In [None]:
# One-hot encode `embark_town`, dropping the first dummy column, and store the
# remaining dummies (cast to int) under the two names on the left.
# NOTE(review): this assumes get_dummies leaves exactly two columns, in the
# order Queenstown, Southampton — verify against the actual category values.
df_titanic[['is_queenstown', 'is_southampton']] = pd.get_dummies(df_titanic.embark_town,drop_first=True).astype(int)

In [None]:
# Discard the original text columns; rebind the name to the frame without them.
df_titanic = df_titanic.drop(['sex', 'embark_town'], axis='columns')

In [None]:
df_titanic.head()

**4. Create a function named `preprocess_titanic` that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.**

In [13]:
def _encode_titanic_split(df):
    """Return an encoded copy of one Titanic split, leaving ``df`` untouched."""
    encoded = df.copy()
    encoded['pclass'] = encoded['pclass'].astype(int)
    encoded['is_female'] = encoded['sex'].map({'male': 0, 'female': 1})
    # Compare against the town names directly rather than relying on the
    # number and order of columns that pd.get_dummies(drop_first=True)
    # happens to produce: a split that lacks one of the towns still gets both
    # indicator columns (all zeros) instead of failing on a shape mismatch.
    for column, town in (('is_queenstown', 'Queenstown'),
                         ('is_southampton', 'Southampton')):
        encoded[column] = (encoded['embark_town'] == town).astype(int)
    return encoded.drop(columns=['sex', 'embark_town'])


def preprocess_titanic(train_df, validate_df, test_df):
    """Encode the categorical Titanic columns on all three splits.

    Each split gets the same treatment: ``pclass`` is cast to int, ``sex``
    becomes ``is_female`` (male -> 0, female -> 1), ``embark_town`` becomes
    the indicator columns ``is_queenstown`` and ``is_southampton`` (a row
    with neither set is any other town), and the original ``sex`` and
    ``embark_town`` columns are removed.

    Parameters
    ----------
    train_df, validate_df, test_df : pandas.DataFrame
        Splits that each contain ``pclass``, ``sex`` and ``embark_town``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` as new, encoded frames. The input frames
        are not modified.
    """
    return (
        _encode_titanic_split(train_df),
        _encode_titanic_split(validate_df),
        _encode_titanic_split(test_df),
    )

In [None]:
# Run the three splits through preprocess_titanic and rebind the names to the
# frames it returns.
# NOTE(review): the splits created earlier in this notebook are named
# `titanic_train`, `titanic_validate`, `titanic_test`; `train`, `validate` and
# `test` are not assigned in any visible cell before this one — confirm.
train,validate,test = preprocess_titanic(train,validate,test)

### <ins>**Using the Telco dataset**</ins>

**1. Use the function defined in acquire.py to load the Telco data.**

**2. Use the function defined in prepare.py to prepare the Telco data.**

In [6]:
# Acquire the Telco data, run it through prep_telco, then hand the result to
# split_data (with 'churn' as its second argument) and unpack the three frames
# it returns.
(telco_train,
 telco_validate,
 telco_test) = prepare.split_data(
    prepare.prep_telco(acquire.get_telco_data()),
    'churn',
)

File exists - reading CSV file


In [9]:
telco_train.shape, telco_validate.shape, telco_test.shape

((4225, 20), (1409, 20), (1409, 20))

**3. Encode the categorical columns on train.**

> **a. Encode at least one column using .replace**

> **b. Encode at least one column using .map**

> **c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.**

In [None]:
# Encode `gender` as a single int dummy column (first dummy dropped) and store
# it as `is_male`.
# NOTE(review): presumably the remaining dummy is the 'Male' category — confirm.
train['is_male'] = pd.get_dummies(train.gender,drop_first=True).astype(int)
# Remove the original text column.
train = train.drop(columns=['gender'])
# Move `is_male` to the first column position (pop it, then re-insert at 0).
train.insert(0, 'is_male', train.pop('is_male')) 

In [None]:
# Flag internet service: 0 where `online_security` reads
# 'No internet service', 1 for every other value.
train.loc[:,'internet_service'] = np.where(train['online_security'] == 'No internet service', 0, 1)
# Reposition the new flag at column index 7 (pop it, then re-insert).
train.insert(7, 'internet_service', train.pop('internet_service')) 

In [None]:
# Columns encoded with .map below: 'Yes' -> 1, 'No' -> 0.
yes_no_columns = ['partner', 'dependents', 'phone_service','paperless_billing','churn']

# Iterating a DataFrame yields its column labels, so `col` is each name above.
for col in train[yes_no_columns]:
    # Any value other than 'Yes'/'No' has no entry in the mapping.
    # NOTE(review): presumably these columns only hold 'Yes'/'No' — confirm,
    # since the int cast below depends on it.
    train.loc[:,col] = train[col].map({'Yes': 1,'No': 0})
    train[col] = train[col].astype(int)

In [None]:
# Columns collapsed to a binary flag below: 'Yes' -> 1, anything else -> 0.
multi_answer_columns = ['multiple_lines','online_security', 'online_backup',
       'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']

# Iterating a DataFrame yields its column labels, so `col` is each name above.
for col in train[multi_answer_columns]:
    # Every value that is not exactly 'Yes' becomes 0.
    train.loc[:,col] = np.where(train[col] == 'Yes', 1, 0)
    train[col] = train[col].astype(int)

In [None]:
# One-hot encode `contract_type`, dropping the first dummy column, and store
# the remaining dummies (cast to int) under the two names on the left.
# NOTE(review): assumes exactly two dummies remain, in one-year / two-year
# order — verify against the actual category values.
train[['one_year_contract','two_year_contract']] = pd.get_dummies(train.contract_type,drop_first=True).astype(int)
# Remove the original text column.
train = train.drop(columns=['contract_type'])

In [None]:
train.internet_service_type.value_counts()

In [None]:
# One-hot encode `internet_service_type`, keeping every dummy column here
# (no drop_first), and store them as ints under the three names on the left.
# NOTE(review): assumes get_dummies yields exactly three columns in the order
# DSL, fiber optic, no internet — verify against the value_counts output.
train[['dsl','fiber_optic','no_internet']] = pd.get_dummies(train.internet_service_type).astype(int)
# Drop the original column together with the `no_internet` dummy.
train = train.drop(columns=['internet_service_type','no_internet'])

In [None]:
# One-hot encode `payment_type`, dropping the first dummy column, and store
# the remaining dummies (cast to int) under the three names on the left.
# NOTE(review): assumes exactly three dummies remain, in credit card /
# electronic check / mailed check order — verify against the category values.
train[['pay_credit_card','pay_electronic_check','pay_mailed_check']] = pd.get_dummies(train.payment_type,drop_first=True).astype(int)
# Remove the original text column.
train = train.drop(columns=['payment_type'])

In [None]:
train.info()

**4. Repeat the same steps on validate and test.**

**5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.**

In [None]:
def prep_telco(train,validate,test):
    """Encode the Telco train, validate and test frames for modeling.

    Only the gender-encoding step is visible in this excerpt, and no
    ``return`` statement is visible yet.
    """
    dataframe_set = [train,validate,test]
    
    for df in dataframe_set:
        # Encode `gender` as a single int dummy column (first dummy dropped);
        # this assignment writes into the frame passed in by the caller.
        df['is_male'] = pd.get_dummies(df.gender,drop_first=True).astype(int)
        # NOTE(review): this rebinds the loop variable to the new frame that
        # drop returns; the frames held in `dataframe_set` still carry `gender`.
        df = df.drop(columns=['gender'])
        # NOTE(review): this pops `is_male` from `train` on every iteration
        # rather than from `df` — looks unintended. `df` also still has an
        # `is_male` column at this point; verify that `insert` accepts that.
        df.insert(0, 'is_male', train.pop('is_male')) 
        
        
        
        
        