In [1]:
import pandas as pd
import numpy as np
from pydataset import data
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from env import get_db_url
import os
import acquire
from sklearn.model_selection import train_test_split

# Preparation

## The end product of this exercise should be the specified functions in a python script named prepare.py. Do these in your classification_exercises.ipynb first, then transfer to the prepare.py file.

## This work should all be saved in your local classification-exercises repo. Then add, commit, and push your changes.

### Using the Iris Data:

>**1. Use the function defined in acquire.py to load the iris data.**

In [2]:
df_iris = acquire.get_iris_data()
df_iris.head()

File exists - reading CSV file


Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


>**2. Clean up the column names - replace the period with an underscore and lowercase.**

In [19]:
df_iris.columns = df_iris.columns.str.replace('.','_').str.lower()
df_iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


>**3. Drop the species_id and measurement_id columns.**

In [20]:
df_iris = df_iris.drop(columns = ['species_id', 'measurement_id'])
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


>**4. Rename the species_name column to just species.**

In [21]:
df_iris = df_iris.rename(columns={'species_name':'species'})
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


>**5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.**

In [13]:
def prep_iris(dataframe):
    dataframe.columns = dataframe.columns.str.replace('.','_').str.lower()
    df = dataframe.drop(columns = ['species_id', 'measurement_id'])
    df = df.rename(columns={'species_name':'species'})
    return df

In [90]:
df_iris = prep_iris(df_iris)

In [91]:
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Using the Titanic dataset

>**1. Use the function defined in acquire.py to load the Titanic data.**

In [69]:
df_titanic = acquire.get_titanic_data()

File exists - reading CSV file


>**2. Drop any unnecessary, unhelpful, or duplicated columns.**

In [66]:
df_titanic = df_titanic.drop(columns=['embarked','class','deck','age'])

>**3. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.**

In [70]:
def prep_titanic(dataframe):
    df = dataframe.drop(columns=['embarked','class','deck','age'])
    df.pclass = df.pclass.astype(object)
    df.embark_town = df.embark_town.fillna('Southampton')
    return df

In [71]:
df_titanic = prep_titanic(df_titanic)

In [72]:
df_titanic.head()

Unnamed: 0_level_0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


### Using the Telco dataset:

>**1. Use the function defined in acquire.py to load the Telco data.**

In [82]:
df_telco = acquire.get_telco_data()

File exists - reading CSV file


>**2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.**

In [76]:
df_telco = df_telco.drop(columns=['payment_type_id','internet_service_type_id','contract_type_id'])

>**3. Handle null values.** 

In [77]:
df_telco.isnull().sum()

customer_id                 0
gender                      0
senior_citizen              0
partner                     0
dependents                  0
tenure                      0
phone_service               0
multiple_lines              0
online_security             0
online_backup               0
device_protection           0
tech_support                0
streaming_tv                0
streaming_movies            0
paperless_billing           0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
internet_service_type    1526
payment_type                0
dtype: int64

In [78]:
df_telco.internet_service_type.value_counts(dropna=False)

internet_service_type
Fiber optic    3096
DSL            2421
NaN            1526
Name: count, dtype: int64

In [79]:
df_telco.internet_service_type = df_telco.internet_service_type.fillna('No internet service')

In [80]:
df_telco.internet_service_type.value_counts()

internet_service_type
Fiber optic            3096
DSL                    2421
No internet service    1526
Name: count, dtype: int64

In [81]:
df_telco.isnull().sum()

customer_id              0
gender                   0
senior_citizen           0
partner                  0
dependents               0
tenure                   0
phone_service            0
multiple_lines           0
online_security          0
online_backup            0
device_protection        0
tech_support             0
streaming_tv             0
streaming_movies         0
paperless_billing        0
monthly_charges          0
total_charges            0
churn                    0
contract_type            0
internet_service_type    0
payment_type             0
dtype: int64

>**4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.**

In [83]:
def prep_telco(dataframe):
    df = dataframe.drop(columns=['payment_type_id','internet_service_type_id','contract_type_id'])
    df.internet_service_type = df.internet_service_type.fillna('No internet service')
    return df

In [84]:
df_telco = prep_telco(df_telco)

In [85]:
df_telco

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.60,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.90,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.90,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.00,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.90,267.4,Yes,Month-to-month,Fiber optic,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,No,...,Yes,No,No,No,55.15,742.9,No,One year,DSL,Mailed check
7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,No,...,No,No,Yes,Yes,85.10,1873.7,Yes,Month-to-month,Fiber optic,Electronic check
7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,Yes,...,No,No,No,Yes,50.30,92.75,No,Month-to-month,DSL,Mailed check
7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,No,...,Yes,No,Yes,No,67.85,4627.65,No,Two year,DSL,Mailed check


### Split your data

>**1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.**

In [86]:
def split_data(dataframe):
    train, validate_test = train_test_split(dataframe, 
                                            train_size=.6, 
                                            random_state=913
                                           )
    validate, test = train_test_split(validate_test,
                                      test_size=0.50, 
                                      random_state=913
                                     )
    return train, validate, test

>**2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.**

In [87]:
df_iris = acquire.get_iris_data()

File exists - reading CSV file


In [88]:
df_iris = prep_iris(df_iris)

In [89]:
train_iris, validate_iris , test_iris = split_data(df_iris)

In [90]:
print(f'Prepared df: {df_iris.shape}')
print()
print(f'Train: {train_iris.shape}')
print(f'Validate: {validate_iris.shape}')
print(f'Test: {test_iris.shape}')

Prepared df: (150, 5)

Train: (90, 5)
Validate: (30, 5)
Test: (30, 5)


>**3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.**

In [91]:
df_titanic = acquire.get_titanic_data()

File exists - reading CSV file


In [92]:
df_titanic = prep_titanic(df_titanic)

In [93]:
train_titanic, validate_titanic, test_titanic = split_data(df_titanic)

In [94]:
print(f'Prepared df: {df_titanic.shape}')
print()
print(f'Train: {train_titanic.shape}')
print(f'Validate: {validate_titanic.shape}')
print(f'Test: {test_titanic.shape}')

Prepared df: (891, 8)

Train: (534, 8)
Validate: (178, 8)
Test: (179, 8)


>**4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.**

In [95]:
df_telco = acquire.get_telco_data()

File exists - reading CSV file


In [96]:
df_telco = prep_telco(df_telco)

In [97]:
train_telco, validate_telco, test_telco = split_data(df_telco)

In [98]:
print(f'Prepared df: {df_telco.shape}')
print()
print(f'Train: {train_telco.shape}')
print(f'Validate: {validate_telco.shape}')
print(f'Test: {test_telco.shape}')

Prepared df: (7043, 21)

Train: (4225, 21)
Validate: (1409, 21)
Test: (1409, 21)
