In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pydataset import data

# train test split from sklearn
from sklearn.model_selection import train_test_split
# imputer from sklearn
from sklearn.impute import SimpleImputer

# filter out warnings
import warnings
warnings.filterwarnings('ignore')

# our own acquire script:
import acquire 

# Acquisition Lesson

4. In a jupyter notebook, classification_exercises.ipynb, use a python module (pydata or seaborn datasets) containing datasets as a source from the iris data. Create a pandas dataframe, df_iris, from this data.

In [2]:
data('iris', show_doc=True)

iris

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Edgar Anderson's Iris Data

### Description

This famous (Fisher's or Anderson's) iris data set gives the measurements in
centimeters of the variables sepal length and width and petal length and
width, respectively, for 50 flowers from each of 3 species of iris. The
species are _Iris setosa_, _versicolor_, and _virginica_.

### Usage

    iris
    iris3

### Format

`iris` is a data frame with 150 cases (rows) and 5 variables (columns) named
`Sepal.Length`, `Sepal.Width`, `Petal.Length`, `Petal.Width`, and `Species`.

`iris3` gives the same data arranged as a 3-dimensional array of size 50 by 4
by 3, as represented by S-PLUS. The first dimension gives the case number
within the species subsample, the second the measurements with names `Sepal
L.`, `Sepal W.`, `Petal L.`, and `Petal W.`, and the third the species.

### Source

Fisher, R. A. (1936) The use of multiple measurements in taxonomi

In [3]:
df_iris = data('iris')


In [4]:
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [5]:
df_iris.shape

(150, 5)

In [6]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [7]:
df_iris.dtypes

Sepal.Length    float64
Sepal.Width     float64
Petal.Length    float64
Petal.Width     float64
Species          object
dtype: object

In [8]:
df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


5. Read the data from this google sheet into a dataframe, df_google.

In [9]:

sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'    

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

df_google = pd.read_csv(csv_export_url)

df_google.head(3)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [10]:
df_google.shape

(891, 12)

In [11]:
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [12]:
df_google.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [13]:
df_google.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [14]:
for col in df_google.columns:
#     print(col)
    if df_google[col].dtypes == 'object':
        print(f'{col} has {df_google[col].nunique()} unique values.')

Name has 891 unique values.
Sex has 2 unique values.
Ticket has 681 unique values.
Cabin has 147 unique values.
Embarked has 3 unique values.


In [15]:
df_google.Survived.value_counts(dropna=False)

0    549
1    342
Name: Survived, dtype: int64

In [16]:
df_google.Pclass.value_counts(dropna=False)

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [17]:
df_google.Sex.value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

In [18]:
df_google.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

6. Download the previous exercise's file into an excel (File → Download → Microsoft Excel). Read the downloaded file into a dataframe named df_excel.

In [19]:
df_excel = pd.read_excel('train.xlsx', sheet_name='train')

In [20]:
df_excel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    float64
 7   Parch        891 non-null    float64
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(7), object(5)
memory usage: 83.7+ KB


In [21]:
df_excel_sample = df_excel.head(100)

In [22]:
df_excel_sample.shape

(100, 12)

In [23]:
df_excel.columns[:5]

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex'], dtype='object')

In [24]:
df_excel.select_dtypes(include='object').head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803.0,C123,S
4,"Allen, Mr. William Henry",male,373450.0,,S


In [25]:
df_excel.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S


In [26]:
titanic_stats = df_excel[['Age', 'Fare']].describe().T
titanic_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


# Prepare Lesson Exercises

Exercises
The end product of this exercise should be the specified functions in a python script named prepare.py. Do these in your classification_exercises.ipynb first, then transfer to the prepare.py file.

This work should all be saved in your local classification-exercises repo. Then add, commit, and push your changes.

## Using the Iris Data:
Use the function defined in acquire.py to load the iris data.



In [64]:
iris = acquire.new_iris_data()

In [65]:
iris.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


Drop the species_id and measurement_id columns.

In [66]:
columns_to_drop = ['species_id']

In [67]:
iris = iris.drop(columns = columns_to_drop)

In [68]:
iris.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


Rename the species_name column to just species.

In [69]:
iris = iris.rename(columns={'species_name': 'species'})

In [70]:
iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [71]:
# Encoding steps
# 1. Make a dataframe out of "dummy" columns
# 2. Concatenate our dummy dataframe to our original dataframe

dummy_iris = pd.get_dummies(iris[['species']], dummy_na = False)

In [72]:
dummy_iris.head()

Unnamed: 0,species_setosa,species_versicolor,species_virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [73]:
iris = pd.concat([iris, dummy_iris], axis=1)
iris

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,setosa,5.1,3.5,1.4,0.2,1,0,0
1,setosa,4.9,3.0,1.4,0.2,1,0,0
2,setosa,4.7,3.2,1.3,0.2,1,0,0
3,setosa,4.6,3.1,1.5,0.2,1,0,0
4,setosa,5.0,3.6,1.4,0.2,1,0,0
5,setosa,5.4,3.9,1.7,0.4,1,0,0
6,setosa,4.6,3.4,1.4,0.3,1,0,0
7,setosa,5.0,3.4,1.5,0.2,1,0,0
8,setosa,4.4,2.9,1.4,0.2,1,0,0
9,setosa,4.9,3.1,1.5,0.1,1,0,0


Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [74]:
def prep_iris(df):
    # drop column
    columns_to_drop = ['species_id']
    iris = iris.drop(columns = columns_to_drop)
    # rename column
    iris = iris.rename(columns={'species_name': 'species'})
    #encode species
    dummy_iris = pd.get_dummies(iris[['species']], dummy_na = False)
    iris = pd.concat([iris, dummy_iris], axis=1)
    return iris

## Using the Titanic dataset

Use the function defined in acquire.py to load the Titanic data.

In [75]:
titanic = acquire.get_titanic_data()

In [76]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


Drop any unnecessary, unhelpful, or duplicated columns

In [77]:
columns_to_drop = ['embarked', 'pclass', 'passenger_id', 'deck']

In [78]:
titanic = titanic.drop(columns = columns_to_drop) 

In [79]:
titanic.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,Southampton,0
1,1,female,38.0,1,0,71.2833,First,Cherbourg,0
2,1,female,26.0,0,0,7.925,Third,Southampton,1
3,1,female,35.0,1,0,53.1,First,Southampton,0
4,0,male,35.0,0,0,8.05,Third,Southampton,1


Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [80]:
# Encoding steps
# 1. Make a dataframe out of "dummy" columns
# 2. Concatenate our dummy dataframe to our original dataframe

dummy_titanic = pd.get_dummies(titanic[['sex', 'class', 'embark_town']], dummy_na=False, drop_first=[True, True])

In [81]:
dummy_titanic

Unnamed: 0,sex_male,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,1,0,1,0,1
1,0,0,0,0,0
2,0,0,1,0,1
3,0,0,0,0,1
4,1,0,1,0,1
...,...,...,...,...,...
886,1,1,0,0,1
887,0,0,0,0,1
888,0,0,1,0,1
889,1,0,0,0,0


Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [82]:
def prep_titanic(df):
    # drop column
    columns_to_drop = ['embarked', 'pclass', 'passenger_id', 'deck'] 
    titanic = titanic.drop(columns = columns_to_drop)
    # encode
    dummy_titanic = pd.get_dummies(titanic[['sex', 'class', 'embark_town']], dummy_na=False, drop_first=[True, True])
    titanic = pd.concat([titanic, dummy_titanic], axis=1)
    return titanic
    

In [98]:
prep_titanic(df)

UnboundLocalError: local variable 'titanic' referenced before assignment

## Using the Telco dataset

Use the function defined in acquire.py to load the Telco data.


In [83]:
telco = acquire.get_telco_data()


In [84]:
telco.shape

(7043, 24)

In [85]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [86]:
telco = telco.drop_duplicates()

In [87]:
telco.shape # no dups

(7043, 24)

In [88]:
columns_to_drop = ['payment_type_id', 'internet_service_type_id', 'contract_type_id']

In [89]:
telco = telco.drop(columns = columns_to_drop) 

In [90]:
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [91]:
#Encoding: Turning Categorical Values into Boolean Values (0,1)
#We have two options: simple encoding or one-hot encoding

# Encoding steps
# 1. Make a dataframe out of "dummy" columns
# 2. Concatenate our dummy dataframe to our original dataframe
telco.gender.map({'Female':1, 'Male':0})
telco['phone_service_']
dummy_telco = pd.get_dummies(telco[['gender', 'partner', 'dependents', 'phone_service', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn']], dummy_na=False, drop_first=[True, True])

In [92]:
dummy_telco

Unnamed: 0,partner_Yes,dependents_Yes,phone_service_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,paperless_billing_Yes,churn_Yes
0,1,1,1,0,1,0,1,0,0,1,0
1,0,0,1,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,1,1
3,1,0,1,0,0,0,1,0,1,1,1
4,1,0,1,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,0,1,0,0,0,0,0,0
7039,1,0,1,0,0,0,0,0,1,1,1
7040,0,0,1,0,0,0,0,0,0,1,0
7041,1,1,1,0,1,0,0,0,1,0,0


In [93]:
# Concatenate my dummy_df to my data

telco = pd.concat([telco, dummy_telco], axis=1)
telco

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,dependents_Yes,phone_service_Yes,tech_support_No internet service,tech_support_Yes,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,paperless_billing_Yes,churn_Yes
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,1,1,0,1,0,1,0,0,1,0
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,0,1,0,0,0,0,0,1,0,0
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,0,1,0,0,0,0,0,0,1,1
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,0,1,0,0,0,1,0,1,1,1
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,0,1,0,1,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,0,No,No,13,Yes,No,Yes,No,...,0,1,0,1,0,0,0,0,0,0
7039,9992-RRAMN,Male,0,Yes,No,22,Yes,Yes,No,No,...,0,1,0,0,0,0,0,1,1,1
7040,9992-UJOEL,Male,0,No,No,2,Yes,No,No,Yes,...,0,1,0,0,0,0,0,0,1,0
7041,9993-LHIEB,Male,0,Yes,Yes,67,Yes,No,Yes,No,...,1,1,0,1,0,0,0,1,0,0


Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [94]:
def prep_telco(df):
    # drop columncolumns_to_drop = ['payment_type_id', 'internet_service_type_id', 'contract_type_id']
    telco = telco.drop(columns = columns_to_drop)
    # encode
    dummy_telco = pd.get_dummies(telco[['partner', 'dependents', 'phone_service', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'churn']], dummy_na=False, drop_first=[True, True])
    telco = pd.concat([telco, dummy_telco], axis=1)
    return telco

## Split your data

Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.


In [None]:
# 20% test, 80% train_validate
# then of the 80% train_validate: 30% validate, 70% train. 
def my_train_test_split(df, target):
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
    train, validate = train_test_split(train, test_size=.25, random_state=123, stratify=train.survived)
return train, validate, test

In [None]:
train_iris, validate_iris, test_iris = 

In [None]:
# Validate my split.

print(f'train -> {train.shape}')
print(f'validate -> {validate.shape}')
print(f'test -> {test.shape}')


In [None]:
Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.
target = 'species'


In [None]:
Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.



In [None]:
Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.
