In [1]:
!pip install palmerpenguins
import pandas as pd
import numpy as np
from palmerpenguins import load_penguins
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sys import exit
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

Collecting palmerpenguins
  Downloading palmerpenguins-0.1.4-py3-none-any.whl (17 kB)
Installing collected packages: palmerpenguins
Successfully installed palmerpenguins-0.1.4


# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `bill_depth_mm` using the other variables in the dataset.

**Dummify** all variables that require this.

In [60]:
penguins = load_penguins()
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [61]:
penguins = penguins.dropna(subset=["bill_length_mm", "bill_depth_mm", "sex", "species", "island"])
columns_species_island = pd.get_dummies(penguins[["species", "island"]])
colums_sex = pd.get_dummies(penguins["sex"], drop_first=True)
df = penguins.drop(["species", "island", "sex"], axis=1)
df = pd.concat([df, columns_species_island, colums_sex], axis=1)
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,male
0,39.1,18.7,181.0,3750.0,2007,1,0,0,0,0,1,1
1,39.5,17.4,186.0,3800.0,2007,1,0,0,0,0,1,0
2,40.3,18.0,195.0,3250.0,2007,1,0,0,0,0,1,0
4,36.7,19.3,193.0,3450.0,2007,1,0,0,0,0,1,0
5,39.3,20.6,190.0,3650.0,2007,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2009,0,1,0,0,1,0,1
340,43.5,18.1,202.0,3400.0,2009,0,1,0,0,1,0,0
341,49.6,18.2,193.0,3775.0,2009,0,1,0,0,1,0,1
342,50.8,19.0,210.0,4100.0,2009,0,1,0,0,1,0,1


Let's use the other variables to predict `bill_depth_mm`. Prepare your data and fit the following models on a training dataset subset of the entire dataset:

* Four different models, each containing a different set of predictor variables

Create a plot like the right plot of Fig 1. in our `Model Validation` chapter with the training and test error plotted for each of your four models.

Which of your models was best?

In [62]:
y = df["bill_depth_mm"]
X = df[["bill_length_mm"]]

In [None]:
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(X, y, test_size=0.25)

models = [
    ['bill_length_mm'],
    ['bill_length_mm', 'flipper_length_mm', 'body_mass_g'],
    ['bill_length_mm', 'flipper_length_mm', 'body_mass_g', 'species_Chinstrap', 'species_Gentoo'],
    list(X_train_set.columns)
]
train_scores = []
test_scores = []

for vars in models:
    lr = LinearRegression()
    lr.fit(X_train_set[vars], y_test_set)
    train_scores.append(mean_squared_error(y_train_set, lr.predict(X_train_set[vars])))
    test_scores.append(mean_squared_error(y_test_set, lr.predict(X_test_set[vars])))
    print("MSE : Testing Data")
    print(mean_squared_error(y_test_set, lr.predict(X_test_set[vars])))

plt.figure()
plt.plot(range(1, len(models) + 1), train_scores, label='Training Error')
plt.plot(range(1, len(models) + 1), test_scores, label='Testing Error')
plt.xticks(ticks=range(1, len(models) + 1), labels=[f'Model {i}' for i in range(1, len(models) + 1)])
plt.legend()
plt.title('Training and Testing Error for Four Models')
plt.ylabel('Mean Squared Error')
plt.xlabel('Model')
plt.yscale('log')
plt.show()