### Exercício 9: Regularização Norma-L2 no Dataset Palmer Penguins

**Implemente um modelo de regressão linear com regularização norma-L2 utilizando o dataset Palmer Penguins.**

In [53]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [54]:
# Read dataset
df = pd.read_csv('penguins.csv')
df = df.drop(['year', 'sex', 'island'], axis=1)
df.head()

features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

In [55]:
# Input values for NaN

imputer = SimpleImputer(strategy='mean')
df[features] = imputer.fit_transform(df[features])
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181.0,3750.0
1,Adelie,39.5,17.4,186.0,3800.0
2,Adelie,40.3,18.0,195.0,3250.0
3,Adelie,43.92193,17.15117,200.915205,4201.754386
4,Adelie,36.7,19.3,193.0,3450.0


In [56]:
# Label encoder for species
le = LabelEncoder()
df.species = le.fit(df.species).transform(df.species)
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,0,39.1,18.7,181.0,3750.0
1,0,39.5,17.4,186.0,3800.0
2,0,40.3,18.0,195.0,3250.0
3,0,43.92193,17.15117,200.915205,4201.754386
4,0,36.7,19.3,193.0,3450.0


In [57]:
# Gaussianize columns

transformador = PowerTransformer()
df[features] = transformador.fit_transform(df[features])
df.head(100)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,0,-0.881427,0.780407,-1.585483,-0.483674
1,0,-0.804867,0.099432,-1.108407,-0.409669
2,0,-0.652608,0.410750,-0.341826,-1.312662
3,0,0.023142,-0.028159,0.105953,0.137390
4,0,-1.347052,1.102699,-0.502790,-0.960055
...,...,...,...,...,...
95,0,-0.558016,0.887283,0.591956,0.259660
96,0,-1.074110,0.727177,-0.754008,-0.559133
97,0,-0.652608,0.674088,-0.263218,0.320332
98,0,-2.067281,-0.557174,-1.891574,-2.012932


In [58]:
# StandardScaler - not necessary
'''
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])
df.head()
'''

'\nscaler = StandardScaler()\ndf[features] = scaler.fit_transform(df[features])\ndf.head()\n'

In [59]:
x = df[features]
y = df['species']

In [60]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [61]:
# Fit model Linear regression
lr = LinearRegression()
lr.fit(x_train, y_train)

# r_sq score
r_sq = lr.score(x,y)
print('Coefieciente de determinação (R2): ', r_sq)

# accuracy score
print("Test set score: {:.2f}".format(lr.score(x_test, y_test)))

Coefieciente de determinação (R2):  0.9082168713459849
Test set score: 0.90


In [66]:
# Fit model Linear regression
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

# accuracy score
print("Test set score: {:.2f}".format(knn.score(x_test, y_test)))

Test set score: 0.99
