### Preparació de les dades

In [32]:
import pandas as pd
import seaborn as sns

# Fetch the "penguins" example dataset through seaborn and preview its first rows.
df = sns.load_dataset(name="penguins")
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### Esborrar files amb valors "NA"

In [33]:
# Discard every row that contains at least one missing value, then preview the result.
df = df.dropna(axis=0, how='any')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### Divisió de les dades en conjunt d'entrenament i conjunt de prova

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features are every column except the target; the target is the species label.
X = df.drop(columns=['species'])
y = df.species

# Hold out 20% of the rows for testing. Stratifying on y keeps the species
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 1, stratify = y)

# Replace the species labels in y_train and y_test with integer class codes.
# The mapping is learned from the training labels only and then reused for the
# test labels, so both splits are guaranteed to share the same label -> code
# assignment (re-fitting on the test split could silently produce a different one).
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

### Estandardització i codificació one-hot

In [35]:
from sklearn.preprocessing import StandardScaler

# Column groups used by the feature-building cells.
categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']

# NOTE(review): X_train_std / X_test_std are not referenced by any later cell
# visible in this notebook; the models below are fitted on the unscaled
# DictVectorizer output. Confirm whether the scaled arrays are still needed.
# The scaler is fitted on the training rows only and then applied to both splits.
sc = StandardScaler().fit(X_train[numerical])
X_train_std, X_test_std = (
    sc.transform(split[numerical]) for split in (X_train, X_test)
)

In [36]:
from sklearn.feature_extraction import DictVectorizer

# Convert each split into a list of per-row {column: value} records, keeping
# only the selected categorical and numerical columns.
train_dict, test_dict = (
    split[categorical + numerical].to_dict(orient='records')
    for split in (X_train, X_test)
)

# Fit the vectorizer on the training records only; sparse=False makes it
# return dense arrays. The fitted estimator is shown as the cell output.
dv = DictVectorizer(sparse=False).fit(train_dict)
dv

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [37]:
# Vectorize both splits with the already-fitted DictVectorizer.
# Note: X_train / X_test are rebound here from DataFrames to the vectorized
# matrices, so the earlier cells that index them by column name require the
# train/test split cell to be re-run first.
X_train, X_test = map(dv.transform, (train_dict, test_dict))

# Show the resulting feature order (column names of the matrices above).
dv.get_feature_names_out()

array(['bill_depth_mm', 'bill_length_mm', 'flipper_length_mm',
       'island=Biscoe', 'island=Dream', 'island=Torgersen', 'sex=Female',
       'sex=Male'], dtype=object)

### Entrenament dels models

#### Regressió logística

In [43]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the vectorized training matrix.
# NOTE(review): liblinear handles multiclass targets one-vs-rest; recent
# scikit-learn releases may warn about this combination — confirm against the
# installed version.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
lr



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


#### SVM

In [39]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the vectorized training matrix.
svm = SVC(
    C=1.0,
    kernel='linear',
    probability=True,  # enable probability estimates
    random_state=1,
)
svm.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


#### Decision Tree

In [40]:
from sklearn.tree import DecisionTreeClassifier

# Gini-based decision tree, capped at depth 4, fitted on the vectorized
# training matrix. The fitted estimator is shown as the cell output.
dt = DecisionTreeClassifier(
    max_depth=4,
    criterion='gini',
    random_state=1,
).fit(X_train, y_train)
dt

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


#### KNN

In [41]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier; Minkowski metric with p=2 is the Euclidean distance.
knn = KNeighborsClassifier(
    metric='minkowski',
    p=2,
    n_neighbors=3,
)
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


### Serialització dels models

In [44]:
import pickle

# Persist every fitted model next to the DictVectorizer it was trained with,
# as a (dv, model) tuple, so that loading a single file gives both the feature
# encoder and the classifier. Files are written in the order listed.
artifacts = {
    '../models/lr.pck': lr,
    '../models/svm.pck': svm,
    '../models/dt.pck': dt,
    '../models/knn.pck': knn,
}

for target_path, fitted_model in artifacts.items():
    with open(target_path, 'wb') as f:
        pickle.dump((dv, fitted_model), f)