# Wczytanie bibliotek

In [61]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
%matplotlib inline

### Zadanie 1: małe cyfry vs duże

Dla danych digits zbuduj model regresji logistycznej rozróżniający małe cyfry (0–4) od dużych (5–9):

1. Oddziel zbiór testowy

2. Przeskaluj dane i policz średnią dokładność za pomocą kroswalidacji dla modeli:

    regresja grzbietowa, C = 1, 0.1, 0.01
    
    regresja LASSO, C = 1, 0.1, 0.01
    
3. Porównaj wyniki, wybierz najlepszy model i policz dokładność i miarę F1 dla zbioru testowego.
    

In [2]:
# Load the digits dataset bundled with scikit-learn (see the load_digits import).
digits = load_digits()

In [3]:
# Feature matrix and raw digit labels, taken straight from the loaded dataset.
X, y = digits.data, digits.target

In [4]:
# Binary target: 1 for digits greater than 4, 0 otherwise.
# Derived from digits.target rather than from `y` itself so the cell is
# idempotent -- re-running `y = (y > 4)` on already-binarized labels would
# turn every label into 0.
y = (digits.target > 4).astype(int)

In [5]:
# Hold out a test split, stratified on the binary label so both classes keep
# the same proportions in train and test; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 17,
    stratify = y,
)

In [6]:
# 10-fold stratified splitter, passed as `cv` to cross_val_score below.
kfold = StratifiedKFold(n_splits = 10)

In [7]:
# Penalty types to compare; the names double as column labels of the results table.
metody = ["ridge", "LASSO"]
# Values passed as the `C` argument of LogisticRegression; they also form
# the row index of the results table.
C = [1, 0.1, 0.01]

In [8]:
# Empty results grid: one row per C value, one column per penalty type.
# dtype = float keeps the columns numeric; without it the all-NaN frame is
# created with object dtype and the stored accuracies stay Python objects.
wyniki = pd.DataFrame(index = C, columns = metody, dtype = float)

In [9]:
# Cross-validate every (C, penalty) combination on the training split only.
# The scaler is a pipeline step, so cross_val_score refits it on each fold's
# training part together with the classifier.
for c in C:
    for metoda in metody:
        if metoda == "LASSO":
            # L1 penalty needs a solver that supports it; liblinear does.
            klasyfikator = LogisticRegression(C = c, penalty = "l1", solver = "liblinear")
        elif metoda == "ridge":
            # LogisticRegression applies an L2 penalty by default.
            klasyfikator = LogisticRegression(C = c)
        else:
            # Without this guard an unknown name would silently reuse the
            # model left over from the previous iteration.
            raise ValueError(f"Unknown regularization method: {metoda!r}")
        model = Pipeline([("standarization", StandardScaler()), ("LogisticRegression", klasyfikator)])
        CV_score = cross_val_score(model, X_train, y_train, cv = kfold)
        # Store the mean accuracy over the folds in the results grid.
        wyniki.loc[c, metoda] = CV_score.mean()

In [10]:
# Display the grid of mean CV accuracies (rows: C values, columns: penalty type).
wyniki

Unnamed: 0,ridge,LASSO
1.0,0.890851,0.89309
0.1,0.896064,0.887905
0.01,0.887164,0.839574


Najlepszą średnią dokładność w kroswalidacji (0.896) uzyskała regresja grzbietowa (ridge) z C = 0.1.

In [11]:
# Final model: standardization followed by logistic regression with the
# default (L2) penalty and C = 0.1.
model_fin = Pipeline(
    steps = [
        ("standarization", StandardScaler()),
        ("LogisticRegression", LogisticRegression(C = 0.1)),
    ]
)

In [12]:
# Fit the selected pipeline (scaler + classifier) on the whole training split.
model_fin.fit(X_train, y_train)

In [13]:
# Accuracy of the final model on the held-out test split.
model_fin.score(X_test, y_test)

0.8822222222222222

In [14]:
# Test-set predictions, used for the per-class report in the next cell.
pred_fin = model_fin.predict(X_test)

In [15]:
# Per-class precision, recall and F1 on the test split. The report is a
# plain-text table, so it is printed rather than shown as a cell value.
report_text = classification_report(y_test, pred_fin)
print(report_text)

              precision    recall  f1-score   support

           0       0.89      0.87      0.88       226
           1       0.87      0.89      0.88       224

    accuracy                           0.88       450
   macro avg       0.88      0.88      0.88       450
weighted avg       0.88      0.88      0.88       450



In [35]:
# Elastic-net variant: standardization followed by logistic regression with a
# mixed L1/L2 penalty (l1_ratio = 0.5) and the default C.
# saga is a stochastic solver, so random_state is pinned (same seed as the
# train/test split) to keep the fit reproducible across kernel restarts.
model_enet = Pipeline([
    ("standarization", StandardScaler()),
    ("LogisticRegression", LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5, max_iter = 1000, random_state = 17)),
])

In [36]:
# Fit the elastic-net pipeline on the whole training split.
model_enet.fit(X_train, y_train)

In [37]:
# Accuracy of the elastic-net model on the held-out test split.
# NOTE(review): this variant was not cross-validated like the ridge/LASSO grid,
# so the comparison rests on the test split alone.
model_enet.score(X_test, y_test)

0.8911111111111111

# Źródła:

Hastie, Trevor, et al. The elements of statistical learning: data mining, inference, and prediction. Vol. 2. New York: Springer, 2009.

Muller, Andreas, et al. Machine learning, Python i data science, Wprowadzenie. Helion, 2021.

Dokumentacja i samouczki scikit-learn: https://scikit-learn.org/stable/