<a href="https://colab.research.google.com/github/s34836/EWD/blob/main/lab9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab - Classification

## Tasks

1. Load the `diabetes.csv` dataset. Prepare the data for modelling (split it into training/validation/test sets and scale it).
Create predictive models to detect cases of diabetes (the `Outcome` column):
    - [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html),
    - [`GaussianNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html),
    - [`LinearDiscriminantAnalysis`](https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html),
    - [`QuadraticDiscriminantAnalysis`](https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html).

    Compare the models using the accuracy/precision/recall/F1 scores on validation data. Select the best model and evaluate it using the test set.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# train+val (80%) + test (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# train+val into = 75% training + 25% validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=1, stratify=y_train_val
)
# calosc = 60% train, 20% val, 20% test



# Skalowanie
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

# Shapes of sets:
print(f"Train: {X_train_scaled.shape}")
print(f"Val: {X_val_scaled.shape}")
print(f"Test: {X_test_scaled.shape}")

X_train_scaled_df.head()

Train: (460, 8)
Val: (154, 8)
Test: (154, 8)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.810999,-0.540786,-2.217427,1.043764,0.019727,1.454109,-0.89758,0.000944
1,-0.226724,1.734283,0.963557,0.358396,0.678221,0.148372,2.165537,1.650923
2,-0.810999,-0.634282,0.168311,-0.576197,-0.097539,-0.896219,0.600855,-0.433261
3,-0.226724,-0.135637,0.281917,-0.389278,0.218177,-0.765645,-1.137329,-0.780625
4,-1.103136,-1.662738,0.395524,-1.323871,-0.728971,1.715257,-0.862879,1.129877


In [None]:
models = {
    'Logistic Regression': LogisticRegression(random_state=10, max_iter=1000),
    'Gaussian Naive Bayes': GaussianNB(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis()
}

results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_val_pred = model.predict(X_val_scaled)
    metrics = {
        'Accuracy': accuracy_score(y_val, y_val_pred),
        'Precision': precision_score(y_val, y_val_pred),
        'Recall': recall_score(y_val, y_val_pred),
        'F1 Score': f1_score(y_val, y_val_pred)
    }

    # Store results
    results[name] = metrics


results_df = pd.DataFrame(results).T
print("\nWyniki:")
print(results_df)




Wyniki:
                                 Accuracy  Precision    Recall  F1 Score
Logistic Regression              0.753247   0.653846  0.629630  0.641509
Gaussian Naive Bayes             0.746753   0.622951  0.703704  0.660870
Linear Discriminant Analysis     0.746753   0.647059  0.611111  0.628571
Quadratic Discriminant Analysis  0.701299   0.574074  0.574074  0.574074


Accuracy the best: Logistic Regression<br>
Precision the best: Logistic Regression<br>
Recall the best: Gaussian Naive Bayes<br>
F1 Score the best: Gaussian Naive Bayes<br>
And the winner is: Gaussian Naive Bayes<br>
bo:<br>
1) Badania medyczne - recall najważniejszy<br>
2) f1 - the best

In [None]:
best_model = models['Gaussian Naive Bayes']

y_test_pred = best_model.predict(X_test_scaled)
y_test_prob = best_model.predict_proba(X_test_scaled)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print("Gaussian Naive Bayes - Test Set Evaluation:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")



Gaussian Naive Bayes - Test Set Evaluation:
Accuracy: 0.6883
Precision: 0.5625
Recall: 0.5000
F1 Score: 0.5294


2. Classify mushrooms in `agaricus-lepiota.data` as *poisonous* (`p`) or *edible* (`e`) using [`CategoricalNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html).
    - Missing values in the dataset are stored as `?` (to load the dataset correctly, pass `na_values='?'` to  `read_csv()`). Remove all rows containing missing values (`dropna(axis='rows')`).
    - Encode the inputs (`X`) as 0,1,2,... using the [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html).
    - Split the data into a training and test set. Print out the confusion matrix and check which type of error is more common (false positive/false negative). Calculate the accuracy, precision and recall.

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
response = requests.get(url)
data_str = response.text

column_names = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
                'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
                'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                'stalk-surface-below-ring', 'stalk-color-above-ring',
                'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
                'ring-type', 'spore-print-color', 'population', 'habitat']


mushroom_data = pd.read_csv(StringIO(data_str), names=column_names, na_values='?')

mushroom_data.head()


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
mushroom_data.shape

(8124, 23)

In [None]:
mushroom_data_clean = mushroom_data.dropna(axis='rows')

In [None]:
mushroom_data_clean.shape

(5644, 23)

In [None]:
mushroom_data_clean.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
from sklearn.preprocessing import OrdinalEncoder

X = mushroom_data_clean.drop('class', axis=1)
y = mushroom_data_clean['class']


encoder = OrdinalEncoder()

X_encoded = encoder.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded, columns=X.columns)
# p == poisonous (1), e == edible (0)
y_encoded = (y == 'p').astype(int)
#y_encoded = y

In [None]:
X_encoded

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5.0,2.0,4.0,1.0,6.0,1.0,0.0,1.0,2.0,0.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,1.0,3.0,5.0
1,5.0,2.0,7.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,2.0,2.0,1.0
2,0.0,2.0,6.0,1.0,3.0,1.0,0.0,0.0,3.0,0.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,2.0,2.0,3.0
3,5.0,3.0,6.0,1.0,6.0,1.0,0.0,1.0,3.0,0.0,...,2.0,5.0,5.0,0.0,0.0,1.0,3.0,1.0,3.0,5.0
4,5.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,2.0,1.0,...,2.0,5.0,5.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5639,0.0,3.0,4.0,0.0,5.0,1.0,0.0,0.0,7.0,0.0,...,3.0,3.0,3.0,0.0,0.0,2.0,3.0,5.0,5.0,4.0
5640,5.0,3.0,4.0,0.0,5.0,1.0,0.0,0.0,7.0,0.0,...,3.0,3.0,3.0,0.0,0.0,2.0,3.0,5.0,5.0,4.0
5641,5.0,3.0,3.0,1.0,5.0,1.0,0.0,0.0,7.0,0.0,...,2.0,5.0,5.0,0.0,0.0,2.0,3.0,5.0,5.0,4.0
5642,5.0,3.0,1.0,0.0,4.0,1.0,0.0,0.0,8.0,0.0,...,3.0,1.0,1.0,0.0,0.0,0.0,2.0,5.0,1.0,0.0


In [None]:
y_encoded

Unnamed: 0,class
0,1
1,0
2,0
3,1
4,0
...,...
7986,0
8001,0
8038,0
8095,1


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
   X_encoded, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Print the shapes to confirm the split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (3950, 22)
X_test shape: (1694, 22)
y_train shape: (3950,)
y_test shape: (1694,)


In [None]:
y_train

Unnamed: 0,class
895,0
5877,1
4499,1
740,0
100,0
...,...
3506,1
1812,1
1993,0
3588,0


In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

model = CategoricalNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

tn, fp, fn, tp = cm.ravel()
print(f"\nTN (Jadalny to jadalny): {tn}")
print(f"FP (Jadalny to trujacy [tak troche zle]): {fp}")
print(f"FN (Trujacy to jadalny [zle]): {fn}")
print(f"TP (Trujacy to trujacy): {tp}")

Confusion Matrix:
[[1045    2]
 [  44  603]]

TN (Jadalny to jadalny): 1045
FP (Jadalny to trujacy [tak troche zle]): 2
FN (Trujacy to jadalny [zle]): 44
TP (Trujacy to trujacy): 603


FN > FP

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Accuracy:  0.9728
Precision: 0.9967
Recall:    0.9320
F1 Score:  0.9633


Recall 93,20% - to troche słabo jak na trujące grzyby