### Importy

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import timeit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import pandas as pd
from typing import Union

import os
import sys

# Put the parent of the current working directory at the front of the module
# search path so that the ``src`` package imported below can be resolved.
# Assumes the notebook is started from a directory directly under the
# project root -- TODO confirm.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
sys.path.insert(0, PROJECT_ROOT)


from src.performance_metrics.performance_metrics import PerformanceMetrics
from src.data.load_data import load_breath_cancer_data, load_students_data, prepare_data
from src.models.abstract_classifier import AbstractClassifier

### Załadowanie danych

In [6]:
# Load the breast-cancer dataset through the project helper and keep it in a
# notebook-level variable; later cells pass it to prepare_data().
breast_cancer_data = load_breath_cancer_data()

# Preview the first rows (head() defaults to 5) as the cell output.
breast_cancer_data.head()

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimiter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimiter3,area3,smoothness3,compactness3,concavity3,concave_points3,symetry3,fractial_dimention3
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Load the students dataset through the project helper and preview the first
# rows (head() defaults to 5) as the cell output.
students_data = load_students_data()
students_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


# Porównanie własnej implementacji metryk z biblioteką sklearn

### Klasa Abstrakcyjna do trenowania

In [7]:
# Split the breast-cancer frame into four pieces using "Diagnosis" as the
# target column. Judging by the names, the result is
# (train features, test features, train target, test target) -- confirm the
# order against src.data.load_data.prepare_data.
# NOTE(review): `x_breast_cancer` lacks the `_train` infix used by
# `y_train_breast_cancer`; it is what later gets passed next to the training
# target, so it presumably holds the training features.
x_breast_cancer, x_test_breast_cancer, y_train_breast_cancer, y_test_breast_cancer = prepare_data(
    breast_cancer_data, target_column="Diagnosis")

In [None]:
def compare_metrics_df(
    perf: PerformanceMetrics,
    y_true: Union[list, np.ndarray],
    y_pred: Union[list, np.ndarray],
    average: str = 'micro'
) -> pd.DataFrame:
    """Tabulate the project's metrics next to scikit-learn's for one averaging mode.

    Args:
        perf: Metrics object whose ``accuracy``, ``precision``, ``recall`` and
            ``f1_score`` methods are called with ``average``. Its ``labels``
            attribute (converted with ``.tolist()``) fixes the label order
            handed to scikit-learn and used as the per-class row index.
        y_true: Ground-truth labels, passed to the scikit-learn scorers only.
        y_pred: Predicted labels, passed to the scikit-learn scorers only.
        average: Averaging mode forwarded to ``perf``. It is forwarded to
            scikit-learn unchanged, except that ``'per_class'`` becomes
            ``average=None`` (scikit-learn's spelling for one score per label).

    Returns:
        For any mode other than ``'per_class'``: a frame indexed by
        ``accuracy``/``precision``/``recall``/``f1`` with the columns
        ``PerformanceMetrics`` and ``sklearn``.
        For ``'per_class'``: a frame indexed by label whose columns are a
        ``(metric, source)`` MultiIndex covering precision, recall and f1
        (accuracy has no per-class form and is omitted).
    """
    per_class = average == 'per_class'

    # Project metrics: each one is evaluated exactly once. In per-class mode
    # the results are indexed by label below instead of being recomputed for
    # every label. The accuracy value is only used by the averaged table.
    my_acc = perf.accuracy(average)
    my_prec = perf.precision(average)
    my_rec = perf.recall(average)
    my_f1 = perf.f1_score(average)

    # scikit-learn metrics, computed over the same label order as `perf`.
    sk_acc = accuracy_score(y_true, y_pred)
    sk_average = None if per_class else average
    labels = perf.labels.tolist()
    sk_prec = precision_score(y_true, y_pred, average=sk_average, labels=labels)
    sk_rec = recall_score(y_true, y_pred, average=sk_average, labels=labels)
    sk_f1 = f1_score(y_true, y_pred, average=sk_average, labels=labels)

    if not per_class:
        return pd.DataFrame({
            'PerformanceMetrics':    [my_acc, my_prec, my_rec, my_f1],
            'sklearn': [sk_acc, sk_prec, sk_rec, sk_f1]
        }, index=['accuracy','precision','recall','f1'])

    # Per-class layout: one row per label, one (metric, source) column pair
    # per metric. The project results are looked up by label so their order
    # matches the arrays scikit-learn returns for `labels`.
    data = {
        ('precision','PerformanceMetrics'): [my_prec[label] for label in labels],
        ('precision','sklearn'): sk_prec,
        ('recall','PerformanceMetrics'): [my_rec[label] for label in labels],
        ('recall','sklearn'): sk_rec,
        ('f1','PerformanceMetrics'): [my_f1[label] for label in labels],
        ('f1','sklearn'): sk_f1,
    }
    df = pd.DataFrame(data, index=labels)
    df.columns = pd.MultiIndex.from_tuples(df.columns, names=['metric','source'])
    return df



### 1. Logistic Regression

In [20]:
# Raise scikit-learn's iteration cap to 1000 for this model.
model = LogisticRegression(max_iter=1000)

# Wrap the estimator together with the split produced by prepare_data().
# The arguments are positional; presumably (model, train features,
# test features, train target) -- confirm against AbstractClassifier.
logistic_regression_classifier = AbstractClassifier(model, x_breast_cancer, x_test_breast_cancer, y_train_breast_cancer)
logistic_regression_classifier.fit()

In [21]:
# predict() takes no arguments here; presumably it scores the test features
# handed to the wrapper at construction -- confirm against AbstractClassifier.
predictions = logistic_regression_classifier.predict()
actual = y_test_breast_cancer

# NOTE(review): predictions are passed first and ground truth second; confirm
# this matches the PerformanceMetrics constructor's parameter order.
performance_metrics = PerformanceMetrics(predictions, actual)


In [None]:
# Build one comparison table per averaging mode, in the same order as the
# target names on the left-hand side.
micro, macro, per_class, weighted = (
    compare_metrics_df(performance_metrics, actual, predictions, average=mode)
    for mode in ('micro', 'macro', 'per_class', 'weighted')
)

In [15]:
# Display the micro-averaged comparison (4 rows, so head() shows all of them).
# NOTE(review): the saved output below labels the project column "your",
# while compare_metrics_df names it "PerformanceMetrics" -- the output looks
# stale; re-run to refresh.
micro.head()

Unnamed: 0,your,sklearn
accuracy,0.921053,0.921053
precision,0.921053,0.921053
recall,0.921053,0.921053
f1,0.921053,0.921053


In [16]:
# Display the macro-averaged comparison.
# NOTE(review): in the saved output the project's accuracy (0.897817) equals
# its macro recall rather than sklearn's accuracy (0.921053); check how
# PerformanceMetrics.accuracy treats the `average` argument.
macro.head()

Unnamed: 0,your,sklearn
accuracy,0.897817,0.921053
precision,0.935081,0.935081
recall,0.897817,0.897817
f1,0.911757,0.911757


In [17]:
# Display the per-label comparison: one row per label, with
# (metric, source) column pairs for precision, recall and f1.
per_class.head()

metric,precision,precision,recall,recall,f1,f1
source,your,sklearn,your,sklearn,your,sklearn
B,0.898734,0.898734,0.986111,0.986111,0.940397,0.940397
M,0.971429,0.971429,0.809524,0.809524,0.883117,0.883117


In [18]:
# Display the weighted-average comparison (4 rows, so head() shows all of them).
weighted.head()

Unnamed: 0,your,sklearn
accuracy,0.921053,0.921053
precision,0.925516,0.925516
recall,0.921053,0.921053
f1,0.919294,0.919294


In [None]:
# `time` is a plain module, not a package, so `import time.time` fails;
# perf_counter() lives directly on the module.
import time

# Time a single scikit-learn accuracy computation on the labels and
# predictions assigned in the Logistic Regression section. They are bound to
# y_true / y_pred here because no notebook-level variables of those names are
# defined by the earlier cells.
y_true, y_pred = actual, predictions

start = time.perf_counter()
_ = accuracy_score(y_true, y_pred)
end = time.perf_counter()

print(f"accuracy_score took {end - start:.6f} seconds")