# **Exercise 02: metrics**

## Configuration:

Import necessary entities:

In [1]:
from joblib import dump
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from pandas import (
    Series,
    DataFrame,
    read_csv,
)
from sklearn.metrics import (
    recall_score,
    roc_auc_score,
    accuracy_score,
    precision_score,
)

## Preprocessing:

Create a dictionary with the file names and the directory used by the `read_csv()` calls:

In [2]:
# File names and the shared directory used by the `read_csv()` calls below.
read_csv_params: dict[str, str] = dict(
    file="day_of_week_not_scaled.csv",
    additional_file="day_of_week.csv",
    file_path="../../data/datasets/",
)

Read the file `day_of_week_not_scaled.csv` into a *Pandas* dataframe:

In [3]:
# Build the CSV location from the shared directory and the main file name.
dataset_path: str = read_csv_params["file_path"] + read_csv_params["file"]
df: DataFrame = read_csv(dataset_path)

Add a column `day_of_week` to the `df` *Pandas* dataframe:

In [4]:
# Load the companion file and copy only its `day_of_week` column into `df`.
# NOTE(review): the assignment pairs rows by index label, so this presumably
# relies on both CSV files listing the same rows in the same order — confirm.
additional_path: str = (
    read_csv_params["file_path"] + read_csv_params["additional_file"]
)
df["day_of_week"] = read_csv(additional_path)["day_of_week"]

Check `df` *Pandas* dataframe:

In [5]:
# Preview the first rows to confirm that `day_of_week` is now the last column.
df.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,day_of_week
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


Prepare features and target variables:

In [6]:
# Target: the weekday label the classifiers have to predict.
y: Series = df["day_of_week"]
# Features: every remaining column once the target has been removed.
X: DataFrame = df.drop(columns="day_of_week")

Check `X` and `y` variables:

In [7]:
# Preview the feature matrix; `day_of_week` must no longer be among the columns.
X.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Preview the target labels taken from the `day_of_week` column.
y.head()

0    4
1    4
2    4
3    4
4    4
Name: day_of_week, dtype: int64

Use `train_test_split()` function:

In [9]:
# Hold out 20 % of the rows for testing. `stratify=y` asks the splitter to keep
# the weekday proportions alike in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

Check `X_train`, `X_test`, `y_train`, `y_test` variables:

In [10]:
# Preview the training features; the row labels are those of the original frame.
X_train.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
860,3,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
385,6,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
422,1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
326,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
714,47,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Preview the held-out test features.
X_test.head()

Unnamed: 0,num_trials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
1087,67,17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16,1,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
563,14,10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1381,20,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1199,9,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# Preview the training labels; their index should match `X_train`.
y_train.head()

860    6
385    4
422    5
326    6
714    1
Name: day_of_week, dtype: int64

In [13]:
# Preview the test labels; their index should match `X_test`.
y_test.head()

1087    1
16      5
563     6
1381    3
1199    2
Name: day_of_week, dtype: int64

## *SVC*:

Create a model of *SVC*:

In [14]:
# Linear-kernel SVC with balanced class weights and a fixed seed.
# `probability=True` is kept because the metrics cell calls `predict_proba()`.
# NOTE(review): `gamma` is presumably ignored by the linear kernel — confirm
# against the scikit-learn documentation before relying on it.
svc_model: SVC = SVC(
    kernel="linear",
    C=10,
    gamma="scale",
    class_weight="balanced",
    probability=True,
    random_state=21,
)

Train *SVC* model on the data:

In [15]:
# Fit the SVC on the training split; the trailing `;` hides the estimator repr.
svc_model.fit(X=X_train, y=y_train);

Calculate the `accuracy`, `precision`, `recall` and `ROC AUC` metrics of the *SVC* model on the test data:

In [16]:
# Run inference once and reuse the result for every label-based metric,
# instead of calling `predict()` separately for each of them.
svc_pred = svc_model.predict(X_test)
# The multiclass ROC AUC is computed from per-class probability estimates.
svc_proba = svc_model.predict_proba(X_test)

# Precision and recall are averaged over the classes weighted by support.
svc_accuracy = accuracy_score(y_true=y_test, y_pred=svc_pred)
svc_precision = precision_score(
    y_true=y_test,
    y_pred=svc_pred,
    average="weighted",
)
svc_recall = recall_score(
    y_true=y_test,
    y_pred=svc_pred,
    average="weighted",
)
# One-vs-one ROC AUC, weighted across the class pairs.
svc_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=svc_proba,
    multi_class="ovo",
    average="weighted",
)

# Single-line f-strings keep this cell valid on interpreters older than 3.12.
print(f"Accuracy is {svc_accuracy:.3f}.")
print(f"Precision is {svc_precision:.3f}.")
print(f"Recall is {svc_recall:.3f}.")
print(f"ROC AUC is {svc_roc_auc:.3f}.")

Accuracy is 0.731.
Precision is 0.759.
Recall is 0.731.
ROC AUC is 0.926.


## *Decision tree*:

Create a model of *decision tree*:

In [17]:
# Gini-based decision tree limited to a depth of 22, with a fixed seed and no
# class re-weighting.
tree_model: DecisionTreeClassifier = DecisionTreeClassifier(
    criterion="gini",
    max_depth=22,
    class_weight=None,
    random_state=21,
)

Train *decision tree* model on the data:

In [18]:
# Fit the decision tree on the training split; `;` hides the estimator repr.
tree_model.fit(X=X_train, y=y_train);

Calculate the `accuracy`, `precision`, `recall` and `ROC AUC` metrics of the *decision tree* model on the test data:

In [19]:
# Run inference once and reuse the result for every label-based metric,
# instead of calling `predict()` separately for each of them.
tree_pred = tree_model.predict(X_test)
# The multiclass ROC AUC is computed from per-class probability estimates.
tree_proba = tree_model.predict_proba(X_test)

# Precision and recall are averaged over the classes weighted by support.
tree_accuracy = accuracy_score(y_true=y_test, y_pred=tree_pred)
tree_precision = precision_score(
    y_true=y_test,
    y_pred=tree_pred,
    average="weighted",
)
tree_recall = recall_score(
    y_true=y_test,
    y_pred=tree_pred,
    average="weighted",
)
# One-vs-one ROC AUC, weighted across the class pairs.
tree_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=tree_proba,
    multi_class="ovo",
    average="weighted",
)

# Single-line f-strings keep this cell valid on interpreters older than 3.12.
print(f"Accuracy is {tree_accuracy:.3f}.")
print(f"Precision is {tree_precision:.3f}.")
print(f"Recall is {tree_recall:.3f}.")
print(f"ROC AUC is {tree_roc_auc:.3f}.")

Accuracy is 0.861.
Precision is 0.866.
Recall is 0.861.
ROC AUC is 0.919.


## *Random forest*:

Create a *random forest* model:

In [20]:
# Random forest of 100 gini-based trees, each limited to a depth of 20, with a
# fixed seed and no class re-weighting.
tree_forest_model: RandomForestClassifier = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=20,
    class_weight=None,
    random_state=21,
)

Train the *random forest* model on the data:

In [21]:
# Fit the random forest on the training split; `;` hides the estimator repr.
tree_forest_model.fit(X=X_train, y=y_train);

Calculate the `accuracy`, `precision`, `recall` and `ROC AUC` metrics of the *random forest* model on the test data:

In [22]:
# Run inference once and reuse the result for every label-based metric,
# instead of calling `predict()` separately for each of them.
forest_pred = tree_forest_model.predict(X_test)
# The multiclass ROC AUC is computed from per-class probability estimates.
forest_proba = tree_forest_model.predict_proba(X_test)

# Precision and recall are averaged over the classes weighted by support.
forest_accuracy = accuracy_score(y_true=y_test, y_pred=forest_pred)
forest_precision = precision_score(
    y_true=y_test,
    y_pred=forest_pred,
    average="weighted",
)
forest_recall = recall_score(
    y_true=y_test,
    y_pred=forest_pred,
    average="weighted",
)
# One-vs-one ROC AUC, weighted across the class pairs.
forest_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=forest_proba,
    multi_class="ovo",
    average="weighted",
)

# Single-line f-strings keep this cell valid on interpreters older than 3.12.
print(f"Accuracy is {forest_accuracy:.3f}.")
print(f"Precision is {forest_precision:.3f}.")
print(f"Recall is {forest_recall:.3f}.")
print(f"ROC AUC is {forest_roc_auc:.3f}.")

Accuracy is 0.932.
Precision is 0.934.
Recall is 0.932.
ROC AUC is 0.989.


## Prediction:

Train the best model on the data:

In [23]:
# Refit the random forest (the best scorer above) on the training split.
# NOTE(review): this repeats the earlier fit with the same data and seed, so it
# presumably yields the same model — confirm whether the refit is needed.
tree_forest_model.fit(X=X_train, y=y_train);

Create a `prediction` *Pandas* dataframe column:

In [24]:
# Store the model's weekday prediction for every row of the dataset.
# NOTE(review): `X` is the full feature matrix, so rows from the training
# split are predicted here as well, not only the held-out test rows.
df["prediction"] = tree_forest_model.predict(X=X)

Calculate the error percentage for each class:

In [25]:
# Number of misclassified rows per true weekday.
error_counts = df.loc[
    df["day_of_week"] != df["prediction"],
    "day_of_week",
].value_counts()
# Number of all rows per true weekday; the denominator of the error rate.
class_counts = df["day_of_week"].value_counts()

# Reindex over all classes so that a weekday without any misclassified row is
# reported as 0 % instead of becoming NaN and vanishing from the result.
# NOTE(review): `prediction` is computed for the full dataset, so these rates
# also cover rows the model was fitted on — consider using only the test split.
(
    error_counts.reindex(class_counts.index, fill_value=0) / class_counts
).sort_index() * 100

day_of_week
0    5.882353
1    2.189781
2    2.013423
3    0.505051
4    2.884615
5    1.107011
6    0.280899
Name: count, dtype: float64

`For which weekday does the best model make the most errors?`

Answer: `Monday` (class `0`), with about 5.9% of its samples misclassified.

Save the best model:

In [26]:
# Persist the fitted random forest with joblib; `;` hides the returned value.
dump(value=tree_forest_model, filename="../../models/ex_02_best_model.joblib");