# **Classification models**

## Configuration:

Import necessary entities:

In [1]:
from typing import Any
from joblib import dump
from sklearn.svm import SVC
from warnings import filterwarnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score
from pandas import (
    Series,
    DataFrame,
    read_csv,
)
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    train_test_split,
)

Ignore all warnings:

In [2]:
# Install a blanket "ignore" filter so that no warning is shown from here on.
# NOTE(review): this also hides any warning raised while models are fitted.
filterwarnings(action="ignore")

## Preprocessing:

Create a dictionary of parameters for the `read_csv()` function calls:

In [3]:
# Lookup table for the `read_csv()` calls: a directory entry and a file-name
# entry are concatenated to build the path of each CSV file.
read_csv_params: dict[str, str] = {
    # CSV file names.
    "class_target": "class.csv",
    "features_file": "features.csv",
    "categorical_target": "categorical.csv",
    # Directory prefixes (relative paths that end with a slash).
    "targets_file_path": "../../../data/datasets/targets/",
    "features_file_path": "../../../data/datasets/processed/",
}

Read the `features.csv` data to a *Pandas* dataframe:

In [4]:
# Load the feature matrix; the first CSV column is used as the row index.
X: DataFrame = read_csv(
    f"{read_csv_params['features_file_path']}"
    f"{read_csv_params['features_file']}",
    index_col=0,
)

Read the `class.csv` data to a *Pandas* dataframe:

In [5]:
# Load the class target. `read_csv()` returns a DataFrame (here with a single
# `class_rating` column and the first CSV column as index), hence the
# DataFrame annotation.
class_y: DataFrame = read_csv(
    read_csv_params["targets_file_path"] + read_csv_params["class_target"],
    index_col=0,
)

Read the `categorical.csv` data to a *Pandas* dataframe:

In [6]:
# Load the categorical target. `read_csv()` returns a DataFrame (here with a
# single `categorical_rating` column and the first CSV column as index),
# hence the DataFrame annotation.
cat_y: DataFrame = read_csv(
    read_csv_params["targets_file_path"] +
    read_csv_params["categorical_target"],
    index_col=0,
)

Check `X`, `class_y`, `cat_y` variables data:

In [7]:
# Preview the first rows of the features.
X.head()

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Preview the first rows of the class target.
class_y.head()

Unnamed: 0,class_rating
0,so-so
1,great
2,great
3,great
4,so-so


In [9]:
# Preview the first rows of the categorical target.
cat_y.head()

Unnamed: 0,categorical_rating
0,2
1,4
2,4
3,5
4,3


Use the `train_test_split()` function to split the features and the `class_y` target into train and test sets:

In [10]:
# Hold out 20% of the rows for testing, stratified on the class target so
# both parts keep the same label proportions; the seed makes it repeatable.
(
    class_X_train,
    class_X_test,
    class_y_train,
    class_y_test,
) = train_test_split(
    X,
    class_y,
    stratify=class_y,
    test_size=0.2,
    random_state=21,
)

Use the `train_test_split()` function to split the features and the `cat_y` target into train and test sets:

In [11]:
# Hold out 20% of the rows for testing, stratified on the categorical target
# so both parts keep the same label proportions; the seed makes it repeatable.
(
    cat_X_train,
    cat_X_test,
    cat_y_train,
    cat_y_test,
) = train_test_split(
    X,
    cat_y,
    stratify=cat_y,
    test_size=0.2,
    random_state=21,
)

Check `cat_X_train`, `cat_X_test`, `cat_y_train`, `cat_y_test`, `class_X_train`, `class_X_test`, `class_y_train`, `class_y_test`:

In [12]:
# Preview the training features of the categorical split.
cat_X_train.head()

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
16991,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Preview the test features of the categorical split.
cat_X_test.head()

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
5532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14955,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10288,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4974,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Preview the training target of the categorical split.
cat_y_train.head()

Unnamed: 0,categorical_rating
16991,4
12957,3
8564,4
15804,4
10259,4


In [15]:
# Preview the test target of the categorical split.
cat_y_test.head()

Unnamed: 0,categorical_rating
5532,4
14955,0
1157,4
10288,3
4974,0


In [16]:
# Preview the training features of the class split.
class_X_train.head()

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
4966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Preview the test features of the class split.
class_X_test.head()

Unnamed: 0,cod,fig,egg,gin,ham,oat,nut,pea,rum,rye,...,fortified wine,sparkling wine,sugar snap pea,beef tenderloin,cranberry sauce,pork tenderloin,poultry sausage,pomegranate juice,jerusalem artichoke,hominy/cornmeal/masa
1975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18206,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Preview the training target of the class split.
class_y_train.head()

Unnamed: 0,class_rating
4966,great
10789,so-so
13147,bad
17598,great
18534,great


In [19]:
# Preview the test target of the class split.
class_y_test.head()

Unnamed: 0,class_rating
1975,great
10759,great
7092,great
18206,great
17798,great


## Prediction:

**Is it worse to predict a `bad` `rating` which is `good` in real life, or to predict a `good` `rating` which is `bad` in real life?**

Answer: it is worse to predict a `good` `rating` which is `bad` in real life.

### Class naive solution:

Print the class naive solution metrics scores:

In [20]:
# Naive baseline: predict the first mode of `class_y` for every row, then
# score that constant prediction against the whole class target.
class_naive_pred = [class_y.mode().iloc[0]] * len(class_y)
class_naive_accuracy = accuracy_score(class_y, class_naive_pred)
class_naive_precision = precision_score(
    class_y,
    class_naive_pred,
    average="weighted",
)
print(
    "The class naive solution accuracy metric is "
    f"{class_naive_accuracy:.3f}.",
    "\nThe class naive solution precision metric is "
    f"{class_naive_precision:.3f}.",
)

The class naive solution accuracy metric is 0.793. 
The class naive solution precision metric is 0.629.


### Category naive solution:

Print the category naive solution metrics scores:

In [21]:
# Naive baseline: predict the first mode of `cat_y` for every row, then
# score that constant prediction against the whole categorical target.
cat_naive_pred = [cat_y.mode().iloc[0]] * len(cat_y)
cat_naive_accuracy = accuracy_score(cat_y, cat_naive_pred)
cat_naive_precision = precision_score(
    cat_y,
    cat_naive_pred,
    average="weighted",
)
print(
    "The category naive solution accuracy metric is "
    f"{cat_naive_accuracy:.3f}.",
    "\nThe category naive solution precision metric is "
    f"{cat_naive_precision:.3f}.",
)

The category naive solution accuracy metric is 0.658. 
The category naive solution precision metric is 0.433.


The classes are too limited and uninformative for predictions. Therefore, categories should be used.

### *Logistic regression* model:

Create a model of *logistic regression*:

In [22]:
# Baseline multinomial logistic regression: fixed seed, all CPU cores.
log_reg_model: LogisticRegression = LogisticRegression(
    multi_class="multinomial",
    random_state=21,
    n_jobs=-1,
)

Print the *logistic regression cross-validation* model metrics scores:

In [23]:
# Score the baseline logistic regression with 10-fold cross-validation, one
# run per metric, and report the mean over the folds.
log_reg_cv_accuracy = cross_val_score(
    log_reg_model,
    X,
    cat_y,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
).mean()
log_reg_cv_precision = cross_val_score(
    log_reg_model,
    X,
    cat_y,
    scoring="precision_weighted",
    cv=10,
    n_jobs=-1,
).mean()
print(
    "The logistic regression cross-validation model accuracy metric score is "
    f"{log_reg_cv_accuracy:.3f}.",
    "\nThe logistic regression cross-validation model precision metric score "
    f"is {log_reg_cv_precision:.3f}.",
)

The logistic regression cross-validation model accuracy metric score is 0.667. 
The logistic regression cross-validation model precision metric score is 0.547.


Create a parameters grid for the *logistic regression* model:

In [24]:
# Parameter grid for the logistic regression search, written as a list of
# sub-grids so that only solver/penalty pairs scikit-learn accepts are tried.
# A single Cartesian grid would pair "l1"/"elasticnet" with solvers that
# reject them and would include "liblinear", which cannot be combined with
# the estimator's `multi_class="multinomial"`; such fits fail and are scored
# as NaN, which goes unnoticed while warnings are ignored.
log_reg_model_params_grid: list[dict[str, list[Any]]] = [
    # Solvers that only support the "l2" penalty.
    {
        "fit_intercept": [True, False],
        "penalty": ["l2"],
        "solver": [
            "sag",
            "lbfgs",
            "newton-cg",
            "newton-cholesky",
        ],
    },
    # "saga" supports both "l1" and "l2" with a multinomial loss.
    {
        "fit_intercept": [True, False],
        "penalty": ["l1", "l2"],
        "solver": ["saga"],
    },
    # "elasticnet" needs "saga" and an explicit `l1_ratio`.
    {
        "fit_intercept": [True, False],
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": [0.5],
    },
]

Create a *gridsearch* model of the *logistic regression* model:

In [25]:
# Grid search over the logistic regression grid: 5-fold cross-validation,
# ranked by weighted precision, using all CPU cores.
log_reg_grid_search_model: GridSearchCV = GridSearchCV(
    estimator=log_reg_model,
    param_grid=log_reg_model_params_grid,
    scoring="precision_weighted",
    cv=5,
    n_jobs=-1,
)

Train the *gridsearch* model of the *logistic regression* model:

In [26]:
# Run the search on the full data; the trailing `;` hides the cell output.
log_reg_grid_search_model.fit(X=X, y=cat_y);

Print the best *logistic regression* model *precision* metric score:

In [27]:
# Report the mean cross-validated precision of the best grid candidate.
print(
    "The best logistic regression model precision metric score is "
    f"{log_reg_grid_search_model.best_score_:.3f}.",
)

The best logistic regression model precision metric score is 0.543.


### *SVC* model:

Create a model of *SVC*:

In [28]:
# Baseline support vector classifier with a one-vs-rest decision function
# and a fixed seed.
svc_model: SVC = SVC(decision_function_shape="ovr", random_state=21)

Print the *SVC cross-validation* model metrics scores:

In [29]:
# Score the baseline SVC with 3-fold cross-validation, one run per metric,
# and report the mean over the folds.
svc_cv_accuracy = cross_val_score(
    svc_model,
    X,
    cat_y,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
).mean()
svc_cv_precision = cross_val_score(
    svc_model,
    X,
    cat_y,
    scoring="precision_weighted",
    cv=3,
    n_jobs=-1,
).mean()
print(
    "The SVC cross-validation model accuracy metric score is "
    f"{svc_cv_accuracy:.3f}.",
    "\nThe SVC cross-validation model precision metric score is "
    f"{svc_cv_precision:.3f}.",
)

The SVC cross-validation model accuracy metric score is 0.670. 
The SVC cross-validation model precision metric score is 0.580.


Create a parameters grid for the *SVC* model:

In [30]:
# Kernels to compare in the SVC grid search.
svc_model_params_grid: dict[str, list[str]] = {
    "kernel": ["rbf", "poly", "sigmoid"],
}

Create a *gridsearch* model of the *SVC* model:

In [31]:
# Grid search over the SVC kernels: 3-fold cross-validation, ranked by
# weighted precision, using all CPU cores.
svc_grid_search_model: GridSearchCV = GridSearchCV(
    estimator=svc_model,
    param_grid=svc_model_params_grid,
    scoring="precision_weighted",
    cv=3,
    n_jobs=-1,
)

Train the *gridsearch* model of the *SVC* model:

In [32]:
# Run the search on the full data; the trailing `;` hides the cell output.
svc_grid_search_model.fit(X=X, y=cat_y);

Print the best *SVC* model *precision* metric score:

In [33]:
# Report the mean cross-validated precision of the best grid candidate.
print(
    "The best SVC model precision metric score is "
    f"{svc_grid_search_model.best_score_:.3f}.",
)

The best SVC model precision metric score is 0.601.


### *Decision tree* model:

Create a model of *decision tree*:

In [34]:
# Baseline decision tree with default hyperparameters and a fixed seed.
tree_model: DecisionTreeClassifier = DecisionTreeClassifier(
    random_state=21,
)

Print the *decision tree cross-validation* model metrics scores:

In [35]:
# Score the baseline decision tree with 10-fold cross-validation, one run
# per metric, and report the mean over the folds.
tree_cv_accuracy = cross_val_score(
    tree_model,
    X,
    cat_y,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
).mean()
tree_cv_precision = cross_val_score(
    tree_model,
    X,
    cat_y,
    scoring="precision_weighted",
    cv=10,
    n_jobs=-1,
).mean()
print(
    "The decision tree cross-validation model accuracy metric score is "
    f"{tree_cv_accuracy:.3f}.",
    "\nThe decision tree cross-validation model precision metric score is "
    f"{tree_cv_precision:.3f}.",
)

The decision tree cross-validation model accuracy metric score is 0.573. 
The decision tree cross-validation model precision metric score is 0.555.


Create a parameters grid for the *decision tree* model:

In [36]:
# Parameter grid for the decision tree search.
# `max_depth` starts at 1: a depth of 0 is not a valid value for
# `DecisionTreeClassifier`, so every fit with it would fail and be scored as
# NaN. The range is materialised as a list to match the annotation.
tree_model_params_grid: dict[str, list[Any]] = {
    "max_depth": list(range(1, 26)),
    "criterion": [
        "gini",
        "entropy",
        "log_loss",
    ],
}

Create a *gridsearch* model of the *decision tree* model:

In [37]:
# Grid search over the decision tree grid: 5-fold cross-validation, ranked
# by weighted precision, using all CPU cores.
tree_grid_search_model: GridSearchCV = GridSearchCV(
    estimator=tree_model,
    param_grid=tree_model_params_grid,
    scoring="precision_weighted",
    cv=5,
    n_jobs=-1,
)

Train the *gridsearch* model of the *decision tree* model:

In [38]:
# Run the search on the full data; the trailing `;` hides the cell output.
tree_grid_search_model.fit(X=X, y=cat_y);

Print the best *decision tree* model *precision* metric score:

In [39]:
# Report the mean cross-validated precision of the best grid candidate.
print(
    "The best decision tree model precision metric score is "
    f"{tree_grid_search_model.best_score_:.3f}.",
)

The best decision tree model precision metric score is 0.554.


### *KNN* model:

Create a model of *KNN*:

In [40]:
# Baseline k-nearest-neighbours classifier with default hyperparameters,
# using all CPU cores.
knn_model: KNeighborsClassifier = KNeighborsClassifier(
    n_jobs=-1,
)

Print the *KNN cross-validation* model metrics scores:

In [41]:
# Score the baseline KNN with 5-fold cross-validation, one run per metric,
# and report the mean over the folds.
knn_cv_accuracy = cross_val_score(
    knn_model,
    X,
    cat_y,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
).mean()
knn_cv_precision = cross_val_score(
    knn_model,
    X,
    cat_y,
    scoring="precision_weighted",
    cv=5,
    n_jobs=-1,
).mean()
print(
    "The KNN cross-validation model accuracy metric score is "
    f"{knn_cv_accuracy:.3f}.",
    "\nThe KNN cross-validation model precision metric score is "
    f"{knn_cv_precision:.3f}.",
)

The KNN cross-validation model accuracy metric score is 0.602. 
The KNN cross-validation model precision metric score is 0.518.


Create a parameters grid for the *KNN* model:

In [42]:
# Neighbour weighting schemes and search algorithms to compare for KNN.
knn_model_params_grid: dict[str, list[str]] = {
    "weights": ["uniform", "distance"],
    "algorithm": ["brute", "kd_tree", "ball_tree"],
}

Create a *gridsearch* model of the *KNN* model:

In [43]:
# Grid search over the KNN grid: 3-fold cross-validation, ranked by
# weighted precision, using all CPU cores.
knn_grid_search_model: GridSearchCV = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_model_params_grid,
    scoring="precision_weighted",
    cv=3,
    n_jobs=-1,
)

Train the *gridsearch* model of the *KNN* model:

In [44]:
# Run the search on the full data; the trailing `;` hides the cell output.
knn_grid_search_model.fit(X=X, y=cat_y);

Print the best *KNN* model *precision* metric score:

In [45]:
# Report the mean cross-validated precision of the best grid candidate.
print(
    "The best KNN model precision metric score is "
    f"{knn_grid_search_model.best_score_:.3f}.",
)

The best KNN model precision metric score is 0.557.


## Model selection:

Check the best classification model parameters:

In [46]:
# Display the fitted SVC grid search, the best-scoring search above.
svc_grid_search_model

0,1,2
,estimator,SVC(random_state=21)
,param_grid,"{'kernel': ['rbf', 'poly', ...]}"
,scoring,'precision_weighted'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1.0
,kernel,'poly'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


Train the best classification model:

In [47]:
# Re-run the SVC search on the training split only, so the held-out test
# split stays unseen; the trailing `;` hides the cell output.
svc_grid_search_model.fit(X=cat_X_train, y=cat_y_train);

Print the best classification model metrics scores:

In [48]:
# Evaluate the refitted SVC search on the held-out test split.
# The prediction is computed once and shared by both metrics, so the test
# set is not predicted twice.
best_model_test_pred = svc_grid_search_model.predict(cat_X_test)
best_model_accuracy = accuracy_score(cat_y_test, best_model_test_pred)
best_model_precision = precision_score(
    cat_y_test,
    best_model_test_pred,
    average="weighted",
)
print(
    "The best classification model accuracy metric score is "
    f"{best_model_accuracy:.3f}.",
    "\nThe best classification model precision metric score is "
    f"{best_model_precision:.3f}.",
)

The best classfification model accuracy metric score is 0.673. 
The best classfification model precision metric score is 0.631.


## Save the model:

Regression models predict the `rating` on a much finer scale than is needed: a difference of `0.25` in the `rating` will not be that important to the user. Therefore, it is worth using classification models.

Create a dictionary of parameters for the `dump()` function call:

In [49]:
# Lookup table for the `dump()` call: the directory prefix and the file name
# are concatenated to build the path of the saved model.
dump_params: dict[str, str] = {
    # Model file name.
    "model_file": "classification_model.joblib",
    # Directory prefix (relative path that ends with a slash).
    "models_files_path": "../../../models/",
}

Save the best classification model:

In [50]:
# Persist the fitted SVC grid search object; the trailing `;` hides the
# cell output.
dump(
    value=svc_grid_search_model,
    filename=f"{dump_params['models_files_path']}{dump_params['model_file']}",
);