We perform cross-validation for several classification models and save the results.

# Imports

In [1]:
# Standard imports

from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Sklearn imports

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Load data

In [3]:
from src.utils import get_data

# Fetch the classification variant of the dataset through the project helper.
data = get_data(classification=True)

# Only items 0 and 2 of the returned container are used in this notebook.
# NOTE(review): presumably these are the training features and training labels
# of a train/test split -- confirm against src.utils.get_data.
X_train = data[0]
y_train = data[2]

# Define models

In [4]:
# Every candidate is described by a (name, estimator, hyper-parameter grid) triple.
# Grid keys follow the "<name>__<parameter>" pattern.
# NOTE(review): presumably grid_search_best registers the estimator under <name>
# as a pipeline step -- confirm in src.utils.

# Logistic regression with the liblinear solver; L1 and L2 penalties are both searched.
log_reg = (
    "log_reg",
    LogisticRegression(solver="liblinear"),
    {
        "log_reg__penalty": ["l1", "l2"],
        "log_reg__C": np.logspace(-2, 3, num=6),  # 1e-2 ... 1e3
    },
)

# k-nearest neighbours; only the neighbourhood size is tuned.
knn = (
    "knn",
    KNeighborsClassifier(),
    {"knn__n_neighbors": [1, 3, 5, 10, 20, 50]},
)

# Support vector machine with a linear kernel; only C is tuned.
svm_lin = (
    "svm_lin",
    SVC(kernel="linear"),
    {"svm_lin__C": np.logspace(-3, 3, num=7)},  # 1e-3 ... 1e3
)

# Support vector machine with an RBF kernel; C and gamma are tuned jointly.
svm_rbf = (
    "svm_rbf",
    SVC(kernel="rbf"),
    {
        "svm_rbf__C": np.logspace(-3, 3, num=7),  # 1e-3 ... 1e3
        "svm_rbf__gamma": np.logspace(-4, 2, num=7),  # 1e-4 ... 1e2
    },
)

# Gaussian naive Bayes; the grid is empty, so no hyper-parameter is searched.
gaussian_nb = ("gaussian_nb", GaussianNB(), {})

# Random forest with a fixed seed; number of trees and tree depth are tuned.
random_forest = (
    "random_forest",
    RandomForestClassifier(random_state=314),
    {
        "random_forest__n_estimators": [10, 100, 500, 1000],
        "random_forest__max_depth": [1, 3, 5, 10],
    },
)

# All candidates, in the order they are cross-validated.
models = [
    log_reg,
    knn,
    svm_lin,
    svm_rbf,
    gaussian_nb,
    random_forest,
]

# Apply CV

In [5]:
from src.utils import grid_search_best, l2_score

# For each model, collect the flattened output of grid_search_best under every
# scoring criterion; the model name becomes the dictionary key.
results_dic = {}

for name, estimator, param_grid in models:
    per_model = []
    # Criteria are evaluated in a fixed order: accuracy, macro F1, then the project's l2_score.
    for metric in ("accuracy", "f1_macro", l2_score):
        cv_output = grid_search_best(
            X_train, y_train, name, estimator, param_grid, scoring=metric
        )
        per_model.extend(cv_output)
    results_dic[name] = per_model

# One column per model; rows follow the order in which the values were appended.
results = pd.DataFrame(results_dic)

In [6]:
# Show the CV results table: one column per model; for each scoring criterion
# (accuracy, f1_macro, l2_score, in that order) a row of parameter dicts is
# followed by a row with the corresponding scores.
results

Unnamed: 0,log_reg,knn,svm_lin,svm_rbf,gaussian_nb,random_forest
0,"{'log_reg__C': 1.0, 'log_reg__penalty': 'l2'}",{'knn__n_neighbors': 10},{'svm_lin__C': 10.0},"{'svm_rbf__C': 1000.0, 'svm_rbf__gamma': 0.0001}",{},"{'random_forest__max_depth': 5, 'random_forest..."
1,0.409545,0.383168,0.455937,0.45595,0.389601,0.412782
2,"{'log_reg__C': 100.0, 'log_reg__penalty': 'l1'}",{'knn__n_neighbors': 10},{'svm_lin__C': 10.0},"{'svm_rbf__C': 100.0, 'svm_rbf__gamma': 0.01}",{},"{'random_forest__max_depth': 5, 'random_forest..."
3,0.311673,0.354751,0.395219,0.385643,0.36977,0.360211
4,"{'log_reg__C': 100.0, 'log_reg__penalty': 'l1'}",{'knn__n_neighbors': 10},{'svm_lin__C': 10.0},"{'svm_rbf__C': 100.0, 'svm_rbf__gamma': 0.01}",{},"{'random_forest__max_depth': 5, 'random_forest..."
5,-1.346276,-1.363874,-1.22548,-1.213796,-1.296457,-1.316283


We see that the SVM performs best. Considering the very small differences between kernels, it makes sense to use the linear kernel with C = 10.

Let us try the logistic regression and SVM models with polynomial features.

In [7]:
# Only logistic regression and the two SVM variants are re-run with polynomial features.
models_poly = [log_reg, svm_lin, svm_rbf]
results_dic_poly = {}

for name, estimator, param_grid in models_poly:
    per_model = []
    # Same scoring criteria and order as in the run without polynomial features.
    for metric in ("accuracy", "f1_macro", l2_score):
        cv_output = grid_search_best(
            X_train,
            y_train,
            name,
            estimator,
            param_grid,
            scoring=metric,
            polyfeat=True,
        )
        per_model.extend(cv_output)
    results_dic_poly[name] = per_model

# One column per model; rows follow the order in which the values were appended.
results_poly = pd.DataFrame(results_dic_poly)

In [8]:
# Show the polynomial-feature CV results: one column per model; for each scoring
# criterion (accuracy, f1_macro, l2_score, in that order) a row of parameter
# dicts is followed by a row with the corresponding scores.
results_poly

Unnamed: 0,log_reg,svm_lin,svm_rbf
0,"{'log_reg__C': 1.0, 'log_reg__penalty': 'l1', ...","{'poly_feat__degree': 1, 'svm_lin__C': 10.0}","{'poly_feat__degree': 1, 'svm_rbf__C': 1000.0,..."
1,0.432713,0.455937,0.45595
2,"{'log_reg__C': 10.0, 'log_reg__penalty': 'l1',...","{'poly_feat__degree': 1, 'svm_lin__C': 10.0}","{'poly_feat__degree': 1, 'svm_rbf__C': 100.0, ..."
3,0.393175,0.395219,0.385643
4,"{'log_reg__C': 1.0, 'log_reg__penalty': 'l1', ...","{'poly_feat__degree': 2, 'svm_lin__C': 1.0}","{'poly_feat__degree': 1, 'svm_rbf__C': 100.0, ..."
5,-1.235498,-1.21559,-1.213796


We see a clear improvement only for logistic regression (the linear SVM's L2 score changes only marginally, with degree-2 features), and it is still not as good as the SVM, so we stick to linear SVM with C = 10.

In [9]:
# Save results

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
results_dir = Path("../results/classification")
results_dir.mkdir(parents=True, exist_ok=True)

# Write both result tables (with and without polynomial features) as CSV files.
results.to_csv(results_dir / "CV_results.csv")
results_poly.to_csv(results_dir / "CV_poly_results.csv")