# SVM
This notebook will contain my attempts using various kernels with the SVM approach on both of eBoss and ManGa datasets.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import warnings
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, auc, roc_auc_score, make_scorer
from sklearn.svm import SVC, LinearSVC, NuSVC

## eBoss Data

Reading in the eboss data

In [3]:
raw_data_eBoss = pd.read_csv("../../../Data/Astronomy.csv")

Taking out the id column

In [4]:
trainable_data = raw_data_eBoss.iloc[:, 1:].copy(deep=True)

Seperating the X, and Y variables, and doing needed preprocessing.

In [5]:
X = trainable_data.iloc[:, :-1].copy(deep=True)
Y = trainable_data.iloc[:, -1:].copy(deep=True)
X = X.apply(pd.to_numeric, args={'errors': 'coerce'})
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

ss = MinMaxScaler()
X_scaled = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = X_scaled

# SVM with Various Hyperparameters on eBoss

In [6]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
# params_grid = [
#     {"C": [.0001, .001, .1, 1],  "kernel": ['linear']},
#     {'C': [.0001, .001, .1, 1], "kernel": ['rbf', 'sigmoid'], 'gamma': [.001, 1, 100, 'auto']},
#     {"C": [.0001, .001, .1, 1], "kernel": ['poly'], "degree": [3,4,5], 'gamma': [.001, 1, 100, 'auto']}
# ]
params_grid = [{"C": [1],  "kernel": ['linear']}]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Cross val with svm 10 fols.
    grid_clf = GridSearchCV(SVC(),
                            cv=KFold(n_splits=10, shuffle=True),
                            param_grid=params_grid,
                            scoring=scoring_dict,
                            refit='accuracy', 
                            return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

{'mean_test_precision (no)': array([0.88966983]),
 'mean_test_precision (yes)': array([0.94097057]),
 'mean_test_recall (no)': array([0.80224397]),
 'mean_test_recall (yes)': array([0.96011365]),
 'mean_test_accuracy': array([0.9252669]),
 'mean_test_auc': array([0.69928826]),
 'mean_test_roc_auc': array([0.88117881]),
 'mean_test_f1': array([0.9497891]),
 'params': [{'C': 1, 'kernel': 'linear'}]}

In [7]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)

0.9252669039145908
{'C': 1, 'kernel': 'linear'}


# Linear SVM on eBoss

In [8]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params for linear SVC
params_grid = [
    {"C": [.0001, .001, .1, 1],  "loss": ['hinge', 'squared_hinge']},
]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Cross val with svm 10 fols.
    grid_clf = GridSearchCV(LinearSVC(),
                            cv=KFold(n_splits=10, shuffle=True),
                            param_grid=params_grid,
                            scoring=scoring_dict,
                            refit='accuracy', 
                            return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

{'mean_test_precision (no)': array([0.        , 0.        , 0.        , 0.        , 0.8510083 ,
        0.85832345, 0.9059312 , 0.87280122]),
 'mean_test_precision (yes)': array([0.75800712, 0.75800712, 0.75800712, 0.75800712, 0.89421693,
        0.91959342, 0.94421743, 0.9443091 ]),
 'mean_test_recall (no)': array([0.        , 0.        , 0.        , 0.        , 0.66692651,
        0.73651923, 0.83812913, 0.83294357]),
 'mean_test_recall (yes)': array([1.        , 1.        , 1.        , 1.        , 0.95837822,
        0.95837822, 0.97228735, 0.96326396]),
 'mean_test_accuracy': array([0.75800712, 0.75800712, 0.75800712, 0.75800712, 0.88256228,
        0.90391459, 0.93594306, 0.92882562]),
 'mean_test_auc': array([1.        , 1.        , 1.        , 1.        , 0.75088968,
        0.65124555, 0.54982206, 0.54982206]),
 'mean_test_roc_auc': array([0.5       , 0.5       , 0.5       , 0.5       , 0.81265236,
        0.84744872, 0.90520824, 0.89810376]),
 'mean_test_f1': array([0.86074326

In [9]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)

0.9359430604982206
{'C': 1, 'loss': 'hinge'}


# Manga Data:

In [11]:
raw_data_Manga = pd.read_csv("../../../Data/Astronomy20000_Original.csv")

Seperating the X, and Y variables, and doing needed preprocessing.

In [3]:
X = raw_data_Manga.iloc[:, 1:-1].copy(deep=True)
Y = raw_data_Manga.iloc[:, [-1]].copy(deep=True)
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

Making model and testing it.

# Linear SVM on Manga

In [None]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc, reorder=True),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params for linear SVC
params_grid = [
    {"C": [.0001, .001, .1, 1]},
]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Cross val with svm 10 fols.
    grid_clf = GridSearchCV(LinearSVC(),
                            cv=KFold(n_splits=10, shuffle=True),
                            param_grid=params_grid,
                            scoring=scoring_dict,
                            refit='accuracy', 
                            return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

In [5]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)

NameError: name 'grid_clf' is not defined

# SVM Various HyperParameters on Manga

In [None]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
params_grid = [
    {"C": [.001],  "kernel": ['linear']},
    {'C': [1], "kernel": ['rbf', 'sigmoid'], 'gamma': ['auto']},
    {"C": [1], "kernel": ['poly'], "degree": [3], 'gamma': ['auto']}
]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Cross val with svm 10 fols.
    grid_clf = GridSearchCV(SVC(),
                            cv=KFold(n_splits=10, shuffle=True),
                            param_grid=params_grid,
                            scoring=scoring_dict,
                            refit='accuracy', 
                            return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

In [None]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)