### Predictive Model with SVM

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

#Load libraries for data processing
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from scipy.stats import norm

## Supervised learning.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report

from scipy.io import loadmat 

# visualization
import seaborn as sns 
# Plot appearance: apply matplotlib's 'fivethirtyeight' style sheet first,
# then seaborn's "white" axes style on top of it.
plt.style.use('fivethirtyeight')
sns.set_style("white")

# Default (width, height) for every figure created in this notebook.
plt.rcParams.update({'figure.figsize': (8, 4)})

In [2]:
# Load the MATLAB .mat file; loadmat returns a dict keyed by variable name.
# NOTE(review): the path is relative to the notebook's working directory.
xx=loadmat('./mu_60_new.mat')

In [3]:
# Inspect the loaded dict: MATLAB header metadata plus the 'mu_60' data array.
xx

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Sun Oct 29 19:23:43 2023',
 '__version__': '1.0',
 '__globals__': [],
 'mu_60': array([[-1.97530000e-02,  1.44514000e-01,  2.65190000e-02, ...,
          5.49296790e+01,  7.58764900e+01,  2.00000000e+00],
        [-2.04746000e-01,  2.32410000e-02,  1.07463000e-01, ...,
          5.48823670e+01,  7.56483330e+01,  1.00000000e+00],
        [ 2.72124000e-01, -9.39210000e-02,  1.17581000e-01, ...,
          5.49296790e+01,  7.59998190e+01,  2.00000000e+00],
        ...,
        [ 9.02400000e-03, -2.60900000e-02,  1.14258724e+02, ...,
          0.00000000e+00,  7.53030130e+01,  1.00000000e+00],
        [ 2.13560000e-02, -1.26808000e-01,  1.14400376e+02, ...,
          0.00000000e+00,  7.55065050e+01,  1.00000000e+00],
        [-5.67520000e-02, -9.59760000e-02,  1.14127190e+02, ...,
          0.00000000e+00,  7.53831770e+01,  1.00000000e+00]])}

In [4]:
# Pull the data matrix out of the loaded dict.
# Despite the names, these are plain NumPy arrays, not pandas DataFrames, and
# both names are bound to the same xx['mu_60'] object (no copy is made here).
# `df` is sliced for the features and `df2` for the labels in the next cells.
df=xx['mu_60']
df2=xx['mu_60']

In [5]:
# Feature matrix: the first 8 columns of every row.
X=df[:,:8]

In [6]:
# Target vector: the last column of every row (the displayed rows show the
# values 1.0 and 2.0 in that column).
y=df2[:,-1]

In [None]:
# Sanity check: (number of rows, number of feature columns).
X.shape

In [None]:
# Sanity check: the label vector should have one entry per row of X.
y.shape

In [None]:
# Peek at the feature values (NumPy abbreviates large arrays when displayed).
X

In [None]:
# Peek at the raw label values.
y

In [None]:
# Confirm the container type of the labels (y is a slice of the loaded array).
type(y)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Univariate feature selection: score every column against y with f_classif
# and keep the 5 highest-scoring columns.
# NOTE: this overwrites X, so the original feature matrix has to be rebuilt
# from `df` before re-running this cell with a different k.
k_best_selector = SelectKBest(score_func=f_classif, k=5)
X = k_best_selector.fit_transform(X, y)

In [None]:
# Shape after feature selection: the second dimension should now be 5 (k=5).
X.shape

In [7]:
# Encode the class labels as consecutive integers starting at 0.
# The labels come from the last column of the data matrix and are numeric
# (the displayed rows show 1.0 and 2.0), not strings.
le = LabelEncoder()
y = le.fit_transform(y)

# Standardize each feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before any train/test or
# cross-validation split, so held-out rows contribute to the scaling statistics.
scaler =StandardScaler()
Xs = scaler.fit_transform(X)

In [None]:
# The standardized matrix should have the same shape as X.
Xs.shape

In [None]:
# compare the number of repeats for repeated k-fold cross-validation
from scipy.stats import sem
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot

In [None]:
# Shared estimator: the evaluate_model_* helpers defined below read this
# module-level name instead of taking the model as an argument.
clf = SVC(probability=True)
def evaluate_model_acc(X, y, repeats):
    """Cross-validated accuracy of the module-level ``clf`` on ``(X, y)``.

    Uses 10-fold cross-validation repeated ``repeats`` times with a fixed
    seed (``random_state=1``).

    Parameters
    ----------
    X : feature matrix, passed straight to ``cross_val_score``.
    y : target labels, passed straight to ``cross_val_score``.
    repeats : int
        How many times the 10-fold split is repeated.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    # n_jobs=-1 asks scikit-learn to evaluate the folds in parallel.
    return cross_val_score(clf, X, y, scoring='accuracy', cv=folds, n_jobs=-1)
 
def evaluate_model_apr(X, y, repeats):
    """Cross-validated average precision of the module-level ``clf``.

    Uses 10-fold cross-validation repeated ``repeats`` times with a fixed
    seed (``random_state=1``).

    Parameters
    ----------
    X : feature matrix, passed straight to ``cross_val_score``.
    y : target labels, passed straight to ``cross_val_score``.
    repeats : int
        How many times the 10-fold split is repeated.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    # n_jobs=-1 asks scikit-learn to evaluate the folds in parallel.
    return cross_val_score(clf, X, y, scoring='average_precision', cv=folds, n_jobs=-1)

def evaluate_model_f1(X, y, repeats):
    """Cross-validated F1 score of the module-level ``clf`` on ``(X, y)``.

    Uses 10-fold cross-validation repeated ``repeats`` times with a fixed
    seed (``random_state=1``) and scikit-learn's ``'f1'`` scorer.

    Parameters
    ----------
    X : feature matrix, passed straight to ``cross_val_score``.
    y : target labels, passed straight to ``cross_val_score``.
    repeats : int
        How many times the 10-fold split is repeated.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    # n_jobs=-1 asks scikit-learn to evaluate the folds in parallel.
    return cross_val_score(clf, X, y, scoring='f1', cv=folds, n_jobs=-1)

def evaluate_model_recall(X, y, repeats):
    """Cross-validated recall of the module-level ``clf`` on ``(X, y)``.

    Uses 10-fold cross-validation repeated ``repeats`` times with a fixed
    seed (``random_state=1``) and scikit-learn's ``'recall'`` scorer.

    Parameters
    ----------
    X : feature matrix, passed straight to ``cross_val_score``.
    y : target labels, passed straight to ``cross_val_score``.
    repeats : int
        How many times the 10-fold split is repeated.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    # n_jobs=-1 asks scikit-learn to evaluate the folds in parallel.
    return cross_val_score(clf, X, y, scoring='recall', cv=folds, n_jobs=-1)

# Configurations to test: 1, 2 and 3 repeats of the 10-fold cross-validation.
repeats = range(1,4)
# One list per metric; each entry holds the per-fold scores for one setting
# of `repeats`, in the same order as `repeats`.
results_acc = list()
results_apr = list()
results_f1 = list()
results_recall = list()

for r in repeats:
    # evaluate every metric using the given number of repeats
    scores_acc = evaluate_model_acc(Xs, y, r)
    scores_apr = evaluate_model_apr(Xs, y, r)
    scores_f1 = evaluate_model_f1(Xs, y, r)
    scores_recall = evaluate_model_recall(Xs, y, r)
    # summarize: one line per metric, tagged with the metric name so the four
    # lines printed for each `r` can be told apart
    for metric_name, metric_scores in (
        ('accuracy', scores_acc),
        ('average_precision', scores_apr),
        ('f1', scores_f1),
        ('recall', scores_recall),
    ):
        print('>%d %s mean=%.4f se=%.3f std=%.3f' % (
            r, metric_name, mean(metric_scores), sem(metric_scores), std(metric_scores)))
    # store
    results_acc.append(scores_acc)
    results_apr.append(scores_apr)
    results_f1.append(scores_f1)
    results_recall.append(scores_recall)

# Plot the accuracy distribution for each number of repeats.
# Tick labels are set through xticks instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate; boxes sit at positions 1..N.
pyplot.boxplot(results_acc, showmeans=True)
pyplot.xticks(range(1, len(results_acc) + 1), [str(r) for r in repeats])
pyplot.xlabel('number of repeats')
pyplot.ylabel('accuracy')
pyplot.title('Repeated 10-fold CV accuracy (SVC)')
pyplot.show()

In [None]:
# Summary of the average-precision scores left in `scores_apr` by the last
# iteration of the comparison loop (the run with the largest number of
# repeats). This cell therefore depends on that loop having been run first.
print('>mean=%.4f se=%.3f std=%.3f' % ( mean(scores_apr), sem(scores_apr), std(scores_apr)))

In [8]:
# Hold-out evaluation: fit a fresh SVC on a stratified 70/30 split of the
# standardized data and tabulate its predictions on the test part.
# NOTE(review): Xs was standardized on the full dataset before this split, so
# the test rows contributed to the scaling statistics.
clf = SVC(probability=True)
X_train, X_test, y_train, y_test = train_test_split(
    Xs, y, test_size=0.3, random_state=2, stratify=y
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix of true vs. predicted labels; the bare name displays it.
cm = confusion_matrix(y_test, y_pred)
cm

array([[1875,   56],
       [ 303,  136]])

In [9]:
# Per-class precision, recall, F1 and support for the hold-out predictions.
print(classification_report(y_test, y_pred ))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1931
           1       0.71      0.31      0.43       439

    accuracy                           0.85      2370
   macro avg       0.78      0.64      0.67      2370
weighted avg       0.83      0.85      0.82      2370

