### Predictive Model with SVM

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

#Load libraries for data processing
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from scipy.stats import norm

## Supervised learning.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report

from scipy.io import loadmat 

# Visualization setup: plotting library plus global figure styling.
import seaborn as sns 
# Order matters: the matplotlib style sheet is applied first, then seaborn's
# "white" axes style is layered on top of it.
plt.style.use('fivethirtyeight')
sns.set_style("white")

# Default figure size (width, height) for every plot created after this cell.
plt.rcParams['figure.figsize'] = (8,4)

In [2]:
# Load the MATLAB .mat file (path is relative to the working directory) into a
# dict; the data array is stored under the key 'wk_60'.
xx=loadmat('./wk_60_new.mat')

In [3]:
xx

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Tue Oct 31 02:13:28 2023',
 '__version__': '1.0',
 '__globals__': [],
 'wk_60': array([[ 7.47990000e-02, -3.24134000e-01,  5.68730000e-02, ...,
          5.23274980e+01,  7.57963270e+01,  2.00000000e+00],
        [ 4.73560000e-01,  4.93945000e-01, -9.48970000e-02, ...,
          5.23274980e+01,  7.56483330e+01,  2.00000000e+00],
        [ 7.47990000e-02,  6.83048000e-01, -2.26431000e-01, ...,
          5.22801860e+01,  7.55496700e+01,  2.00000000e+00],
        ...,
        [ 1.51900000e-02,  1.01349000e-01,  1.30437411e+02, ...,
          0.00000000e+00,  7.56175010e+01,  2.00000000e+00],
        [-5.88070000e-02,  5.20180000e-02,  1.30670125e+02, ...,
          0.00000000e+00,  7.55065050e+01,  2.00000000e+00],
        [ 7.47990000e-02,  7.25730000e-02,  1.30154107e+02, ...,
          0.00000000e+00,  7.55311710e+01,  2.00000000e+00]])}

In [4]:
# Both names are bound to the same array object from the loaded dict (no copy
# is made), so df2 is only an alias of df.
df=xx['wk_60']
df2=xx['wk_60']

In [5]:
# Feature matrix: the first 8 columns of wk_60.
# NOTE(review): the printed arrays suggest wk_60 has 10 columns, so column
# index 8 is used neither as a feature nor as the label — confirm intentional.
X=df[:,:8]

In [6]:
# Target vector: the last column of wk_60 (df2 refers to the same array as df).
y=df2[:,-1]

In [7]:
X.shape

(5800, 8)

In [8]:
y.shape

(5800,)

In [9]:
X

array([[ 7.47990000e-02, -3.24134000e-01,  5.68730000e-02, ...,
         2.14258395e+02,  5.20909360e+01,  5.23274980e+01],
       [ 4.73560000e-01,  4.93945000e-01, -9.48970000e-02, ...,
         2.14272931e+02,  5.21382490e+01,  5.23274980e+01],
       [ 7.47990000e-02,  6.83048000e-01, -2.26431000e-01, ...,
         2.14287453e+02,  5.20909360e+01,  5.22801860e+01],
       ...,
       [ 1.51900000e-02,  1.01349000e-01,  1.30437411e+02, ...,
         2.54587473e+02,  0.00000000e+00,  0.00000000e+00],
       [-5.88070000e-02,  5.20180000e-02,  1.30670125e+02, ...,
         2.54587473e+02,  0.00000000e+00,  0.00000000e+00],
       [ 7.47990000e-02,  7.25730000e-02,  1.30154107e+02, ...,
         2.54587473e+02,  0.00000000e+00,  0.00000000e+00]])

In [None]:
y

In [None]:
type(y)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Keep the 5 features that score highest under f_classif against y.
# NOTE(review): this rebinds X, so every later cell sees the reduced matrix.
# The saved output of Xs.shape further down is (5800, 8), which suggests this
# cell was not run for the recorded results — confirm whether it should be.
# NOTE(review): the selector is fit on all rows, i.e. before any train/test
# split or cross-validation.
X = SelectKBest(f_classif, k=5).fit_transform(X, y)

In [None]:
X.shape

In [10]:
# Encode the class labels as consecutive integers (0, 1, ...). Here y holds the
# numeric values taken from the last column of wk_60, not strings.
# NOTE: this rebinds y, so the cell's input is overwritten by its output.
le = LabelEncoder()
y = le.fit_transform(y)

# Standardize each feature to zero mean and unit variance.
# NOTE(review): the scaler is fit on every row, before any train/test split or
# cross-validation, so held-out rows contribute to the scaling statistics.
scaler =StandardScaler()
Xs = scaler.fit_transform(X)

In [11]:
Xs.shape

(5800, 8)

In [12]:
# compare the number of repeats for repeated k-fold cross-validation
from scipy.stats import sem
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot

In [None]:
# Evaluate one model under repeated 10-fold cross-validation, comparing the
# effect of using 1, 2 and 3 repeats.

# Estimator that cross_val_score clones and refits for every fold. The scaler
# lives inside the pipeline so that it is refit on each training split only and
# the held-out fold never contributes to the scaling statistics. Standardizing
# the already-standardized Xs a second time is harmless: it is equivalent to
# scaling the raw features with the training-fold statistics.
# NOTE(review): nothing in this cell calls predict_proba; probability=True makes
# every fit slower (scikit-learn runs an internal cross-validation for it) —
# consider dropping it.
clf = make_pipeline(StandardScaler(), SVC(probability=True))


def _cross_validate_metric(X, y, repeats, scoring):
    """Score the module-level ``clf`` with repeated 10-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix passed through to ``cross_val_score``.
    y : array-like
        Target labels passed through to ``cross_val_score``.
    repeats : int
        Number of times the 10-fold split is repeated (``n_repeats``).
    scoring : str
        Name of a scikit-learn scorer, e.g. ``'accuracy'``.

    Returns
    -------
    numpy.ndarray
        One score per split, i.e. ``10 * repeats`` values.
    """
    # Fixed random_state keeps the fold assignment identical across metrics,
    # so the four metrics are computed on exactly the same splits.
    cv = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    return cross_val_score(clf, X, y, scoring=scoring, cv=cv, n_jobs=-1)


def evaluate_model_acc(X, y, repeats):
    """Return the per-split accuracy scores for ``repeats`` repeats."""
    return _cross_validate_metric(X, y, repeats, 'accuracy')


def evaluate_model_apr(X, y, repeats):
    """Return the per-split average-precision scores for ``repeats`` repeats."""
    return _cross_validate_metric(X, y, repeats, 'average_precision')


def evaluate_model_f1(X, y, repeats):
    """Return the per-split F1 scores for ``repeats`` repeats."""
    return _cross_validate_metric(X, y, repeats, 'f1')


def evaluate_model_recall(X, y, repeats):
    """Return the per-split recall scores for ``repeats`` repeats."""
    return _cross_validate_metric(X, y, repeats, 'recall')


# configurations to test
repeats = range(1,4)
results_acc = list()
results_apr = list()
results_f1 = list()
results_recall = list()

for r in repeats:
    # evaluate using a given number of repeats
    scores_acc = evaluate_model_acc(Xs, y, r)
    scores_apr = evaluate_model_apr(Xs, y, r)
    scores_f1 = evaluate_model_f1(Xs, y, r)
    scores_recall = evaluate_model_recall(Xs, y, r)
    # summarize; each line names its metric so the four rows can be told apart
    for metric_name, metric_scores in (
        ('accuracy', scores_acc),
        ('average_precision', scores_apr),
        ('f1', scores_f1),
        ('recall', scores_recall),
    ):
        print('>%d %s mean=%.4f se=%.3f std=%.3f' % (
            r, metric_name, mean(metric_scores), sem(metric_scores), std(metric_scores)))
    # store
    results_acc.append(scores_acc)
    results_apr.append(scores_apr)
    results_f1.append(scores_f1)
    results_recall.append(scores_recall)

# plot the accuracy distribution for each number of repeats
pyplot.boxplot(results_acc, labels=[str(r) for r in repeats], showmeans=True)
pyplot.xlabel('number of repeats')
pyplot.ylabel('accuracy')
pyplot.title('Repeated 10-fold CV accuracy')
pyplot.show()

In [None]:
# Summary of the average-precision scores left over from the last iteration of
# the repeats loop in the previous cell (scores_apr is not recomputed here).
print('>mean=%.4f se=%.3f std=%.3f' % ( mean(scores_apr), sem(scores_apr), std(scores_apr)))

In [13]:
# Hold out 30% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    y,
    test_size=0.3,
    random_state=2,
    stratify=y,
)

# Fit a fresh SVC on the training rows and predict the held-out rows.
clf = SVC(probability=True)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix of the held-out predictions, shown as the cell output.
cm = confusion_matrix(y_test, y_pred)
cm

array([[916, 122],
       [396, 306]])

In [14]:
# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.70      0.88      0.78      1038
           1       0.71      0.44      0.54       702

    accuracy                           0.70      1740
   macro avg       0.71      0.66      0.66      1740
weighted avg       0.70      0.70      0.68      1740

