In [11]:
import pandas as pd
from sklearn.svm import SVC

In [12]:
df = pd.read_csv('large_data.csv')
df['TYPE'] = df['TYPE'].astype('category')

In [13]:
input_names = list(df.columns)
input_names.remove('TYPE')

all_inputs = df[input_names].values
all_labels = df['TYPE'].values

In [14]:
from sklearn.model_selection import train_test_split

(training_inputs,
 testing_inputs,
 training_classes,
 testing_classes) = train_test_split(all_inputs, all_labels, test_size=0.25, random_state=1, stratify=all_labels)

In [15]:
from sklearn.preprocessing import StandardScaler  

scaler = StandardScaler()
scaler.fit(training_inputs)
training_inputs = scaler.transform(training_inputs)
testing_inputs = scaler.transform(testing_inputs) 

In [16]:
# Create the classifier
clf = SVC()

# Train the classifier on the training set
clf.fit(training_inputs, training_classes)

# Validate the classifier on the testing set using classification accuracy
clf.score(testing_inputs, testing_classes)

0.919380960950153

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = clf.predict(testing_inputs)
print(classification_report(testing_classes, predictions))

              precision    recall  f1-score   support

     ALLERGY       0.95      0.99      0.97      4096
        COLD       0.57      0.46      0.51       256
       COVID       0.40      0.42      0.41       512
         FLU       0.95      0.93      0.94      6250

    accuracy                           0.92     11114
   macro avg       0.72      0.70      0.71     11114
weighted avg       0.92      0.92      0.92     11114



In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
                
parameter_grid = {'C' : [0.1, 1, 10, 100], 
            'gamma' : [1,0.1,0.01,0.001],
            'kernel': ['rbf']}

cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(SVC(),
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           refit=True,
                           verbose=3)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best estimator: {}'.format(grid_search.best_estimator_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time= 6.8min
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time= 7.4min
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time= 7.3min
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time= 6.7min
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time= 6.4min
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  23.5s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  23.6s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  22.6s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  22.8s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  22.7s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=  29.7s
[CV 2/5] END ..................C=0.1, gamma=0.01

In [9]:
parameter_grid = {'C' : [0.1, 1, 10, 100], 
            'gamma' : [0.1,0.01,0.001],
            'kernel': ['rbf', 'poly', 'sigmoid']}

cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(SVC(),
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           refit=True,
                           verbose=3)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best estimator: {}'.format(grid_search.best_estimator_))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  19.9s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  19.4s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  19.9s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  24.9s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=  20.5s
[CV 1/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=  19.2s
[CV 2/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=  16.6s
[CV 3/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=  17.4s
[CV 4/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=  16.2s
[CV 5/5] END ..................C=0.1, gamma=0.1, kernel=poly; total time=  17.0s
[CV 1/5] END ...............C=0.1, gamma=0.1, kernel=sigmoid; total time=  13.9s
[CV 2/5] END ...............C=0.1, gamma=0.1, k

In [10]:
parameter_grid = {'C' : [0.1, 0.5, 1, 10, 100],
            'kernel': ['linear']}

cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(SVC(),
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           refit=True,
                           verbose=3)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best estimator: {}'.format(grid_search.best_estimator_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...........................C=0.1, kernel=linear; total time=  39.6s
[CV 2/5] END ...........................C=0.1, kernel=linear; total time=  28.2s
[CV 3/5] END ...........................C=0.1, kernel=linear; total time=  26.3s
[CV 4/5] END ...........................C=0.1, kernel=linear; total time=  32.6s
[CV 5/5] END ...........................C=0.1, kernel=linear; total time=  28.0s
[CV 1/5] END ...........................C=0.5, kernel=linear; total time=   8.9s
[CV 2/5] END ...........................C=0.5, kernel=linear; total time=  11.2s
[CV 3/5] END ...........................C=0.5, kernel=linear; total time=  13.2s
[CV 4/5] END ...........................C=0.5, kernel=linear; total time=  13.3s
[CV 5/5] END ...........................C=0.5, kernel=linear; total time=  10.4s
[CV 1/5] END .............................C=1, kernel=linear; total time=   8.7s
[CV 2/5] END .............................C=1, ke