# Support Vector Machines

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, confusion_matrix
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd

In [2]:
raw_train_data = "../../data/train.csv"
raw_test_data = "../../data/test.csv"

train_data = pd.read_csv(raw_train_data)
test_data = pd.read_csv(raw_test_data)

In [6]:
train_data

Unnamed: 0,SNo,lat,lon,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,...,T200,T500,PRECT,TS,TREFHT,Z1000,Z200,ZBOT,time,Label
0,1,21.707953,275.0000,78.909431,-58.805229,36.200672,-58.260475,-0.722955,0.022321,97497.79688,...,227.119232,273.045563,1.390000e-05,302.350861,300.026764,66.339836,12400.22754,66.241585,20031024,1
1,2,21.707953,275.0000,78.909431,-58.805229,36.200672,-58.260475,-0.722955,0.022321,97497.79688,...,227.119232,273.045563,1.390000e-05,302.350861,300.026764,66.339836,12400.22754,66.241585,20031024,1
2,3,21.473272,275.0000,80.641357,-42.340290,58.060246,-56.400234,26.484743,0.022204,96611.60156,...,229.172897,273.580810,1.440000e-05,302.350861,299.798828,66.165573,12371.33203,66.165268,20031024,1
3,4,21.473272,275.0000,80.641357,-42.340290,58.060246,-56.400234,26.484743,0.022204,96611.60156,...,229.172897,273.580810,1.440000e-05,302.350861,299.798828,66.165573,12371.33203,66.165268,20031024,1
4,5,21.707953,275.3125,76.891205,-45.996342,42.691631,-48.993065,10.246445,0.022328,98168.55469,...,224.642563,272.152283,9.890000e-06,302.462708,300.160187,66.880089,12386.88574,66.275246,20031024,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44755,44756,13.494133,253.7500,58.471558,26.397322,5.664860,30.215387,1.547393,0.021974,98739.55469,...,217.845352,272.109589,1.120000e-07,301.878662,300.574646,66.438522,12340.96973,66.438522,20011207,1
44756,44757,13.494133,253.7500,58.471558,26.397322,5.664860,30.215387,1.547393,0.021974,98739.55469,...,217.845352,272.109589,1.120000e-07,301.878662,300.574646,66.438522,12340.96973,66.438522,20011207,0
44757,44758,13.494133,253.7500,58.471558,26.397322,5.664860,30.215387,1.547393,0.021974,98739.55469,...,217.845352,272.109589,1.120000e-07,301.878662,300.574646,66.438522,12340.96973,66.438522,20011207,1
44758,44759,13.494133,253.7500,58.471558,26.397322,5.664860,30.215387,1.547393,0.021974,98739.55469,...,217.845352,272.109589,1.120000e-07,301.878662,300.574646,66.438522,12340.96973,66.438522,20011207,1


In [16]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop("Label", axis=1), train_data.Label, test_size=0.3, random_state=42
)

# Standardize the data
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

class_weights = {0: 1, 1: 16, 2: 5.3}  # for class 0  # for class 1  # for class 2

# Define a pipeline with feature selection and SVC
estimator = SVC()
selector = SelectKBest(f_classif, k='all')  # 'all' keeps all features. Adjust as needed.
pipeline = Pipeline([
    ('feature_selection', selector),
    ('classifier', estimator)
])


In [31]:
# Define the hyperparameter grid to search
param_grid = {
    'classifier__C': [0.1, 1],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale', 'auto', 0.1, 1],
    'classifier__class_weight': [class_weights, 'balanced'],
    'feature_selection__k': [5,7,9,11]
}

# Create the grid search with cross-validation using the pipeline
clf = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the model
clf.fit(X_train_std, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection__k=5; total time= 1.1min
[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection__k=5; total time= 1.1min
[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection__k=5; total time= 1.1min
[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection__k=5; total time= 1.1min
[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection__k=5; total time= 1.1min
[CV] END classifier__C=0.1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=scale, classifier__kernel=linear, feature_selection

In [32]:
# Print the best hyperparameters
print("Best hyperparameters:", clf.best_params_)

# Test the model
accuracy = clf.score(X_test_std, y_test)
print(f"Test accuracy with best hyperparameters: {accuracy:.2f}")

Best hyperparameters: {'classifier__C': 1, 'classifier__class_weight': {0: 1, 1: 16, 2: 5.3}, 'classifier__gamma': 1, 'classifier__kernel': 'rbf', 'feature_selection__k': 11}
Test accuracy with best hyperparameters: 0.85


In [7]:
param_grid = {'classifier__C': [1], 'classifier__class_weight': [{0: 1, 1: 16, 2: 5.3}], 'classifier__gamma': [1], 'classifier__kernel': ['rbf'], 'feature_selection__k': [11]}

# Create the grid search with cross-validation using the pipeline
clf = GridSearchCV(pipeline, param_grid, cv=2, verbose=2, n_jobs=-1)

# Fit the model
clf.fit(X_train_std, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] END classifier__C=1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=1, classifier__kernel=rbf, feature_selection__k=11; total time=  14.6s
[CV] END classifier__C=1, classifier__class_weight={0: 1, 1: 16, 2: 5.3}, classifier__gamma=1, classifier__kernel=rbf, feature_selection__k=11; total time=  15.7s


In [18]:
confusion_matrix(y_test, clf.predict(X_test))



array([[10580,     0,     0],
       [  532,     0,     0],
       [ 2316,     0,     0]])

In [8]:
scaler = StandardScaler().fit(test_data)
X_test = scaler.transform(test_data)

In [10]:
test_data['Label'].value_counts(normalize=True)

0    0.671899
2    0.245640
1    0.082461
Name: Label, dtype: float64

In [19]:
pd.Series(clf.predict(X_test)).value_counts(normalize=True)



0    1.0
dtype: float64

In [9]:
test_data['Label'] = clf.predict(X_test)
submition = test_data[["SNo", "Label"]]
# submition.drop("index", axis=1, inplace=True)
submition.to_csv(f"predictions_{datetime.now()}_svm.csv", index=False)

In [None]:
# Extract information from cv_results_
mean_test_score = clf.cv_results_["mean_test_score"]  # Mean test accuracy
std_test_score = clf.cv_results_[
    "std_test_score"
]  # Standard deviation of test accuracy
mean_train_score = clf.cv_results_[
    "mean_train_score"
]  # Mean training accuracy (if return_train_score=True)
std_train_score = clf.cv_results_[
    "std_train_score"
]  # Standard deviation of training accuracy

# Generate x-axis labels (e.g., hyperparameter combinations)
params = [str(p) for p in clf.cv_results_["params"]]

# Plot validation errors
plt.figure(figsize=(10, 5))
plt.errorbar(
    np.arange(len(mean_test_score)),
    1 - mean_test_score,
    yerr=std_test_score,
    marker="o",
    label="Validation Error",
    capsize=5,
)
plt.errorbar(
    np.arange(len(mean_train_score)),
    1 - mean_train_score,
    yerr=std_train_score,
    marker="x",
    label="Training Error",
    capsize=5,
)
plt.xticks(np.arange(len(mean_test_score)), params, rotation=45, ha="right")
plt.ylabel("Error")
plt.legend()
plt.title("Validation and Training Errors for Different Hyperparameters")
plt.tight_layout()
plt.show()

# Plot accuracies
plt.figure(figsize=(10, 5))
plt.errorbar(
    np.arange(len(mean_test_score)),
    mean_test_score,
    yerr=std_test_score,
    marker="o",
    label="Validation Accuracy",
    capsize=5,
)
plt.errorbar(
    np.arange(len(mean_train_score)),
    mean_train_score,
    yerr=std_train_score,
    marker="x",
    label="Training Accuracy",
    capsize=5,
)
plt.xticks(np.arange(len(mean_test_score)), params, rotation=45, ha="right")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Validation and Training Accuracies for Different Hyperparameters")
plt.tight_layout()
plt.show()