In [None]:
# Written by Rahi Misra

In [None]:
# This Jupyter notebook attempts to find the best hyperparameters for several types of SVM models, using GridSearchCV to try multiple combinations of hyperparameters

In [None]:
# Cells with no output did not finish running locally; they were re-attempted in Google Colab for more computing power

In [1]:
# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read FinalTennisCSV.csv (relative to the working directory) into a DataFrame
DATA_FILE = "FinalTennisCSV.csv"
tennis = pd.read_csv(DATA_FILE)

In [3]:
# Separate the "winner" label column (y) from the remaining columns (X)
y = tennis["winner"]
X = tennis.drop(columns="winner")

In [4]:
# Number of tuples (rows) and features (columns) in X
X.shape

(899360, 78)

In [5]:
# Number of target labels (one per tuple)
y.shape

(899360,)

In [6]:
# Hold out 20% of the tuples as the test set; the remaining 80% form the
# training set. The fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Learn the scaling statistics from the training set only, then apply that
# same transformation to both splits so the test set never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# The RBF search did not finish after days on the full data, so its accuracy
# is estimated on the first tenth of each split instead (train_test_split has
# already shuffled the rows, so a leading slice is a random subsample).
train_subsample_size = X_train.shape[0] // 10
test_subsample_size = X_test.shape[0] // 10

# Scaled features are NumPy arrays: plain slicing is positional.
X_train_scaled_subsampled = X_train_scaled[:train_subsample_size]
X_test_scaled_subsampled = X_test_scaled[:test_subsample_size]

# Labels are pandas Series: .iloc makes the positional slice explicit.
y_train_subsampled = y_train.iloc[:train_subsample_size]
y_test_subsampled = y_test.iloc[:test_subsample_size]

# ----- GridSearchCV -----

----- Linear Kernel Classifier (LinearSVC) -----

In [9]:
# Grid-search the regularisation strength C of a linear SVM on the training
# subsample, scoring every candidate by 5-fold cross-validated accuracy.
# (LinearSVC and GridSearchCV are already imported in the imports cell.)

# dual=False solves the primal problem, which scikit-learn recommends when
# n_samples > n_features (X has 78 features). That solver is also
# deterministic, so repeated runs select the same parameters.
linear_svc = LinearSVC(dual=False)
param_grid = {'C': [0.1, 1, 10]}

# n_jobs=-1 spreads the independent cross-validation fits over all CPU cores.
grid_search = GridSearchCV(linear_svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled_subsampled, y_train_subsampled)

print("Best Parameters:", grid_search.best_params_)
# best_estimator_ is the model refit on the whole training subsample with the
# winning parameters; keep it under its own name because grid_search is
# reassigned by the later kernel searches.
best_linear_svc = grid_search.best_estimator_

Best Parameters: {'C': 0.1}


----- RBF Kernel Classifier -----

In [None]:
# Grid-search C and gamma for an RBF-kernel SVM on the training subsample,
# scoring every candidate by 5-fold cross-validated accuracy.
rbf_svc = SVC(kernel='rbf')
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}

# 9 candidates x 5 folds = 45 independent kernel-SVM fits. n_jobs=-1 runs them
# in parallel on all CPU cores instead of one after another; it affects only
# wall-clock time, not which parameters are selected.
grid_search = GridSearchCV(rbf_svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled_subsampled, y_train_subsampled)

print("Best Parameters:", grid_search.best_params_)
# Model refit on the whole training subsample with the winning parameters.
best_rbf_svc = grid_search.best_estimator_

----- Polynomial Kernel Classifier -----

In [None]:
# Grid-search C, polynomial degree and gamma for a polynomial-kernel SVM on
# the training subsample, scoring by 5-fold cross-validated accuracy.
poly_svc = SVC(kernel='poly')
param_grid = {'C': [0.1, 1, 10], 'degree': [2, 3], 'gamma': [0.1, 1, 10]}

# 18 candidates x 5 folds = 90 independent kernel-SVM fits. n_jobs=-1 runs
# them in parallel on all CPU cores instead of one after another; it affects
# only wall-clock time, not which parameters are selected.
grid_search = GridSearchCV(poly_svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled_subsampled, y_train_subsampled)

print("Best Parameters:", grid_search.best_params_)
# Model refit on the whole training subsample with the winning parameters.
best_poly_svc = grid_search.best_estimator_

----- Evaluate and Compare Kernels -----

In [None]:
def evaluate_classifier(label, model, X_eval, y_eval):
    """Print and return evaluation metrics for one fitted classifier.

    Parameters
    ----------
    label : str
        Prefix for every printed line (e.g. "Linear SVC").
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_eval : array-like
        Feature matrix to predict on.
    y_eval : array-like
        True labels aligned row-for-row with ``X_eval``.

    Returns
    -------
    tuple
        ``(predictions, accuracy, weighted_f1, confusion_matrix)``.
    """
    predictions = model.predict(X_eval)

    accuracy = accuracy_score(y_eval, predictions)
    print(f"{label} Accuracy: {accuracy:.2f}")

    # average='weighted' averages the per-class F1 scores weighted by support.
    f1 = f1_score(y_eval, predictions, average='weighted')
    print(f"{label} F-score: {f1:.2f}")

    cm = confusion_matrix(y_eval, predictions)
    print(f"{label} Confusion Matrix:")
    print(cm)

    return predictions, accuracy, f1, cm


# Every model is scored on the same held-out test subsample so the numbers
# are directly comparable. Results keep their original variable names.

# Linear
(linear_svc_predictions, linear_svc_accuracy,
 linear_svc_f1, linear_svc_cm) = evaluate_classifier(
    "Linear SVC", best_linear_svc, X_test_scaled_subsampled, y_test_subsampled)

# RBF
(rbf_svc_predictions, rbf_svc_accuracy,
 rbf_svc_f1, rbf_svc_cm) = evaluate_classifier(
    "RBF Kernel", best_rbf_svc, X_test_scaled_subsampled, y_test_subsampled)

# Polynomial
(poly_svc_predictions, poly_svc_accuracy,
 poly_svc_f1, poly_svc_cm) = evaluate_classifier(
    "Polynomial Kernel", best_poly_svc, X_test_scaled_subsampled, y_test_subsampled)