In [None]:
# Written by Rahi Misra

In [None]:
# This jupyter notebook was used in an attempt to train different svm models with different hyperparameters individually

In [None]:
# The blocks of code with no output would likely not finish and were attempted again elsewhere

In [1]:
# Necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from FinalTennisCSV.csv into a DataFrame
tennis = pd.read_csv("FinalTennisCSV.csv")

In [3]:
# Split the dataset into the feature matrix X (every column except "winner")
# and the target vector y (the "winner" column)
X = tennis.drop("winner", axis=1)
y = tennis["winner"]

In [4]:
# The number of tuples (rows) and features (columns)
X.shape

(899360, 78)

In [5]:
# The number of target values (one per tuple)
y.shape

(899360,)

In [6]:
# Splits the dataset into a training set and a test set
# training set is 80% of the tuples
# test set is 20% of the tuples (random_state=42 keeps the split reproducible)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

----- LinearSVM Classifier -----

In [8]:
# C=0.1
linear_svc_c01 = LinearSVC(C=0.1, loss="hinge", random_state=42)
linear_svc_c01.fit(X_train_scaled, y_train)

In [9]:
linear_svc_c01_predictions = linear_svc_c01.predict(X_test_scaled)

In [10]:
linear_svc_c01_accuracy = accuracy_score(y_test, linear_svc_c01_predictions)
print(f"Linear SVC Accuracy c=0.1: {linear_svc_c01_accuracy:.2f}")

Linear SVC Accuracy c=0.1: 0.63


In [11]:
# C=1
linear_svc_c1 = LinearSVC(C=1, loss="hinge", random_state=42)
linear_svc_c1.fit(X_train_scaled, y_train)

In [12]:
linear_svc_c1_predictions = linear_svc_c1.predict(X_test_scaled)

In [13]:
linear_svc_c1_accuracy = accuracy_score(y_test, linear_svc_c1_predictions)
print(f"Linear SVC Accuracy c=1: {linear_svc_c1_accuracy:.2f}")

Linear SVC Accuracy c=1: 0.63


In [14]:
# C=10
linear_svc_c10 = LinearSVC(C=10, loss="hinge", random_state=42)
linear_svc_c10.fit(X_train_scaled, y_train)

In [15]:
linear_svc_c10_predictions = linear_svc_c10.predict(X_test_scaled)

In [16]:
linear_svc_c10_accuracy = accuracy_score(y_test, linear_svc_c10_predictions)
print(f"Linear SVC Accuracy c=10: {linear_svc_c10_accuracy:.2f}")

Linear SVC Accuracy c=10: 0.61


----- Gaussian RBF Kernel Classifier 1% subset -----

In [8]:
# The RBF kernel did not finish after days on the full training data, so its
# accuracy is estimated from a subsample: the first rows of the training split,
# as many as 1% of the whole dataset's row count.
subsample_size = X.shape[0] // 100
X_train_scaled_subsampled, y_train_subsampled = (
    X_train_scaled[:subsample_size],
    y_train.iloc[:subsample_size],
)

In [18]:
# C=0.1
rbf_svc_c01 = SVC(kernel='rbf', C=0.1, random_state=42)
rbf_svc_c01.fit(X_train_scaled_subsampled, y_train_subsampled)

In [19]:
rbf_svc_c01_predictions = rbf_svc_c01.predict(X_test_scaled)

In [21]:
rbf_svc_c01_accuracy = accuracy_score(y_test, rbf_svc_c01_predictions)
print(f"RBF SVC Accuracy c=0.1: {rbf_svc_c01_accuracy:.2f}")

RBF SVC Accuracy c=0.1: 0.62


In [22]:
# C=1
rbf_svc_c1 = SVC(kernel='rbf', C=1, random_state=42)
rbf_svc_c1.fit(X_train_scaled_subsampled, y_train_subsampled)

In [23]:
rbf_svc_c1_predictions = rbf_svc_c1.predict(X_test_scaled)

In [24]:
rbf_svc_c1_accuracy = accuracy_score(y_test, rbf_svc_c1_predictions)
print(f"RBF SVC Accuracy c=1: {rbf_svc_c1_accuracy:.2f}")

RBF SVC Accuracy c=1: 0.62


In [25]:
# C=10
rbf_svc_c10 = SVC(kernel='rbf', C=10, random_state=42)
rbf_svc_c10.fit(X_train_scaled_subsampled, y_train_subsampled)

In [26]:
rbf_svc_c10_predictions = rbf_svc_c10.predict(X_test_scaled)

In [28]:
rbf_svc_c10_accuracy = accuracy_score(y_test, rbf_svc_c10_predictions)
print(f"RBF SVC Accuracy c=10: {rbf_svc_c10_accuracy:.2f}")

RBF SVC Accuracy c=10: 0.59


----- Gaussian RBF Kernel Classifier 10% subset -----

In [9]:
# RBF was not finishing after days so we'll estimate its accuracy with subsamples
# (the first 10% of the training split and the first 10% of the test split)
train_subsample_size = X_train.shape[0] // 10
test_subsample_size = X_test.shape[0] // 10
# .iloc makes the positional (first-n-rows) slicing of the label Series explicit
X_train_scaled_subsampled = X_train_scaled[:train_subsample_size]
y_train_subsampled = y_train.iloc[:train_subsample_size]
X_test_scaled_subsampled = X_test_scaled[:test_subsample_size]
# Test labels aligned row-for-row with X_test_scaled_subsampled
y_test_subsampled = y_test.iloc[:test_subsample_size]

In [10]:
# C=0.1
rbf_svc_c01 = SVC(kernel='rbf', C=0.1, random_state=42)
rbf_svc_c01.fit(X_train_scaled_subsampled, y_train_subsampled)

In [11]:
rbf_svc_c01_predictions = rbf_svc_c01.predict(X_test_scaled)

In [12]:
rbf_svc_c01_accuracy = accuracy_score(y_test, rbf_svc_c01_predictions)
print(f"RBF SVC Accuracy c=0.1: {rbf_svc_c01_accuracy:.2f}")

RBF SVC Accuracy c=0.1: 0.63


In [13]:
# C=1
rbf_svc_c1 = SVC(kernel='rbf', C=1, random_state=42)
rbf_svc_c1.fit(X_train_scaled_subsampled, y_train_subsampled)

In [14]:
rbf_svc_c1_predictions = rbf_svc_c1.predict(X_test_scaled)

In [15]:
rbf_svc_c1_accuracy = accuracy_score(y_test, rbf_svc_c1_predictions)
print(f"RBF SVC Accuracy c=1: {rbf_svc_c1_accuracy:.2f}")

RBF SVC Accuracy c=1: 0.64


In [9]:
# C=10
rbf_svc_c10 = SVC(kernel='rbf', C=10, random_state=42)
rbf_svc_c10.fit(X_train_scaled_subsampled, y_train_subsampled)

In [10]:
rbf_svc_c10_predictions = rbf_svc_c10.predict(X_test_scaled)

In [11]:
rbf_svc_c10_accuracy = accuracy_score(y_test, rbf_svc_c10_predictions)
print(f"RBF SVC Accuracy c=10: {rbf_svc_c10_accuracy:.2f}")

RBF SVC Accuracy c=10: 0.61


----- Polynomial Features 2nd degree Classifier -----

In [9]:
# Fit the degree-2 polynomial expansion on the subsampled training features,
# then apply the same fitted expansion to the training subsample and to the
# full scaled test split.
poly2 = PolynomialFeatures(degree=2).fit(X_train_scaled_subsampled)
X_train_poly2 = poly2.transform(X_train_scaled_subsampled)
X_test_poly2 = poly2.transform(X_test_scaled)

In [10]:
# C=0.1
poly2_svc_c01 = LinearSVC(C=0.1, loss="hinge", random_state=42)
poly2_svc_c01.fit(X_train_poly2, y_train_subsampled)

In [11]:
poly2_svc_c01_predictions = poly2_svc_c01.predict(X_test_poly2)

In [13]:
poly2_svc_c01_accuracy = accuracy_score(y_test, poly2_svc_c01_predictions)
print(f"Poly2 SVM Accuracy c=0.1: {poly2_svc_c01_accuracy:.2f}")

Poly2 SVM Accuracy c=0.1: 0.63


In [14]:
# C=1
poly2_svc_c1 = LinearSVC(C=1, loss="hinge", random_state=42)
poly2_svc_c1.fit(X_train_poly2, y_train_subsampled)

In [16]:
poly2_svc_c1_predictions = poly2_svc_c1.predict(X_test_poly2)

In [18]:
poly2_svc_c1_accuracy = accuracy_score(y_test, poly2_svc_c1_predictions)
print(f"Poly2 SVM Accuracy c=1: {poly2_svc_c1_accuracy:.2f}")

Poly2 SVM Accuracy c=1: 0.56


In [19]:
# C=10
poly2_svc_c10 = LinearSVC(C=10, loss="hinge", random_state=42)
poly2_svc_c10.fit(X_train_poly2, y_train_subsampled)

In [20]:
poly2_svc_c10_predictions = poly2_svc_c10.predict(X_test_poly2)

In [21]:
poly2_svc_c10_accuracy = accuracy_score(y_test, poly2_svc_c10_predictions)
print(f"Poly2 SVM Accuracy c=10: {poly2_svc_c10_accuracy:.2f}")

Poly2 SVM Accuracy c=10: 0.55


----- Polynomial 2nd degree kernel -----

In [None]:
# C=0.1
poly2_kernel_svm_c01 = SVC(kernel='poly', degree=2, coef0=1, C=0.1, random_state=42)
poly2_kernel_svm_c01.fit(X_train_poly2, y_train_subsampled)

In [None]:
poly2_kernel_svm_c01_predictions = poly2_kernel_svm_c01.predict(X_test_poly2)

In [None]:
poly2_kernel_svm_c01_accuracy = accuracy_score(y_test, poly2_kernel_svm_c01_predictions)
print(f"Poly2 Kernel SVM Accuracy c=0.1: {poly2_kernel_svm_c01_accuracy:.2f}")

In [None]:
# ----- Polynomial Features 3rd degree Classifier -----

In [None]:
# Fit the degree-3 polynomial expansion on the subsampled training features,
# then apply the same fitted expansion to the training subsample and to the
# full scaled test split.
# NOTE(review): with the 78 input columns reported by X.shape, a degree-3
# expansion produces 85,320 columns per row — confirm the memory budget, this
# is a likely reason the cell did not finish.
poly3 = PolynomialFeatures(degree=3).fit(X_train_scaled_subsampled)
X_train_poly3 = poly3.transform(X_train_scaled_subsampled)
X_test_poly3 = poly3.transform(X_test_scaled)

In [None]:
# C=0.1
poly3_svc_c01 = LinearSVC(C=0.1, loss="hinge", random_state=42)
poly3_svc_c01.fit(X_train_poly3, y_train)

In [None]:
poly3_svc_c01_predictions = poly3_svc_c01.predict(X_test_poly)

In [None]:
poly3_svc_c01_accuracy = accuracy_score(y_test, poly3_svc_c01_predictions)
print(f"Poly3 SVM Accuracy c=0.1: {poly3_svc_c01_accuracy:.2f}")

In [None]:
# C=1
poly3_svc_c1 = LinearSVC(C=1, loss="hinge", random_state=42)
poly3_svc_c1.fit(X_train_poly3, y_train)

In [None]:
poly3_svc_c1_predictions = poly3_svc_c1.predict(X_test_poly)

In [None]:
poly3_svc_c1_accuracy = accuracy_score(y_test, poly3_svc_c1_predictions)
print(f"Poly3 SVM Accuracy c=1: {poly3_svc_c1_accuracy:.2f}")

In [None]:
# C=10
poly3_svc_c10 = LinearSVC(C=10, loss="hinge", random_state=42)
poly3_svc_c10.fit(X_train_poly3, y_train)

In [None]:
poly3_svc_c10_predictions = poly3_svc_c10.predict(X_test_poly3)

In [None]:
poly3_svc_c10_accuracy = accuracy_score(y_test, poly3_svc_c10_predictions)
print(f"Poly3 SVM Accuracy c=10: {poly3_svc_c10_accuracy:.2f}")

In [None]:
# ----- Polynomial 3rd degree kernel -----

In [None]:
# C=0.1
poly3_kernel_svm_c01 = SVC(kernel='poly', degree=3, 0.1, random_state=42)
poly3_kernel_svm_c01.fit(X_train_poly3, y_train)

In [None]:
poly3_kernel_svm_c01_predictions = poly3_kernel_svm_c01.predict(X_test_poly3)

In [None]:
poly3_kernel_svm_c01_accuracy = accuracy_score(y_test, poly3_kernel_svm_c01_predictions)
print(f"Poly3 Kernel SVM Accuracy c=0.1: {poly3_kernel_svm_c01_accuracy:.2f}")

In [None]:
# C=1
poly3_kernel_svm_c1 = SVC(kernel='poly', degree=3, 1, random_state=42)
poly3_kernel_svm_c1.fit(X_train_poly3, y_train)

In [None]:
poly3_kernel_svm_c1_predictions = poly3_kernel_svm_c1.predict(X_test_poly3)

In [None]:
poly3_kernel_svm_c1_accuracy = accuracy_score(y_test, poly3_kernel_svm_c1_predictions)
print(f"Poly3 Kernel SVM Accuracy c=1: {poly3_kernel_svm_c1_accuracy:.2f}")

In [None]:
# C=10
poly3_kernel_svm_c10 = SVC(kernel='poly', degree=3, 10, random_state=42)
poly3_kernel_svm_c10.fit(X_train_poly3, y_train)

In [None]:
poly3_kernel_svm_c10_predictions = poly3_kernel_svm_c10.predict(X_test_poly3)

In [None]:
poly3_kernel_svm_c10_accuracy = accuracy_score(y_test, poly3_kernel_svm_c10_predictions)
print(f"Poly3 Kernel SVM Accuracy c=10: {poly3_kernel_svm_c10_accuracy:.2f}")