In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv("data/train.csv")
data = data.drop(columns=["Cabin"])
data = data.dropna(axis=0)
data.describe(include="all")
y = data.Survived
features = ["Pclass","Sex","Age","SibSp","Parch","Ticket","Fare","Embarked"]
X = pd.get_dummies(data[features])

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [3]:
# just trying a simple linear one first

clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)

print(accuracy_score(y_val, y_pred))

0.8089887640449438


In [4]:
# hyperparameter optimization
# This block uses the Kfold thing for accuracy ratings (and the grid search), which is why 
# it's a little better/different than the next block

# C_val = [10**exp for exp in range(-5,6)]
C = [10**exp for exp in range(-5,6)]
kernels = ["linear","poly","rbf","sigmoid"]
degrees = [num for num in range(2,5)]
model = svm.SVC()

grid = dict(kernel=kernels,degree=degrees,C=C)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.820240 using {'C': 10000, 'degree': 2, 'kernel': 'poly'}
0.632069 (0.038520) with: {'C': 1e-05, 'degree': 2, 'kernel': 'linear'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 2, 'kernel': 'poly'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 2, 'kernel': 'rbf'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 2, 'kernel': 'sigmoid'}
0.632069 (0.038520) with: {'C': 1e-05, 'degree': 3, 'kernel': 'linear'}
0.599724 (0.003704) with: {'C': 1e-05, 'degree': 3, 'kernel': 'poly'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 3, 'kernel': 'rbf'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 3, 'kernel': 'sigmoid'}
0.632069 (0.038520) with: {'C': 1e-05, 'degree': 4, 'kernel': 'linear'}
0.602531 (0.011198) with: {'C': 1e-05, 'degree': 4, 'kernel': 'poly'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 4, 'kernel': 'rbf'}
0.595509 (0.002725) with: {'C': 1e-05, 'degree': 4, 'kernel': 'sigmoid'}
0.653098 (0.048905) with: {'C': 0.0001, 'degree': 2, 'kernel': 'linear'}
0.599724 (

In [6]:
# no Kfold; and me just brute forcing it with nested loops lol

def get_accuracy(C_val, kernel, degree, X_train, X_val, y_train, y_val):
    model = svm.SVC(C=C_val, kernel=kernel, degree=degree)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

accuracies = []  
    
for c_val in C:
    for kernel in kernels:
        for degree in degrees:
            accuracy = get_accuracy(c_val, kernel, degree, X_train, X_val, y_train, y_val)
            accuracies.append({"C": c_val,"kernel": kernel,"deg":degree, "accuracy":accuracy})

bestAccuracy = max(accuracies, key=lambda x:x["accuracy"])

print(*sorted(accuracies, key = lambda x:x["accuracy"], reverse = True), sep = "\n")

{'C': 100000, 'kernel': 'poly', 'deg': 2, 'accuracy': 0.8258426966292135}
{'C': 1, 'kernel': 'linear', 'deg': 2, 'accuracy': 0.8089887640449438}
{'C': 1, 'kernel': 'linear', 'deg': 3, 'accuracy': 0.8089887640449438}
{'C': 1, 'kernel': 'linear', 'deg': 4, 'accuracy': 0.8089887640449438}
{'C': 10, 'kernel': 'linear', 'deg': 2, 'accuracy': 0.8089887640449438}
{'C': 10, 'kernel': 'linear', 'deg': 3, 'accuracy': 0.8089887640449438}
{'C': 10, 'kernel': 'linear', 'deg': 4, 'accuracy': 0.8089887640449438}
{'C': 10000, 'kernel': 'poly', 'deg': 2, 'accuracy': 0.8089887640449438}
{'C': 10000, 'kernel': 'rbf', 'deg': 2, 'accuracy': 0.8089887640449438}
{'C': 10000, 'kernel': 'rbf', 'deg': 3, 'accuracy': 0.8089887640449438}
{'C': 10000, 'kernel': 'rbf', 'deg': 4, 'accuracy': 0.8089887640449438}
{'C': 100000, 'kernel': 'poly', 'deg': 3, 'accuracy': 0.8033707865168539}
{'C': 1000, 'kernel': 'rbf', 'deg': 2, 'accuracy': 0.797752808988764}
{'C': 1000, 'kernel': 'rbf', 'deg': 3, 'accuracy': 0.79775280898