In [1]:
import numpy as np
import pandas as pd
import os
from utils import get_label, load_data, preprocess_dataframe, get_tuner, save_model, load_model, print_scores
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
# NOTE(review): this is presumably for the local `utils` module — confirm.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the two frames used throughout the notebook via the project helper.
train_df, test_df = load_data()
# NOTE(review): `label` is not referenced by any cell visible in this notebook;
# presumably it names the target column — confirm against utils.get_label.
label = get_label()

In [4]:
# Raw columns deliberately kept out of the model inputs. Declaring them once
# makes the selection easy to audit and replaces six in-place list mutations.
# NOTE(review): 'Survived' is presumably the prediction target — confirm
# against utils.get_label().
EXCLUDED_COLUMNS = ('PassengerId', 'Cabin', 'Parch', 'SibSp', 'Name', 'Survived')

all_columns = train_df.columns.tolist()

# Fail fast with ValueError (the same exception type list.remove raised) when
# the frame does not have the expected schema, reporting every missing column
# at once instead of only the first one.
missing_columns = [col for col in EXCLUDED_COLUMNS if col not in all_columns]
if missing_columns:
    raise ValueError(
        'train_df is missing expected columns: {}'.format(missing_columns)
    )

# Everything that is not explicitly excluded is handed to preprocessing,
# in the original column order.
input_features = [col for col in all_columns if col not in EXCLUDED_COLUMNS]

In [5]:
# Options shared by the train and test preprocessing calls, declared once so
# both frames are prepared with the same settings.
common_options = {
    'input_features': input_features,
    'drop_na': False,
    'fill_na': True,
    'enable_categorical': False,
}

# Training frame: the result is unpacked into train/validation features and
# targets. Duplicate rows are dropped for this frame only.
X_train, y_train, X_val, y_val = preprocess_dataframe(
    train_df, **common_options, drop_duplicates=True
)

# Test frame: called with test_split=None and bound to a single name.
# NOTE(review): presumably no split and no target are produced in this mode —
# confirm against utils.preprocess_dataframe.
X_test = preprocess_dataframe(test_df, **common_options, test_split=None)

# Preview the first rows of the processed training features.
X_train.head(5)

Converting Pclass to label
Converting Sex to label
Converting Embarked to label
Converting Pclass to label
Converting Sex to label
Converting Embarked to label


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Has_Sibsp,Has_Parch,Has_Family
331,0,1,1.126882,-0.078684,2,0,0,0
733,1,1,-0.450229,-0.377145,2,0,0,0
382,2,1,0.180616,-0.474867,2,0,0,0
704,2,1,-0.239947,-0.47623,2,1,0,1
813,2,0,-1.641823,-0.025249,2,1,1,1


In [6]:
# Preview the first rows of the validation features for comparison with the
# training preview.
X_val.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Has_Sibsp,Has_Parch,Has_Family
709,2,1,-0.062842,-0.392468,0,1,1,1
439,1,1,0.076392,-0.513112,2,0,0,0
840,2,1,-0.689399,-0.578571,2,0,0,0
720,1,0,-1.664042,0.058863,2,0,1,1
39,2,0,-1.107103,-0.494257,0,1,0,1


In [7]:
# Columns of the preprocessed frames that the SVM is trained on.
feature_subset = ['Age', 'Sex', 'Fare', 'Has_Family']

# Single-step pipeline; the step name 'classifier' is the prefix used by the
# 'classifier__*' keys of the hyperparameter grid defined later.
svm_classifier = SVC(gamma='auto')
pipeline = Pipeline(steps=[('classifier', svm_classifier)])

# Baseline fit on the selected training columns.
pipeline.fit(X_train[feature_subset], y_train)

In [8]:
# Score the baseline pipeline on the train and validation splits, restricted
# to the same feature subset it was fitted on.
print_scores(
    pipeline,
    X_train[feature_subset],
    y_train,
    X_val[feature_subset],
    y_val,
)

Train Score: 0.7815750371471025
Val score: 0.7821229050279329


(0.7815750371471025, 0.7821229050279329)

## Hyperparameter tuning

In [9]:
# Search space handed to the tuner. Each key carries the 'classifier__' prefix,
# matching the pipeline step named 'classifier'.
# NOTE(review): per the scikit-learn SVC docs, gamma only affects the
# 'rbf'/'poly'/'sigmoid' kernels, so 'linear' candidates are repeated once per
# gamma value — confirm whether that extra cost matters.
# NOTE(review): max_iter values of 1 and 10 cap the solver very early (-1 means
# no limit per the SVC docs) — confirm these are intended candidates.
hparam_grid = dict(
    classifier__kernel=['linear', 'rbf', 'sigmoid'],
    classifier__gamma=['auto', 'scale'],
    classifier__max_iter=[-1, 1, 10],
    classifier__C=[1, 3, 5],
)

In [10]:
# Build a tuner around the baseline pipeline and the search space above.
# NOTE(review): the search strategy, CV scheme and scoring are decided inside
# utils.get_tuner and are not visible here — confirm before comparing scores.
tuner = get_tuner(pipeline, hparam_grid)
# Run the search on the same feature subset used for the baseline fit.
tuner.fit(X_train[feature_subset], y_train)



In [11]:
# Score the fitted tuner on the same train and validation splits as the
# baseline, so the two results are directly comparable.
print_scores(
    tuner,
    X_train[feature_subset],
    y_train,
    X_val[feature_subset],
    y_val,
)

Train Score: 0.7919762258543833
Val score: 0.7877094972067039


(0.7919762258543833, 0.7877094972067039)