In [1]:
#Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
#To test SVC we used two different datasets. In the first dataset all features are standardized
#(mean=0, standard deviation=1). After measuring the accuracy and confirming it using KFold, we try the algorithm again using
#data where the fare column is separated into four different bins ($0-100, $100-200, $300-400, $400-550) classified by
#using numbers 0-4, i.e. a binned fare of 0 would mean the ticket cost between $0 and $100.
#NOTE(review): "four bins" does not match the labels 0-4 (five values) and the listed ranges skip $200-300 - confirm against the binning code.

In [3]:
#Load the cleaned training data used for the scaled-feature experiment
path = 'Resources/cleaned_train.csv'
train_data = pd.read_csv(path)
#Keep the labels as a standalone NumPy array (np.array copies the column out of the DataFrame)
survived_list = np.array(train_data.loc[:, "Survived"])
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,0
4,0,3,0,35.0,0,0,8.05,0


In [4]:
#Feature matrix: every column except the "Survived" label
#NOTE(review): the rendered output shows an "Unnamed: 0" header; if that is a real CSV column rather than the displayed index, it is being kept as a feature here - confirm
X = train_data.drop(columns=["Survived"])
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


In [5]:
#Standardize every feature to mean=0 and standard deviation=1 (this rescales the columns; it does not make them Gaussian)
#NOTE(review): the scaler is fitted on all rows before the train/test split below, so test rows influence the scaling statistics
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

In [6]:
#Hold out part of the data for testing; the fixed random_state makes the split identical for everyone in the group
y = survived_list
scaled_split = train_test_split(X_scaled, y, random_state=42)
X_train, X_test, y_train, y_test = scaled_split

In [7]:
#Kernels tried: 'poly', 'sigmoid', 'linear' and the default 'rbf'; 'rbf' scored best, so it is the one used
svc = SVC(kernel='rbf').fit(X_train, y_train)
#Accuracy on the held-out split
holdout_accuracy = svc.score(X_test, y_test)
print('Test Acc: %.3f' % holdout_accuracy)

Test Acc: 0.816


In [8]:
#Model validation with k-fold cross-validation. The data is split into k segments; each iteration uses 1 fold as test data
#and the other k-1 folds as training data, so every segment is used as test data exactly once.
#The fold scores are then averaged to get a more reliable accuracy estimate than a single split.
svc = SVC(kernel='rbf', probability=True)
kfold_test_scores = []
#10 contiguous, unshuffled folds. No random_state is passed: it has no effect unless shuffle=True
#(and newer scikit-learn versions raise an error for that combination), so the folds are deterministic as-is.
kf = KFold(n_splits=10)
#Get indices for each split (the data has 714 rows: the first fold uses rows 0-71 as test data and rows 72-713 as train data)
for train_index, test_index in kf.split(X_scaled):
    #Fold-local names so the hold-out split (X_train/X_test/y_train/y_test) created earlier is not overwritten
    X_fold_train, X_fold_test = X_scaled[train_index], X_scaled[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]
    #Fit the model with this split, score it, and append the score to the list
    svc.fit(X_fold_train, y_fold_train)
    test_score = svc.score(X_fold_test, y_fold_test)
    kfold_test_scores.append(test_score)
#Print all scores, then average them
print(kfold_test_scores)
avg = (sum(kfold_test_scores)/len(kfold_test_scores))
print(avg)

[0.8472222222222222, 0.875, 0.7638888888888888, 0.8333333333333334, 0.8028169014084507, 0.7746478873239436, 0.7605633802816901, 0.7887323943661971, 0.8732394366197183, 0.8591549295774648]
0.8178599374021909


In [9]:
#After running both methods, the scaled-data model ends up being more accurate, so it is the one saved here. (Accuracy=0.82)
#The svc left over from the k-fold loop was only fitted on the last fold's training rows,
#so refit it on every row before persisting it.
svc.fit(X_scaled, y)
#Save the scaler and the svc model to pkl files
joblib.dump(X_scaler, 'Resources/models/svc_scaler.pkl');
joblib.dump(svc, 'Resources/models/svc.pkl');

In [41]:
#Second experiment: the dataset with binned fares. Here we find the most accurate kernel, compute accuracy, then validate it with KFold.
#Load the binned dataset and discard the 'Unnamed: 0' column that comes in with the CSV
train_data_cleaned = pd.read_csv('Resources/AG_train-test_v2.csv').drop(columns='Unnamed: 0')
#test plays the role of X from the scaled experiment (features only)
test = train_data_cleaned.drop(columns='Survived')
#target plays the role of y from the scaled experiment (labels)
target = train_data_cleaned['Survived']
train_data_cleaned.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,0,0
1,1,1,1,38.0,1,0,0,1
2,1,3,1,26.0,0,0,0,0
3,1,1,1,35.0,1,0,0,0
4,0,3,0,35.0,0,0,0,0


In [43]:
#Number of rows in the binned feature data
print(test['Pclass'].shape[0])

707


In [44]:
#Train/test split for the binned dataset, using the same random_state as the scaled split
binned_split = train_test_split(test, target, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = binned_split

In [46]:
#Kernels tried on the binned data: 'rbf', 'linear' and 'sigmoid' ('poly' did not work for this dataset).
#'rbf' was the most accurate, so it is the one used.
svc = SVC(kernel='rbf').fit(X_train_2, y_train_2)
#Accuracy on the held-out binned split
binned_holdout_accuracy = svc.score(X_test_2, y_test_2)
print('Test Acc: %.3f' % binned_holdout_accuracy)

Test Acc: 0.802


In [47]:
#Model validation with k-fold cross-validation on the binned data. The data is split into k segments; each iteration uses
#1 fold as test data and the other k-1 folds as training data, so every segment is used as test data exactly once.
#The fold scores are then averaged to get a more reliable accuracy estimate than a single split.
svc = SVC(kernel='rbf')
kfold_test_scores = []
#10 contiguous, unshuffled folds. No random_state is passed: it has no effect unless shuffle=True
#(and newer scikit-learn versions raise an error for that combination), so the folds are deterministic as-is.
kf = KFold(n_splits=10)
#Get indices for each split (the data has 707 rows: the first fold uses rows 0-70 as test data and rows 71-706 as train data)
for train_index, test_index in kf.split(test):
    #KFold yields positional indices, so both the features and the labels are selected with .iloc
    #Fold-local names so the splits created in earlier cells are not overwritten
    X_fold_train, X_fold_test = test.iloc[train_index], test.iloc[test_index]
    y_fold_train, y_fold_test = target.iloc[train_index], target.iloc[test_index]
    #Fit the model with this split, score it, and append the score to the list
    svc.fit(X_fold_train, y_fold_train)
    test_score = svc.score(X_fold_test, y_fold_test)
    kfold_test_scores.append(test_score)
#Print all scores, then average them
print(kfold_test_scores)
avg = (sum(kfold_test_scores)/len(kfold_test_scores))
print(avg)

[0.7183098591549296, 0.8028169014084507, 0.7464788732394366, 0.7887323943661971, 0.8169014084507042, 0.8169014084507042, 0.7746478873239436, 0.8428571428571429, 0.8285714285714286, 0.8428571428571429]
0.7979074446680079


In [None]:
#The result when the data is scaled is more accurate, so that result is used as the model accuracy for SVC.