# Score 79.904%

Huge thank you to [Minsuk Heo](https://github.com/minsuk-heo)!

Most of this is his [work](https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb); I hope to expand on it in future submissions.

#### Imports and loading data

In [4]:
import pandas as pd
from sklearn.svm import SVC

# Read the training and test CSV files into two separate DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/pbezuhov/git/Kaggle/data/titanic/train.csv',
        '/Users/pbezuhov/git/Kaggle/data/titanic/test.csv',
    )
)

#### Pulling the title out of each passenger's name (e.g. Mr, Mrs, Dr)

In [2]:
# Keep both frames in one list so each feature-engineering step is applied
# to the training and the test split alike (the frames are mutated in place).
full_data = [train, test]

# A title is the run of letters that follows a space and ends with a period.
# The pattern is a raw string: '\.' inside a normal literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
for dataset in full_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Mr / Miss / Mrs each get their own code; every other listed title shares
# code 3. A title that is not a key of this dict becomes NaN after .map().
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2,
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }
for dataset in full_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    # Name was only needed to derive Title, so drop it from both frames.
    # Going through full_data keeps this cell runnable on a fresh kernel,
    # where only `train` and `test` exist.
    dataset.drop('Name', axis=1, inplace=True)

#### Make the categorical sex column into a numerical one

In [3]:
# Encode the categorical Sex column as integers (male -> 0, female -> 1)
# in both the training and the test frame.
sex_mapping = {
    "male": 0,
    "female": 1,
}
for frame in full_data:
    encoded_sex = frame['Sex'].map(sex_mapping)
    frame['Sex'] = encoded_sex

In [4]:
# Preview the first rows of the training frame after the Title/Sex encoding.
# `train` is the frame loaded at the top of the notebook and mutated via full_data.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,1,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,0,35.0,0,0,373450,8.05,,S,0


#### Fill in empty ages with the median value from the group

- It might help to add an indicator column marking which rows had their age imputed

In [4]:
# fill missing age with median age for each title (Mr, Mrs, Miss, Others)
# Each frame uses its own per-title medians. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the selected Series,
# because that chained in-place call is not guaranteed to update the frame.
for dataset in full_data:
    title_median_age = dataset.groupby("Title")["Age"].transform("median")
    dataset["Age"] = dataset["Age"].fillna(title_median_age)

#### S is the most common embarkation port, so missing Embarked values are filled with S

- This rubs me the wrong way, though; I think an indicator column for the imputed rows should definitely be added

In [5]:
# Replace missing Embarked values with 'S' in both frames.
for frame in full_data:
    embarked_filled = frame['Embarked'].fillna('S')
    frame['Embarked'] = embarked_filled

#### If a fare is missing it is replaced with the median fare for the individual's pclass

In [6]:
# fill missing Fare with median fare for each Pclass
# Each frame uses its own per-class medians. The result is assigned back to
# the column instead of calling fillna(inplace=True) on the selected Series,
# because that chained in-place call is not guaranteed to update the frame.
for dataset in full_data:
    pclass_median_fare = dataset.groupby("Pclass")["Fare"].transform("median")
    dataset["Fare"] = dataset["Fare"].fillna(pclass_median_fare)

#### Keep only the first character of each cabin value; missing cabins become "None"

In [7]:
# Reduce Cabin to its first character and label missing cabins with the
# string "None" (a real category, mapped to an integer code later on).
# Assigning the filled Series back avoids the chained fillna(inplace=True)
# call, which is not guaranteed to update the frame.
for dataset in full_data:
    cabin_letter = dataset['Cabin'].str[:1]
    dataset['Cabin'] = cabin_letter.fillna("None")

#### Age adjustments

In [23]:
# Bucket Age into five ordinal codes:
#   <= 16 -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, > 62 -> 4
# The assignments run in ascending order; codes already written (0-3) are all
# <= 16, so they cannot match any later "> 16" condition.
# Plain scalars are assigned: a trailing comma after the value would turn it
# into a 1-tuple and rely on pandas broadcasting a length-1 sequence.
# NOTE: not idempotent - re-running this cell maps every coded age to 0.
for dataset in full_data:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3
    dataset.loc[dataset['Age'] > 62, 'Age'] = 4

#### Fare adjustments

In [28]:
# Bucket Fare into four ordinal codes:
#   <= 17 -> 0, (17, 30] -> 1, (30, 100] -> 2, > 100 -> 3
# The assignments run in ascending order; codes already written (0-2) are all
# <= 17, so they cannot match any later "> 17" condition.
# Plain scalars are assigned: a trailing comma after the value would turn it
# into a 1-tuple and rely on pandas broadcasting a length-1 sequence.
# NOTE: not idempotent - re-running this cell maps every coded fare to 0.
for dataset in full_data:
    dataset.loc[dataset['Fare'] <= 17, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 100, 'Fare'] = 3

#### Extra adjustments

In [9]:
# Preview the training frame. `train` is used because `train_data` is only
# created further down, when the Ticket column is dropped.
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C,C,2
2,3,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,1


In [12]:
# Frequency of each cabin letter in the training frame. `train` is used
# because `train_data` is only created further down in the notebook.
train["Cabin"].value_counts()

None    687
C        59
B        47
D        33
E        32
A        15
F        13
G         4
T         1
Name: Cabin, dtype: int64

In [13]:
# Encode the cabin letters as integers. "None" (no cabin recorded) is 0,
# C/B/D/E get codes 1-4, and A, F, G and T all share code 5.
cabin_mapping = {
    "None": 0,
    "C": 1,
    "B": 2,
    "D": 3,
    "E": 4,
    "A": 5,
    "F": 5,
    "G": 5,
    "T": 5,
}
for frame in full_data:
    encoded_cabin = frame['Cabin'].map(cabin_mapping)
    frame['Cabin'] = encoded_cabin

In [14]:
# Frequency of each embarkation port in the training frame. `train` is used
# because `train_data` is only created further down in the notebook.
train["Embarked"].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [15]:
# Encode the embarkation port as an integer: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {
    "S": 0,
    "C": 1,
    "Q": 2,
}
for frame in full_data:
    encoded_port = frame['Embarked'].map(embarked_mapping)
    frame['Embarked'] = encoded_port

#### The ticket column seems to have no information so I drop it

In [29]:
# Build the modelling frames as copies of the engineered frames without the
# Ticket column. `train` / `test` are the frames loaded at the top of the
# notebook and transformed in place through full_data.
train_data = train.drop(['Ticket'], axis=1)
test_data = test.drop(['Ticket'], axis=1)

#### Cross validation score

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
# n_splits is set explicitly to 10 to match the ten per-fold scores recorded
# in this notebook's output; the library default yields fewer folds.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [37]:
from sklearn.pipeline import make_pipeline
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22. Its
# replacement, sklearn.impute.SimpleImputer, uses the same default "mean"
# strategy. Import it under the old name so later cells calling Imputer()
# keep working, and fall back to the old class on older installs.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Mean-impute any remaining NaN features, then classify with an SVC.
# NOTE(review): the recorded fitted-pipeline repr shows gamma='auto'; newer
# scikit-learn versions may use a different SVC default - confirm if the
# recorded scores need to be reproduced exactly.
pipeline = make_pipeline(Imputer(), SVC())

# Target column, and the feature matrix without the target and the row id.
train_y = train_data['Survived']
train_x = train_data.drop(['Survived', 'PassengerId'], axis=1)

scoring = 'accuracy'
# Cross-validate the pipeline defined above (there is no `model` variable in
# this notebook), using the k_fold splitter from the previous cell.
score = cross_val_score(pipeline, train_x, train_y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("\nAverage is ...")
print(sum(score) / len(score))

[0.81111111 0.79775281 0.83146067 0.82022472 0.84269663 0.80898876
 0.83146067 0.80898876 0.83146067 0.86516854]

Average is ...
0.8249313358302123


#### Fitting

In [38]:
# Build a fresh imputer + SVC pipeline and fit it on the full training set.
# fit() returns the pipeline itself, so the bare name on the last line shows
# the same fitted-estimator repr as the cell output.
pipeline = make_pipeline(Imputer(), SVC()).fit(train_x, train_y)
pipeline

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [39]:
# The test features are every column except the passenger id; predict
# Survived for each test row with the fitted pipeline.
test_x = test_data.drop(labels=["PassengerId"], axis="columns")
prediction = pipeline.predict(test_x)

In [40]:
# Assemble the two-column submission (PassengerId, Survived) and write it
# to disk without the DataFrame index.
submission_columns = {
    "PassengerId": test_data["PassengerId"],
    "Survived": prediction,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv(
    '/Users/pbezuhov/git/Kaggle/submissions/titanic/10_svc.csv',
    index=False,
)