# Score 79.904%

Huge thank you to [Minsuk Heo](https://github.com/minsuk-heo) ([his work](https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb)) and [Yassine Ghouzam](https://www.kaggle.com/yassineghouzam) ([his work](https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling))!

A lot of this work was inspired by theirs.

### Imports

In [1]:
import pandas as pd
import numpy as np

### Data

In [5]:
# Read the two competition CSV files.
train = pd.read_csv('/Users/pbezuhov/git/Kaggle/data/titanic/train.csv')
test  = pd.read_csv('/Users/pbezuhov/git/Kaggle/data/titanic/test.csv')

# Remember how many rows came from train.csv so the combined frame can be
# split back apart after feature engineering.
train_N = len(train)

# Stack train on top of test with a fresh 0..n-1 index, so every
# transformation below is applied to both sets at once.
full = pd.concat([train, test], axis=0, ignore_index=True)
del train, test

# Represent every missing value as np.nan (values that are already NaN are
# left unchanged).
full = full.fillna(np.nan)

### Creating a Title Column

In [15]:
# Pulling the title information out of people's name (like mr, mrs, dr, etc).
# The pattern captures the run of letters that follows a space and precedes a
# dot. It is a raw string so that `\.` reaches the regex engine as an escaped
# literal dot instead of being an invalid Python string escape.
full['Title'] = full['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Numeric code for each title; several titles deliberately share a code.
title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 1, "Mlle": 1, "Mme": 1, "Ms": 3,
                 "Master": 2, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3 ,"Countess": 3,
                 "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3,"Capt": 3,"Sir": 3 }

# Any title that is not a key of title_mapping becomes NaN here.
full['Title'] = full['Title'].map(title_mapping)

# delete unnecessary feature from dataset
full = full.drop(columns='Name')

### Make categorical columns numeric

In [16]:
sex_mapping = {"male": 0, "female": 1}
full['Sex'] = full['Sex'].map(sex_mapping)

cabin_mapping = {"None": 0, "C": 1, "B": 2, "D": 3, "E": 4, "A": 5, "F": 5, "G": 5, "T": 5}
full['Cabin'] = full['Cabin'].map(cabin_mapping)

embarked_mapping = {"S": 0, "C": 1, "Q": 2}
full['Embarked'] = full['Embarked'].map(embarked_mapping)

### Fill NaN values

In [4]:
full["Age"].fillna(full.groupby("Title")["Age"].transform("median"), inplace=True)

#### The most embarked station is S, so it's safe to fill in nan values with S
full['Embarked'] = full['Embarked'].fillna('S')

# fill missing Fare with median fare for each Pclass
test["Fare"].fillna(training_data.groupby("Pclass")["Fare"].transform("median"), inplace=True)

full['Cabin'] = full['Cabin'].str[:1]
full["Cabin"].fillna("None", inplace=True)

### Ticket Column

In [6]:
## Treat Ticket by extracting the ticket prefix. When there is no prefix it returns X. 
def ticket_prefix(x):
    """Return the prefix of a ticket string, or "X" when there is none.

    A ticket made up solely of digits has no prefix and yields "X".
    Otherwise every "." and "/" is removed, surrounding whitespace is
    stripped, and the text before the first space is returned.
    """
    if x.isdigit():
        return "X"
    cleaned = x.replace(".", "").replace("/", "").strip()
    return cleaned.split(' ')[0]

# Replace each raw ticket string with its prefix ("X" when it has none) ...
ticket_prefixes = full["Ticket"].map(ticket_prefix)
full = full.assign(Ticket=ticket_prefixes)

# ... then one-hot encode the prefixes into T_<prefix> indicator columns.
full = pd.get_dummies(full, columns=["Ticket"], prefix="T")
full.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,T_SOTONO2,T_SOTONOQ,T_SP,T_STONO,T_STONO2,T_STONOQ,T_SWPP,T_WC,T_WEP,T_X
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,...,0,0,0,0,0,0,0,0,0,0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,...,0,0,0,0,0,0,0,0,0,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,...,0,0,0,0,1,0,0,0,0,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,...,0,0,0,0,0,0,0,0,0,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,...,0,0,0,0,0,0,0,0,0,1


### Split the data back into train and test data

In [None]:
# The first train_N rows of `full` came from train.csv, the rest from test.csv.
# Take an explicit copy / use a non-inplace drop so that both results are
# independent frames: modifying a bare slice of `full` in place triggers
# pandas' SettingWithCopyWarning.
train = full[:train_N].copy()
# The target column is not a feature of the test set, so remove it there.
test = full[train_N:].drop(columns=["Survived"])

del full, train_N

### Split the training data into target and predictors

In [None]:
# Cast the target to int. `assign` builds a new frame instead of writing into
# `train` in place, which avoids SettingWithCopyWarning if `train` is still a
# slice of the combined frame.
train = train.assign(Survived=train["Survived"].astype(int))

# Target vector and predictor matrix.
train_y = train["Survived"]
train_x = train.drop(columns=["Survived"])

# Modeling

### Cross validation score

In [37]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer (default strategy: column mean) is its replacement. It is
# imported under the old name so later cells that call `Imputer()` keep
# working, with a fallback for installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Ten shuffled folds with a fixed seed, matching the ten scores in the
# recorded output of this cell.
k_fold = KFold(n_splits=10, shuffle=True, random_state=1)

# Impute any remaining NaN, then classify with a default SVC.
pipeline = make_pipeline(Imputer(), SVC())

# PassengerId is an identifier, not a predictor, so it is dropped along with
# the target.
train_y = train['Survived'].astype(int)
train_x = train.drop(['Survived', 'PassengerId'], axis=1)

scoring = 'accuracy'
score = cross_val_score(pipeline, train_x, train_y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("\nAverage is ...")
print(sum(score) / len(score))

[0.81111111 0.79775281 0.83146067 0.82022472 0.84269663 0.80898876
 0.83146067 0.80898876 0.83146067 0.86516854]

Average is ...
0.8249313358302123


#### Fitting

In [38]:
# Build a fresh, unfitted imputer + SVC pipeline and train it on the whole
# training set (cross-validation above only fitted per-fold clones).
final_steps = (Imputer(), SVC())
pipeline = make_pipeline(*final_steps)
pipeline.fit(X=train_x, y=train_y)

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [39]:
# `test` is the test-set frame produced by the train/test split; the
# identifier column is not a predictor, so it is dropped before predicting.
test_x = test.drop(columns="PassengerId")
prediction = pipeline.predict(test_x)

In [40]:
# Pair every test passenger's id with its predicted label. `test` is the
# test-set frame produced by the train/test split.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction
    })

# index=False keeps the row index out of the submission file.
submission.to_csv('/Users/pbezuhov/git/Kaggle/submissions/titanic/10_svc.csv', index=False)