# Score 79.904%

- Huge thank you to Minsuk Heo! Most of this is his work; I hope to expand on it in future submissions.
https://github.com/minsuk-heo
https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb

#### Imports and loading data

In [1]:
import pandas as pd

# Load the Titanic training and test sets from the shared data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../../data/titanic/train.csv', '../../data/titanic/test.csv')
)

#### Pulling the title out of each person's name (e.g. Mr, Mrs, Dr)

In [2]:
# Keep both frames in one list so each feature-engineering step below can be
# applied to the training and test sets in a single loop.
full = [train, test]

# Pull the honorific out of each name: the run of letters that follows a space
# and ends with a period.  The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid string escape.
for dataset in full:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Mr, Miss and Mrs get their own codes; every other listed title shares code 3.
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2,
    "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3,
    "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3, "Capt": 3, "Sir": 3,
}

# Titles that are not keys of the mapping become NaN after .map().
for dataset in full:
    dataset['Title'] = dataset['Title'].map(title_mapping)

# The raw name is no longer needed once the title has been extracted.  Drop it
# in place so the frames referenced by `full` remain the same objects.
for dataset in full:
    dataset.drop('Name', axis=1, inplace=True)

#### Make the categorical sex column into a numerical one

In [3]:
# Encode the Sex column numerically: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
for dataset in full:
    encoded_sex = dataset['Sex'].map(sex_mapping)
    dataset['Sex'] = encoded_sex

#### Fill in empty ages with the median value from the group

- It might help to add a column to the table marking which rows had their age filled in

In [4]:
# Fill missing ages with the median age of the passengers that share the same
# Title code, computed separately within each frame.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas and can leave the parent frame unchanged.
for dataset in (train, test):
    median_age_by_title = dataset.groupby("Title")["Age"].transform("median")
    dataset["Age"] = dataset["Age"].fillna(median_age_by_title)

#### The most common embarkation port is S, so it's reasonably safe to fill missing (NaN) values with S

- This rubs me the wrong way though, I think another column should definitely be added

In [5]:
# Replace missing embarkation values with 'S' in both frames.
for dataset in full:
    embarked_filled = dataset['Embarked'].fillna(value='S')
    dataset['Embarked'] = embarked_filled

#### If a fare is missing it is replaced with the median fare for the individual's pclass

In [6]:
# Fill missing fares with the median fare of the passenger's Pclass, computed
# separately within each frame.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated in pandas and can leave the parent frame unchanged.
for dataset in (train, test):
    median_fare_by_class = dataset.groupby("Pclass")["Fare"].transform("median")
    dataset["Fare"] = dataset["Fare"].fillna(median_fare_by_class)

#### The first character of each cabin value is pulled out; missing cabins become "None"

In [7]:
for dataset in full:
    # Keep only the first character of each cabin value.  Missing cabins stay
    # NaN through the slice and are then labelled with the string "None".
    # The result is assigned back to the column rather than filled through a
    # chained fillna(inplace=True), which is deprecated in pandas and can
    # leave the parent frame unchanged.
    dataset['Cabin'] = dataset['Cabin'].str[:1].fillna("None")

#### Age adjustments

In [8]:
# Replace each age with an ordinal bucket code:
#   <=16 -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, >62 -> 4.
# Missing ages match no mask and therefore stay NaN.
for dataset in full:
    # Build every mask from a snapshot of the raw ages so that codes written by
    # an earlier line can never be re-classified by a later one.
    age = dataset['Age'].copy()
    # Plain scalars are assigned here: a trailing comma after the value would
    # turn the right-hand side into a one-element tuple, which pandas treats
    # as a list-like rather than a scalar.
    dataset.loc[age <= 16, 'Age'] = 0
    dataset.loc[(age > 16) & (age <= 26), 'Age'] = 1
    dataset.loc[(age > 26) & (age <= 36), 'Age'] = 2
    dataset.loc[(age > 36) & (age <= 62), 'Age'] = 3
    dataset.loc[age > 62, 'Age'] = 4

#### Fare adjustments

In [9]:
# Replace each fare with an ordinal bucket code:
#   <=17 -> 0, (17, 30] -> 1, (30, 100] -> 2, >100 -> 3.
# Missing fares match no mask and therefore stay NaN.
for dataset in full:
    # Build every mask from a snapshot of the raw fares so that codes written
    # by an earlier line can never be re-classified by a later one.
    fare = dataset['Fare'].copy()
    # Plain scalars are assigned here: a trailing comma after the value would
    # turn the right-hand side into a one-element tuple, which pandas treats
    # as a list-like rather than a scalar.
    dataset.loc[fare <= 17, 'Fare'] = 0
    dataset.loc[(fare > 17) & (fare <= 30), 'Fare'] = 1
    dataset.loc[(fare > 30) & (fare <= 100), 'Fare'] = 2
    dataset.loc[fare > 100, 'Fare'] = 3

#### Encoding Cabin and Embarked as numbers

In [10]:
# Encode the cabin letter numerically.  "None" (no cabin recorded) is 0,
# C/B/D/E get codes 1-4, and the remaining letters share code 5.
cabin_mapping = {"None": 0, "C": 1, "B": 2, "D": 3, "E": 4}
cabin_mapping.update(dict.fromkeys(("A", "F", "G", "T"), 5))
for dataset in full:
    cabin_codes = dataset['Cabin'].map(cabin_mapping)
    dataset['Cabin'] = cabin_codes

In [11]:
# Encode the embarkation value numerically: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {port: code for code, port in enumerate(("S", "C", "Q"))}
for dataset in full:
    embarked_codes = dataset['Embarked'].map(embarked_mapping)
    dataset['Embarked'] = embarked_codes

#### The ticket column seems to have no information so I drop it

In [12]:
# Remove the Ticket column from both frames.  drop() returns new frames, so
# `train` and `test` are rebound to the results.
train, test = (
    frame.drop(labels=['Ticket'], axis='columns')
    for frame in (train, test)
)

#### Cross validation score

In [18]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Ten shuffled folds; the fixed random_state keeps the split reproducible.
k_fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC  # SVC is used below but was never imported

try:
    # Current scikit-learn ships the imputer as sklearn.impute.SimpleImputer.
    # It is aliased to `Imputer` so later cells that call Imputer() keep working.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn releases only provide the original class.
    from sklearn.preprocessing import Imputer

# Impute any remaining NaN feature values (default strategy) before the SVM.
pipeline = make_pipeline(Imputer(), SVC())

# Target column, and the feature matrix without the target and the passenger id.
train_y = train['Survived']
train_x = train.drop(['Survived', 'PassengerId'], axis=1)

# Accuracy of the pipeline on each fold of `k_fold`, then the mean over folds.
scoring = 'accuracy'
score = cross_val_score(pipeline, train_x, train_y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("\nAverage is ...")
print(sum(score) / len(score))

[ 0.81111111  0.79775281  0.83146067  0.82022472  0.84269663  0.80898876
  0.83146067  0.80898876  0.83146067  0.86516854]

Average is ...
0.82493133583


#### Fitting

In [15]:
# Build a fresh imputer + SVC pipeline and fit it on the full training data.
imputer_step = Imputer()
svc_step = SVC()
pipeline = make_pipeline(imputer_step, svc_step)
pipeline.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [16]:
# Feature matrix for the test set: every column except the passenger id.
test_x = test.drop("PassengerId", axis=1)
# Predict with the fitted `pipeline`.  The previously referenced `ExtC_best`
# is not defined anywhere in the visible notebook and would raise NameError.
prediction = pipeline.predict(test_x)


In [18]:
# Pair each test passenger id with its predicted label and write the
# submission file without the index column.
submission_columns = {
    "PassengerId": test["PassengerId"],
    "Survived": prediction,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(path_or_buf='../../submissions/max_score.csv', index=False)