# Score 79.904%

Huge thank you to [Minsuk Heo](https://github.com/minsuk-heo) ([his work](https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb)) and [Yassine Ghouzam](https://www.kaggle.com/yassineghouzam) ([his work](https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling))!

A lot of this work was inspired by them

### Imports

In [1]:
import pandas as pd
import numpy as np

### Data

In [2]:
train = pd.read_csv('/Users/pbezuhov/git/Kaggle/data/titanic/train.csv')
test  = pd.read_csv('/Users/pbezuhov/git/Kaggle/data/titanic/test.csv')

### Remove outliers

### Concat train and test data for easier manipulation

In [None]:
full  = pd.concat([train, test], axis=0).reset_index(drop=True)

train_N = len(train)
del train, test

full = full.fillna(np.nan)

### Creating a Title Column

In [3]:
# Pulling the title information out of people's name (like mr, mrs, dr, etc)
full['Title'] = full['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 1, "Mlle": 1, "Mme": 1, "Ms": 3,
                 "Master": 2, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3 ,"Countess": 3,
                 "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3,"Capt": 3,"Sir": 3 }

full['Title'] = full['Title'].map(title_mapping)
    
# delete unnecessary feature from dataset
full.drop('Name', axis=1, inplace=True)

### Fill NaN values

In [4]:
#### Age
index_NaN_age = list(full["Age"][full["Age"].isnull()].index)
for i in index_NaN_age :
    age_med = full["Age"].median()
    age_pred = full["Age"][
                            ((full['SibSp'] == full.iloc[i]["SibSp"]) &
                             (full['Parch'] == full.iloc[i]["Parch"]) &
                             (full['Pclass'] == full.iloc[i]["Pclass"]))
                          ].median()
    if not np.isnan(age_pred) :
        full['Age'].iloc[i] = age_pred
    else :
        full['Age'].iloc[i] = age_med

#### Embarked
# The most embarked station is S, so it's safe to fill in nan values with S
full['Embarked'] = full['Embarked'].fillna('S')

#### Fare
# fill missing Fare with median fare for each Pclass
full["Fare"].fillna(full.groupby("Pclass")["Fare"].transform("median"), inplace=True)

#### Cabin
full['Cabin'] = full['Cabin'].str[:1]
full["Cabin"].fillna("None", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


### Create a family size column

Thanks again to Minsuk and Yassine!

In [17]:
# Create a family size descriptor from SibSp and Parch
full["Fsize"] = full["SibSp"] + full["Parch"] + 1

# Create new feature of family size
full['Single'] = full['Fsize'].map(lambda s: 1 if s == 1 else 0)
full['SmallF'] = full['Fsize'].map(lambda s: 1 if  s == 2  else 0)
full['MedF']   = full['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
full['LargeF'] = full['Fsize'].map(lambda s: 1 if s >= 5 else 0)

NameError: name 'full' is not defined

### Reduce Skew

In [5]:
# Apply log to Fare to reduce skewness distribution
full["Fare"] = full["Fare"].map(lambda i: np.log(i) if i > 0 else 0)

### Ticket Column

In [6]:
## Treat Ticket by extracting the ticket prefix. When there is no prefix it returns X. 
def ticket_prefix(x):
    if not x.isdigit() :
        return x.replace(".","").replace("/","").strip().split(' ')[0] #Take prefix
    else:
        return "X"

full["Ticket"] = full.Ticket.map(ticket_prefix)
full.head(5)

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title
0,22.0,,S,1.981001,0,1,3,male,1,0.0,A5,0
1,38.0,C,C,4.266662,0,2,1,female,1,1.0,PC,1
2,26.0,,S,2.070022,0,3,3,female,0,1.0,STONO2,1
3,35.0,C,S,3.972177,0,4,1,female,1,1.0,X,1
4,35.0,,S,2.085672,0,5,3,male,0,0.0,X,0


### Convert to dummy variables

In [7]:
# convert to indicator values Title and Embarked 
full = pd.get_dummies(full, columns = ["Title", "Sex", "Pclass"])
full = pd.get_dummies(full, columns = ["Ticket"],   prefix="T")
full = pd.get_dummies(full, columns = ["Embarked"], prefix="Em")
full = pd.get_dummies(full, columns = ["Cabin"],    prefix="Ca")

### Split the data back into train and test data

In [8]:
train = full[:train_N]
test = full[train_N:]
test.drop(labels=["Survived"], axis=1, inplace=True)

del full, train_N

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Split the training data into target and predictors

In [9]:
train["Survived"] = train["Survived"].astype(int)
train_y = train["Survived"]
train_x = train.drop(labels=["Survived", "PassengerId"], axis=1)

In [15]:
train_x.columns

Index(['Age', 'Fare', 'Parch', 'SibSp', 'Title_0', 'Title_1', 'Title_2',
       'Title_3', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'T_A', 'T_A4', 'T_A5', 'T_AQ3', 'T_AQ4', 'T_AS', 'T_C', 'T_CA',
       'T_CASOTON', 'T_FC', 'T_FCC', 'T_Fa', 'T_LINE', 'T_LP', 'T_PC', 'T_PP',
       'T_PPP', 'T_SC', 'T_SCA3', 'T_SCA4', 'T_SCAH', 'T_SCOW', 'T_SCPARIS',
       'T_SCParis', 'T_SOC', 'T_SOP', 'T_SOPP', 'T_SOTONO2', 'T_SOTONOQ',
       'T_SP', 'T_STONO', 'T_STONO2', 'T_STONOQ', 'T_SWPP', 'T_WC', 'T_WEP',
       'T_X', 'Em_C', 'Em_Q', 'Em_S', 'Ca_A', 'Ca_B', 'Ca_C', 'Ca_D', 'Ca_E',
       'Ca_F', 'Ca_G', 'Ca_None', 'Ca_T'],
      dtype='object')

# Modeling

### Cross validation score

In [10]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer

k_fold = KFold(shuffle=True, random_state=1)

pipeline = make_pipeline(Imputer(), SVC())

scoring = 'accuracy'
score = cross_val_score(pipeline, train_x, train_y, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print("\nAverage is ...")
print(sum(score) / len(score))

[ 0.77104377  0.82491582  0.84175084]

Average is ...
0.812570145903


#### Fitting

In [11]:
pipeline = make_pipeline(Imputer(), SVC())
pipeline.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [12]:
test_x = test.drop("PassengerId", axis=1)
prediction = pipeline.predict(test_x)

In [13]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction
    })

submission.to_csv('/Users/pbezuhov/git/Kaggle/submissions/titanic.csv', index=False)