**Overview**
https://www.kaggle.com/c/titanic/data

In [90]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [91]:
#read data
train = pd.read_csv("~/Desktop/Kaggle/Titanic/train.csv")
test = pd.read_csv("~/Desktop/kaggle/Titanic/test.csv")

# Data Dictionary 

Variable Name | Description
--------------|-------------
Survived      | Survived (1) or died (0)
Pclass        | Passenger's class  (1 = 1st, 2 = 2nd, 3 = 3rd)  
Name          | Passenger's name
Sex           | Passenger's sex
Age           | Passenger's age
SibSp         | Number of siblings/spouses aboard
Parch         | Number of parents/children aboard
Ticket        | Ticket number
Fare          | Fare
Cabin         | Cabin
Embarked      | Port of embarkation(C = Cherbourg, Q = Queenstown, S = Southampton Variable Notes)

In [92]:
#see train data
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [93]:
#see test data
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [94]:
#check if there is misssing data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


** Age, Cabin Embarked contains null data.**

In [95]:
#replace characters to numbers 
#?????? train.Embarked = train.Embarked.replace(['C', 'S', 'Q'],[0, 1, 2])
#train.Cabin = train.Cabin.replace('NaN',0)
#train.Sex = train.Sex.replace(['male', 'female'],[0, 1])
#train.Age = train.Age.replace('NaN',0)

train = train.replace("male",0).replace("female",1).replace("C",0).replace("S",1).replace("Q",2)
test = test.replace("male",0).replace("female",1).replace("C",0).replace("S",1).replace("Q",2)

In [96]:
#take care of misssing value with mean (ignore Cabin)
train["Age"].fillna(train.Age.mean(), inplace=True)
train["Embarked"].fillna(train.Embarked.mean(), inplace=True)

In [97]:
#modify names
combine1 = [train]
for train in combine1:
    train['Salutation'] = train.Name.str.extract('([A-Za-z]+).', expand=False) #what does . means?
for train in combine:
        train['Salutation'] = train['Salutation'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkeer', 'Dona'], 'Rare')
    
for train in combine1: 
        train['Salutation'] = train['Salutation'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        train['Salutation'] = train['Salutation'].replace('Mlle', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Ms', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Mme', 'Mrs')
        del train['Name']
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 
for train in combine1: 
        train['Salutation'] = train['Salutation'].map(Salutation_mapping) 
        train['Salutation'] = train['Salutation'].fillna(0)


In [98]:
train['Name'] 

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Wil

In [99]:
train['Salutation'] 

0           Braund
1          Cumings
2        Heikkinen
3         Futrelle
4            Allen
5            Moran
6         McCarthy
7          Palsson
8          Johnson
9           Nasser
10       Sandstrom
11         Bonnell
12     Saundercock
13       Andersson
14         Vestrom
15         Hewlett
16            Rice
17        Williams
18          Vander
19      Masselmani
20          Fynney
21         Beesley
22         McGowan
23          Sloper
24         Palsson
25         Asplund
26            Emir
27         Fortune
28               O
29        Todoroff
          ...     
861          Giles
862          Swift
863           Sage
864           Gill
865        Bystrom
866          Duran
867       Roebling
868            van
869        Johnson
870         Balkic
871       Beckwith
872       Carlsson
873         Vander
874        Abelson
875          Najib
876     Gustafsson
877        Petroff
878         Laleff
879         Potter
880        Shelley
881         Markun
882       Da

In [68]:
combine1 = [train]

for train in combine1: 
        train['Salutation'] = train.Name.str.extract(' ([A-Za-z]+).', expand=False) 
for train in combine1: 
        train['Salutation'] = train['Salutation'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        train['Salutation'] = train['Salutation'].replace('Mlle', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Ms', 'Miss')
        train['Salutation'] = train['Salutation'].replace('Mme', 'Mrs')
        del train['Name']
Salutation_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 
for train in combine1: 
        train['Salutation'] = train['Salutation'].map(Salutation_mapping) 
        train['Salutation'] = train['Salutation'].fillna(0)