In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


In [2]:
#fill nan embarked with mode
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)

In [3]:
#drop cabin since only 204 / 891 available (for df_train)
df_train.drop(['Cabin','PassengerId','Ticket'],axis=1,inplace=True)
df_test.drop(['Cabin','PassengerId','Ticket'],axis=1,inplace=True)

In [8]:
#merge data set
targets = df_train['Survived']
df_train.drop('Survived', 1, inplace=True)

df = df_train.append(df_test)
df.reset_index(inplace=True)
df.drop('index', inplace=True, axis=1)

In [9]:
#fill missing fare with median fare
median_fare = df.loc[(df['Embarked'] == 'S') & (df['Pclass'] == 3)].median()['Fare']
df['Fare'].fillna(median_fare, inplace=True)

In [10]:
#name
import re

def replaceNameWithTitle(name):
    match = re.search("(Dr|Mr|Mrs|Dona|Ms|Miss|Master|Rev|Capt|Mlle|Col|Major|Sir|Jonkheer|Lady|the Countess|Mme|Don)",name)
    if match:
        return match.group(0)
    else:
        return "Other"
    
df['Title'] = df['Name'].apply(replaceNameWithTitle)  

Title_Dictionary = {
                        "Capt":       "Officer",
                        "Col":        "Officer",
                        "Major":      "Officer",
                        "Jonkheer":   "Royalty",
                        "Don":        "Royalty",
                        "Sir" :       "Royalty",
                        "Dr":         "Officer",
                        "Rev":        "Officer",
                        "the Countess":"Royalty",
                        "Dona":       "Royalty",
                        "Mme":        "Mrs",
                        "Mlle":       "Miss",
                        "Ms":         "Mrs",
                        "Mr" :        "Mr",
                        "Mrs" :       "Mrs",
                        "Miss" :      "Miss",
                        "Master" :    "Master",
                        "Lady" :      "Royalty"}

df['Title'] = df['Title'].map(Title_Dictionary)

title_dummies  = pd.get_dummies(df['Title'])
df = df.join(title_dummies)

df.drop(['Name','Title'],axis=1, inplace=True)

In [11]:
#pclass
# create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
pclass_dummies  = pd.get_dummies(df['Pclass'])
pclass_dummies.columns = ['Class_1','Class_2','Class_3']

##pclass_dummies.drop(['Class_3'], axis=1, inplace=True)

df.drop(['Pclass'],axis=1,inplace=True)

df = df.join(pclass_dummies)

In [12]:
#family

df['Family'] = df["Parch"] + df["SibSp"]
df['Family'].loc[df['Family'] > 0] = 1
df['Family'].loc[df['Family'] == 0] = 0

#drop
df = df.drop(['SibSp','Parch'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
#sex
def set_person(passenger):
    age,sex = passenger
    return 'Child' if age < 16 else sex

df['Person'] = df[['Age','Sex']].apply(set_person, axis=1)
df.drop(['Sex'],axis=1,inplace=True)

person_dummies  = pd.get_dummies(df['Person'])
person_dummies.columns = ['Child','Female','Male']

df = df.join(person_dummies)
df.drop(['Person'],axis=1,inplace=True)

In [14]:
#embarked
embark_dummies = pd.get_dummies(df['Embarked'])
##embark_dummies.drop(['S'], axis=1, inplace=True)

df.drop(['Embarked'], axis=1,inplace=True)

df = df.join(embark_dummies)

In [19]:
#age
from sklearn.ensemble import RandomForestRegressor
 
### Populate missing ages  using RandomForestClassifier
def generateMissingAge(df):
    
    # Grab all the features that can be included in a Random Forest Regressor
    age_df = df[['Age','Fare','Master','Miss','Mr','Mrs','Officer','Royalty','Class_1','Class_2','Class_3','Family','Child','Female','Male','C','Q','S']]
    
    # Split into sets with known and unknown Age values
    knownAge = age_df.loc[ (df.Age.notnull()) ]
    unknownAge = age_df.loc[ (df.Age.isnull()) ]
    
    # All age values are stored in a target array
    y = knownAge.values[:, 0]
    
    # All the other values are stored in the feature array
    X = knownAge.values[:, 1::]
    
    # Create and fit a model
    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    rtr.fit(X, y)
    
    # Use the fitted model to predict the missing values
    predictedAges = rtr.predict(unknownAge.values[:, 1::])
    
    # Assign those predictions to the full data set
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges 
    

In [16]:
df.head()

Unnamed: 0,Age,Fare,Master,Miss,Mr,Mrs,Officer,Royalty,Class_1,Class_2,Class_3,Family,Child,Female,Male,C,Q,S
0,22.0,7.25,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1
1,38.0,71.2833,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0
2,26.0,7.925,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1
3,35.0,53.1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1
4,35.0,8.05,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1


In [20]:
generateMissingAge(df);

Unnamed: 0,Age,Fare,Master,Miss,Mr,Mrs,Officer,Royalty,Class_1,Class_2,Class_3,Family,Child,Female,Male,C,Q,S
0,22.000000,7.2500,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1
1,38.000000,71.2833,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0
2,26.000000,7.9250,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1
3,35.000000,53.1000,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,1
4,35.000000,8.0500,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1
5,28.646116,8.4583,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0
6,54.000000,51.8625,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1
7,2.000000,21.0750,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1
8,27.000000,11.1333,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,1
9,14.000000,30.0708,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0


In [24]:
def recover_train_test_target():
    
    train = df.head(891)
    test = df.iloc[891:]
    
    return train, test

train, test = recover_train_test_target()

In [27]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(train, targets)

Y_pred = random_forest.predict(test)

random_forest.score(train, targets)

0.98428731762065091