In [1]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Data cleanup: training set
# Read the raw training data into a DataFrame.
train_df = pd.read_csv('train.csv')

In [6]:
# Sanity check: show the type of the object returned by read_csv.
type(train_df)

pandas.core.frame.DataFrame

In [7]:
# Preview the first five raw training rows (head() defaults to 5 rows).
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Recode string columns as integer codes and fill in missing values so the
# data set is complete.

# Gender: female -> 0, male -> 1
gender_codes = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].map(gender_codes).astype(int)

In [9]:
# Show the first three rows transposed so every column is visible.
train_df.iloc[:3].T

Unnamed: 0,0,1,2
PassengerId,1,2,3
Survived,0,1,1
Pclass,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina"
Sex,male,female,female
Age,22,38,26
SibSp,1,1,0
Parch,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282
Fare,7.25,71.2833,7.925


In [16]:
# Embarked is one of 'C', 'Q', 'S'.
# Passengers with no recorded port are assigned the most common port.
#
# The write goes through a single .loc call: chained indexing
# (df.col[mask] = value) may assign into a temporary copy and leave the
# frame unchanged.  The fill value is a scalar because mode() can return
# several tied values; the first one is taken.
embarked_missing = train_df['Embarked'].isnull()
if embarked_missing.any():
    most_common_port = train_df['Embarked'].dropna().mode().iloc[0]
    train_df.loc[embarked_missing, 'Embarked'] = most_common_port

In [29]:
# Number the distinct embarkation ports as (code, port) pairs in sorted order.
Ports = [(code, port) for code, port in enumerate(np.unique(train_df['Embarked']))]
 

In [30]:
# Inspect the (code, port) pairs.
Ports

[(0, 'C'), (1, 'Q'), (2, 'S')]

In [31]:
# Lookup table from port letter to its integer code.
Ports_dict = dict((port, code) for code, port in Ports)

In [32]:
# Inspect the port -> code lookup table.
Ports_dict

{'C': 0, 'Q': 1, 'S': 2}

In [33]:
# Replace each port letter with its integer code.  Re-running this cell fails
# with KeyError because the column then already holds the integer codes.
train_df['Embarked'] = train_df['Embarked'].map(lambda port: Ports_dict[port]).astype(int)

In [34]:
# Check the encoded Embarked values.
train_df['Embarked'].head(5)

0    2
1    0
2    2
3    2
4    2
Name: Embarked, dtype: int64

In [35]:
# Missing ages will be replaced by the median of the known training ages
# (median() skips missing values by default).
median_age = train_df['Age'].median()

In [38]:
# Fill missing ages with the median age.  The write goes through a single .loc
# call because chained indexing (df.col[mask] = value) may assign into a
# temporary copy and leave the frame unchanged.
age_missing = train_df['Age'].isnull()
if age_missing.any():
    train_df.loc[age_missing, 'Age'] = median_age

In [40]:
# Drop the columns that are not used as model features: Name, Ticket, Cabin,
# PassengerId, and Sex (already encoded as Gender).
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId']
train_df = train_df.drop(unused_columns, axis=1)

In [78]:
# Preview the cleaned training frame (head() defaults to 5 rows).
train_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,0,3,22.0,1,0,7.25,2,1
1,1,1,38.0,1,0,71.2833,0,0
2,1,3,26.0,0,0,7.925,2,0
3,1,1,35.0,1,0,53.1,2,0
4,0,3,35.0,0,0,8.05,2,1


In [43]:
# Data cleanup: test set
# Read the raw test data; the first line of the file is the header row.
test_df = pd.read_csv('test.csv', header=0)

In [44]:
# Recode the string columns as integer codes, as was done for the training set.
# Gender: female -> 0, male -> 1
test_gender_codes = {'female': 0, 'male': 1}
test_df['Gender'] = test_df['Sex'].map(test_gender_codes).astype(int)

In [46]:
# List the test rows that have no embarkation port recorded.
test_df.loc[test_df['Embarked'].isnull(), 'Embarked']

Series([], Name: Embarked, dtype: object)

In [47]:
# Embarked is one of 'C', 'Q', 'S'.
# Test passengers with no recorded port are assigned the most common port in
# the test set.
#
# The write goes through a single .loc call: chained indexing
# (df.col[mask] = value) may assign into a temporary copy and leave the
# frame unchanged.  The fill value is a scalar because mode() can return
# several tied values; the first one is taken.
test_embarked_missing = test_df['Embarked'].isnull()
if test_embarked_missing.any():
    test_most_common_port = test_df['Embarked'].dropna().mode().iloc[0]
    test_df.loc[test_embarked_missing, 'Embarked'] = test_most_common_port

In [48]:
# Encode the test ports with the same letter -> code table built from the
# training data, so both sets share one encoding.
test_df['Embarked'] = test_df['Embarked'].map(lambda port: Ports_dict[port]).astype(int)

In [50]:
# Check the encoded Embarked values (head() defaults to 5 rows).
test_df['Embarked'].head()

0    1
1    2
2    1
3    2
4    2
Name: Embarked, dtype: int64

In [52]:
# Number of test passengers with no age recorded.
int(test_df['Age'].isnull().sum())

86

In [54]:
# Fill missing test ages with median_age, the median computed from the
# training set.  The write goes through a single .loc call because chained
# indexing (df.col[mask] = value) may assign into a temporary copy and leave
# the frame unchanged.
test_age_missing = test_df['Age'].isnull()
if test_age_missing.any():
    test_df.loc[test_age_missing, 'Age'] = median_age

In [55]:
# Confirm that no test ages are missing after the fill.
int(test_df['Age'].isnull().sum())

0

In [57]:
# List the test rows that have no fare recorded.
test_df.loc[test_df['Fare'].isnull(), 'Fare']

152   NaN
Name: Fare, dtype: float64

In [58]:
# Show the first five test rows transposed so every column is visible.
test_df.head(5).T

Unnamed: 0,0,1,2,3,4
PassengerId,892,893,894,895,896
Pclass,3,3,2,3,3
Name,"Kelly, Mr. James","Wilkes, Mrs. James (Ellen Needs)","Myles, Mr. Thomas Francis","Wirz, Mr. Albert","Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
Sex,male,female,male,male,female
Age,34.5,47,62,27,22
SibSp,0,1,0,0,1
Parch,0,0,0,0,1
Ticket,330911,363272,240276,315154,3101298
Fare,7.8292,7,9.6875,8.6625,12.2875
Cabin,,,,,


In [61]:
# Fill missing fares with the median fare of the passenger's class.
# median_fare[k] holds the median for Pclass k + 1 (classes 1, 2 and 3).
#
# All reads and writes go through .loc: chained indexing
# (df.col[mask] = value) may assign into a temporary copy and leave the
# frame unchanged.
fare_missing = test_df['Fare'].isnull()
if fare_missing.any():
    median_fare = np.zeros(3)

    for f in range(3):
        in_class = test_df['Pclass'] == f + 1
        # median() skips missing fares, so the median is taken over the known
        # fares of this class before any gaps in the class are filled.
        median_fare[f] = test_df.loc[in_class, 'Fare'].median()
        test_df.loc[fare_missing & in_class, 'Fare'] = median_fare[f]
    

In [92]:
# Keep the test-set PassengerIds for the submission file.  This has to run
# before the cell that drops the PassengerId column; after the drop the lookup
# raises KeyError.
ids = test_df.loc[:, 'PassengerId'].values


KeyError: 'PassengerId'

In [93]:
# Number of passenger ids collected.
ids.shape[0]

418

In [64]:
# Drop the same non-feature columns as for the training set: Name, Ticket,
# Cabin, PassengerId, and Sex (already encoded as Gender).
test_unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId']
test_df = test_df.drop(test_unused_columns, axis=1)

In [77]:
# Preview the first three rows of the cleaned test frame.
test_df.iloc[:3]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,3,34.5,0,0,7.8292,1,1
1,3,47.0,1,0,7.0,2,0
2,2,62.0,0,0,9.6875,1,1


In [82]:
# The data is ready: fit on the training set, then predict on the test set.
# Both frames are converted to plain NumPy arrays first.
train_data, test_data = train_df.values, test_df.values

In [90]:
# (rows, columns) of the test feature array.
test_data.shape

(418, 7)

In [83]:
# Reminder of the training column order: Survived comes first.
train_df.iloc[:3]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Gender
0,0,3,22.0,1,0,7.25,2,1
1,1,1,38.0,1,0,71.2833,0,0
2,1,3,26.0,0,0,7.925,2,0


In [84]:
# Fit a 100-tree random forest.  Column 0 of train_data is Survived (the
# label); the remaining columns are the features.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('Training...')
# A fixed random_state makes the fitted forest, and therefore the predictions,
# reproducible from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

Training...


In [85]:
# Predict survival for every test passenger and store the labels as integers.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('Predicting...')
output = forest.predict(test_data).astype(int)

Predicting...


In [86]:
# Number of predictions produced.
output.shape[0]

418

In [89]:
# Write the submission file: a header row followed by one
# (PassengerId, Survived) row per test passenger.
#
# The csv module needs a binary-mode file on Python 2 but a text-mode file
# opened with newline="" on Python 3.  `str is bytes` is true only on Python 2.
if str is bytes:
    predictions_file = open("myfirstforest_rg.csv", "wb")
else:
    predictions_file = open("myfirstforest_rg.csv", "w", newline="")

# Opening, writing and closing happen in one cell, and the with-block closes
# the file even if writing fails part-way through.
with predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))

print('Done')

Done
