In [23]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

In [3]:
#Loading the training set data
df_training = pd.read_csv("train.csv")
print(df_training.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
df_training.shape

(891, 12)

In [5]:
df_training.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Cleaning
# We will remove ‘Cabin’, ‘Name’ and ‘Ticket’ columns 
df_training_dropped = df_training.drop(['Cabin', 'Name', 'Ticket'], axis=1) 
df_training_dropped.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [7]:
# Fill the missing age values by the mean value
# Filling missing Age values with mean
df_training_dropped['Age'] = df_training_dropped['Age'].fillna(df_training_dropped['Age'].mean())

In [8]:
# Filling missing Embarked values with most common value
df_training_dropped['Embarked'] = df_training_dropped['Embarked'].fillna(df_training_dropped['Embarked'].mode()[0])

In [9]:
df_training_dropped['Embarked'].mode()[0]

'S'

In [10]:
# check wich embark value appears the most (mode)
df_training_dropped['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
# ‘Pclass’ is a categorical feature so we convert its values to strings

df_training_dropped['Pclass'] = df_training_dropped['Pclass'].apply(str)

In [12]:
# one hot encoding of categorical features
df_training_dropped.dtypes

PassengerId      int64
Survived         int64
Pclass          object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [13]:
df_training_dummied = pd.get_dummies(df_training_dropped, columns=["Pclass", 'Sex', 'Embarked'])


In [14]:
df_training_dummied.head(3)

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1


In [15]:
X_df = df_training_dummied.drop('Survived', axis=1)
y_df = df_training_dummied['Survived']

In [18]:
# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size = 0.25, random_state =42)

In [19]:
# instantiate the RF classifier
# Set the random state for reproducibility
clf = RandomForestClassifier(random_state=42)

In [20]:
# train the algorithm utilizing the training and target class
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [21]:
# predict the test set
y_pred = clf.predict(X_test)

In [24]:
# comparing actual response values (y_test) with predicted response values (y_pred)
print("model accuracy:", metrics.accuracy_score(y_test, y_pred)* 100)

model accuracy: 83.85650224215246


In [25]:
# Prepare test dataset
df_testing = pd.read_csv("test.csv")

In [26]:
df_testing.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [27]:
df_testing.shape

(418, 11)

In [28]:
df_testing.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [29]:
# Cleaning
# We will remove ‘Cabin’, ‘Name’ and ‘Ticket’ columns 
df_testing_dropped = df_testing.drop(['Cabin', 'Name', 'Ticket'], axis=1) 
df_testing_dropped.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [30]:
# Fill the missing age values by the mean value
# Filling missing Age values with mean
df_testing_dropped['Age'] = df_testing_dropped['Age'].fillna(df_testing_dropped['Age'].mean())

In [31]:
# Filling missing Embarked values with most common value
df_testing_dropped['Embarked'] = df_testing_dropped['Embarked'].fillna(df_testing_dropped['Embarked'].mode()[0])

In [32]:
df_training_dropped['Embarked'].mode()[0]

'S'

In [33]:
# check wich embark value appears the most (mode)
df_testing_dropped['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [34]:
# ‘Pclass’ is a categorical feature so we convert its values to strings

df_testing_dropped['Pclass'] = df_testing_dropped['Pclass'].apply(str)

In [35]:
# one hot encoding of categorical features
df_testing_dropped.dtypes

PassengerId      int64
Pclass          object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [36]:
# one hot encoding of categorical features
df_testing_dropped.dtypes

PassengerId      int64
Pclass          object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [37]:
pass_id = df_testing_dropped['PassengerId']

In [38]:
df_testing_dummied = pd.get_dummies(df_testing_dropped, columns=["Pclass", 'Sex', 'Embarked'])


In [39]:
df_testing_dummied.head(3)

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,893,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,894,62.0,0,0,9.6875,0,1,0,0,1,0,1,0


In [40]:
df_testing_dummied.isnull().any()

PassengerId    False
Age            False
SibSp          False
Parch          False
Fare            True
Pclass_1       False
Pclass_2       False
Pclass_3       False
Sex_female     False
Sex_male       False
Embarked_C     False
Embarked_Q     False
Embarked_S     False
dtype: bool

In [41]:
null_col = df_testing_dummied.columns[df_testing_dummied.isnull().any()]

In [42]:
df_testing_dummied[null_col] = df_testing_dummied[null_col].replace(np.nan,0)

In [43]:
df_testing_dummied.isnull().any()

PassengerId    False
Age            False
SibSp          False
Parch          False
Fare           False
Pclass_1       False
Pclass_2       False
Pclass_3       False
Sex_female     False
Sex_male       False
Embarked_C     False
Embarked_Q     False
Embarked_S     False
dtype: bool

In [44]:
y_output = clf.predict(df_testing_dummied)

In [45]:
y_output.shape

(418,)

In [46]:
y_output

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [47]:
data = list(zip(pass_id, y_output))

In [48]:
new = pd.DataFrame(data, columns= ['Passenger ID','Survived'])

In [49]:
new.head(5)

Unnamed: 0,Passenger ID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [50]:
# Writing file:
new.to_csv('Titanic_predictions.csv', index=False)