## Building Predictive Models

In [2]:
import pandas as pd
import os
import numpy as np

### Import Data 

In [3]:
#Processed CSVs live in <parent dir>/data/processed; derive both file paths from that folder
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = [
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
]

In [4]:
#Load the processed train and test CSVs, indexing each dataframe by PassengerId
train_df, test_df = [
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
]

In [5]:
#Summarize the training dataframe: column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 34 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [6]:
#Summarize the test dataframe: column names, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 33 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Offcer          418 n

### Data Preparation: split the training dataset into train and test sets. The actual test dataset doesn't have a Survived column, so it cannot be used to train or evaluate a supervised model

In [7]:
#Extract all columns from train_df, starting from the Age column, as a float matrix.
#DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
#.values returns the same ndarray and works on both old and new pandas versions.
X = train_df.loc[:, 'Age':].values.astype('float')
print(X.shape)


(891, 33)


In [8]:
#Extract only the Survived column and put it in y as a 1-D array.
#Series.ravel() is deprecated since pandas 2.2; .values already yields the flat ndarray.
y = train_df['Survived'].values

In [9]:
#Confirm that y is one-dimensional with one label per row of X
print(y.shape)

(891,)


In [10]:
#Hold out 20% of the labelled rows for validation; random_state=0 fixes the shuffle seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)
#Show the (features, labels) shapes of each split
print('{0} {1}'.format(X_train.shape, y_train.shape))
print('{0} {1}'.format(X_test.shape, y_test.shape))

(712, 33) (712,)
(179, 33) (179,)


### Train a LOGISTIC REGRESSION model on the training data. Use LogisticRegression( ) from scikit-learn library 

In [11]:
#train the model 
import sklearn

In [12]:
#Check the installed scikit-learn version: this notebook expects 0.19 or higher.
#If it is older, upgrade the package and restart the kernel before continuing.
sklearn.__version__

'0.19.1'

In [39]:
#import function
from sklearn.linear_model import LogisticRegression

In [58]:
#Instantiate the LogisticRegression model; random_state=0 fixes the seed handed to the estimator
model_logreg = LogisticRegression(random_state=0)


In [59]:
#Fit the logistic regression model on the training split
model_logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
#Predict labels for the held-out split
model_logreg.predict(X_test)

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [99]:
#Score the fitted model on the data it was trained on (score() is mean accuracy for scikit-learn classifiers)
model_logreg.score(X_train, y_train)

0.8426966292134831

In [100]:
#Score the fitted model on the held-out split, which it did not see during fitting
model_logreg.score(X_test, y_test)

0.8324022346368715

In [101]:
#Report the held-out score in a labelled message
print('score for logistic regression on titanic dataset - version 1 : {0}'.format(model_logreg.score(X_test, y_test)))

score for logistic regression on titanic dataset - version 1 : 0.8324022346368715


In [102]:
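#accuracy_score expects (y_true, y_pred), so pass the model's predictions rather than X_test:
#from sklearn.metrics import accuracy_score
#print('Accuracy score for logistic regression on titanic dataset - version 1 : {0:.2f}'.format(accuracy_score(y_test, model_logreg.predict(X_test))))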

In [103]:
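#confusion_matrix is a function taking (y_true, y_pred); it has no .score attribute:
#from sklearn.metrics import confusion_matrix
#print('confusion matrix for logistic regression on titanic dataset - version 1 : \n {0}'.format(confusion_matrix(y_test, model_logreg.predict(X_test))))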

In [104]:
#Inspect the fitted coefficients: one weight per feature column of X
model_logreg.coef_

array([[-0.02832818,  0.00456758, -0.49915892,  0.61420685, -0.79119899,
         0.10736693, -0.15870957, -0.387724  ,  0.52082146,  1.09808616,
         0.40534955, -0.18054496, -0.30000935,  0.96898   ,  0.48239473,
        -0.34673851,  0.27752236,  1.19244202,  0.55467461, -1.47687232,
         1.06363823,  0.25896307, -0.28253931, -0.48319245,  0.16402179,
         0.25016121,  0.28211457,  0.40833865,  0.48665062,  0.46708005,
         0.15090555,  0.37416567,  0.73047055]])

### Kaggle Submission for Titanic with Logistic Regression

In [105]:
#define kaggle submission function
def get_submission_file(model, filename, test_data=None, output_dir=None):
    """Predict on the Kaggle test features and write a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``; it receives the test features
        as a float matrix.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    test_data : pandas.DataFrame, optional
        Feature dataframe whose index supplies the PassengerId column.
        Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory for the submission file. Defaults to ``../data/external``.

    Returns
    -------
    None
        The CSV is written with the columns PassengerId and Survived.
    """
    if test_data is None:
        test_data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')

    #convert to matrix; DataFrame.as_matrix() was removed in pandas 1.0, .values works everywhere
    test_X = test_data.values.astype('float')

    #make predictions
    predictions = model.predict(test_X)

    #submission dataframe; columns= pins the column order regardless of pandas' dict handling
    df_submission = pd.DataFrame(
        {'PassengerId': test_data.index, 'Survived': predictions},
        columns=['PassengerId', 'Survived'],
    )

    #submission file; create the target folder first so to_csv cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)

    #write to the file, index=False will suppress the index column 0,1,2...
    df_submission.to_csv(submission_file_path, index=False)
    

In [106]:
#Write the Kaggle submission file from the logistic regression model's predictions
get_submission_file(model_logreg, 'titanic_logreg_kaggle_sub.csv')

## Train a DECISION TREE model on the training data. Use DecisionTreeClassifier( ) from scikit-learn library 

In [107]:
from sklearn.tree import DecisionTreeClassifier

In [108]:
#Instantiate the decision tree classifier; random_state=0 fixes the seed handed to the estimator
model_decision_tree = DecisionTreeClassifier(random_state = 0)

In [109]:
#Fit the decision tree on the training split
model_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [110]:
#Predict labels for the held-out split
model_decision_tree.predict(X_test)

array([0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1])

In [111]:
#Score the decision tree on the data it was trained on
model_decision_tree.score(X_train, y_train)

0.9845505617977528

In [112]:
#Score the decision tree on the held-out split; in the recorded run this is well below
#the training score, which suggests the unconstrained tree overfits
model_decision_tree.score(X_test, y_test)

0.7877094972067039

### Kaggle Submission for Titanic with Decision Tree

In [119]:
#define kaggle submission function
#NOTE: get_submission_file is also defined earlier in this notebook; this cell re-defines
#(shadows) it. Defining the helper once would be enough.
def get_submission_file(model, filename, test_data=None, output_dir=None):
    """Predict on the Kaggle test features and write a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``; it receives the test features
        as a float matrix.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    test_data : pandas.DataFrame, optional
        Feature dataframe whose index supplies the PassengerId column.
        Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory for the submission file. Defaults to ``../data/external``.

    Returns
    -------
    None
        The CSV is written with the columns PassengerId and Survived.
    """
    if test_data is None:
        test_data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')

    #convert to matrix; DataFrame.as_matrix() was removed in pandas 1.0, .values works everywhere
    test_X = test_data.values.astype('float')

    #make predictions
    predictions = model.predict(test_X)

    #submission dataframe; columns= pins the column order regardless of pandas' dict handling
    df_submission = pd.DataFrame(
        {'PassengerId': test_data.index, 'Survived': predictions},
        columns=['PassengerId', 'Survived'],
    )

    #submission file; create the target folder first so to_csv cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)

    #write to the file, index=False will suppress the index column 0,1,2...
    df_submission.to_csv(submission_file_path, index=False)
    

In [120]:
#Write the Kaggle submission file from the decision tree model's predictions
get_submission_file(model_decision_tree, 'titanic_dtree_kaggle_sub.csv')

## Train a RANDOM FOREST model on the training data. Use RandomForestClassifier( ) from scikit-learn library 

In [138]:
from sklearn.ensemble import RandomForestClassifier

In [139]:
#Instantiate the random forest classifier; random_state=0 fixes the seed handed to the estimator
model_random_forest = RandomForestClassifier(random_state = 0)

In [140]:
#Fit the random forest on the training split
model_random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [141]:
#Predict labels for the held-out split
model_random_forest.predict(X_test)

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1])

In [142]:
#Score the random forest on the data it was trained on
model_random_forest.score(X_train, y_train)

0.9662921348314607

In [143]:
#Score the random forest on the held-out split; in the recorded run this is noticeably
#lower than the training score
model_random_forest.score(X_test, y_test)

0.8324022346368715

### Kaggle Submission for Titanic with Random Forest

In [144]:
#define kaggle submission function
#NOTE: get_submission_file is also defined earlier in this notebook; this cell re-defines
#(shadows) it. Defining the helper once would be enough.
def get_submission_file(model, filename, test_data=None, output_dir=None):
    """Predict on the Kaggle test features and write a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``; it receives the test features
        as a float matrix.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    test_data : pandas.DataFrame, optional
        Feature dataframe whose index supplies the PassengerId column.
        Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory for the submission file. Defaults to ``../data/external``.

    Returns
    -------
    None
        The CSV is written with the columns PassengerId and Survived.
    """
    if test_data is None:
        test_data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')

    #convert to matrix; DataFrame.as_matrix() was removed in pandas 1.0, .values works everywhere
    test_X = test_data.values.astype('float')

    #make predictions
    predictions = model.predict(test_X)

    #submission dataframe; columns= pins the column order regardless of pandas' dict handling
    df_submission = pd.DataFrame(
        {'PassengerId': test_data.index, 'Survived': predictions},
        columns=['PassengerId', 'Survived'],
    )

    #submission file; create the target folder first so to_csv cannot fail on a missing directory
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)

    #write to the file, index=False will suppress the index column 0,1,2...
    df_submission.to_csv(submission_file_path, index=False)
    

In [145]:
#Write the Kaggle submission file from the random forest model's predictions
get_submission_file(model_random_forest, 'titanic_rforest_kaggle_sub.csv')