In [49]:
import pandas as pd
import os
import numpy as np

### Import data

In [50]:
# set the path of the processed data
processed_data_path = os.path.join(os.path.pardir, "data", "processed")
train_file_path = os.path.join(processed_data_path, "train.csv")
test_file_path = os.path.join(processed_data_path, "test.csv")

In [51]:
train_df = pd.read_csv(train_file_path, index_col="PassengerId")
test_df = pd.read_csv(test_file_path, index_col="PassengerId")

In [52]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
isMother              891 non-null int64
isMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [53]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
isMother              418 non-null int64
isMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

### Data Preperation

In [76]:
X = train_df.loc[: "Age"].as_matrix().astype("float")
y = train_df["Survived"].ravel()

  """Entry point for launching an IPython kernel.


In [77]:
print(X.shape)
print(y.shape)

(891, 33)
(891,)


In [78]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(712, 33)
(712,)
(179, 33)
(179,)


In [57]:
# average survival in tran and test
print(np.mean(y_train))
print(np.mean(y_test))

0.38342696629213485
0.3854748603351955


### Check Scikit-learn Version

In [58]:
import sklearn

In [59]:
sklearn.__version__

'0.19.1'

### Baseline Model

In [60]:
# import function
from sklearn.dummy import DummyClassifier

In [61]:
# create model
model_dummy = DummyClassifier(strategy="most_frequent", random_state=0)

In [62]:
# train model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [63]:
# baseline model score
print(model_dummy.score(X_test, y_test))

0.6145251396648045


In [64]:
# preformance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [65]:
# baseline accuracy
print("accuracy for baseline model: {0:.2f}".format(accuracy_score(y_test, model_dummy.predict(X_test))))

accuracy for baseline model: 0.61


In [66]:
# comfusion matrix
print("confusion matrix for baseline model is \n {0}".format(confusion_matrix(y_test, model_dummy.predict(X_test))))

confusion matrix for baseline model is 
 [[110   0]
 [ 69   0]]


In [67]:
def get_subbmission_file(model, filename):
    #converting to the matrix
    test_X = test_df.as_matrix().astype("float")
    # make prediction
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame({"PassengerId": test_df.index, "Survived": predictions})
    # submission file
    submission_data_path = os.path.join(os.path.pardir, "data", "external")
    submission_file_path = os.path.join(submission_data_path, filename)
    # write to the file
    df_submission.to_csv(submission_file_path, index=False)
    

In [68]:
# get submission file
get_subbmission_file(model_dummy, "01_dummy.csv")

  This is separate from the ipykernel package so we can avoid doing imports until


### Logistic Regression Model

In [69]:
# import function
from sklearn.linear_model import LogisticRegression


In [70]:
# create model
model_lr_1 = LogisticRegression(random_state=0)


In [71]:
# train_model
model_lr_1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [72]:
# evaluate model
print("score for logistic regression model - version 1 : {0:.0f}".format(model_lr_1.score(X_test, y_test)))

score for logistic regression model - version 1 : 1


In [73]:
# performance metrics
print("Accuracy: {0}".format(accuracy_score(y_test, model_lr_1.predict(X_test))))
print("Confusion matrix: \n{0}".format(confusion_matrix(y_test, model_lr_1.predict(X_test))))

Accuracy: 1.0
Confusion matrix: 
[[110   0]
 [  0  69]]


In [74]:
model_lr_1.coef_

array([[ 6.98096500e+00, -2.86666415e-02,  1.18177538e-03,
        -3.13515215e-01,  2.68581406e-01,  0.00000000e+00,
        -1.57167624e-02, -4.06334087e-02, -2.05944293e-01,
         8.21729917e-02,  1.88855285e-01,  1.61111959e-02,
        -6.75672410e-02, -4.78026473e-01,  4.70837195e-02,
        -8.19326936e-02, -4.85899732e-01,  2.46371301e-02,
         5.06261204e-02,  1.47628407e-01, -1.05682059e+00,
         4.68977665e-01, -7.44432716e-02, -8.13541658e-02,
        -2.98045543e-01, -1.70765023e-01, -9.98328064e-02,
         4.78946660e-02, -1.08677019e-01, -8.78914705e-02,
        -3.24180216e-01, -3.33023799e-01, -1.87724907e-01]])

### Second Kaggle Submission

In [79]:
get_subbmission_file(model_lr_1, "02_lr.csv")

  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: X has 32 features per sample; expecting 33