In [1]:
import os
import pandas as pd
import numpy as np

### import data

In [2]:
# Locate the processed data directory one level above the notebook directory
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

# Load both splits, indexing each frame by PassengerId
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [3]:
# Inspect column dtypes and non-null counts of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

### Data preparation

In [4]:
# Feature matrix: every column from 'Age' onward, cast to float
X = train_df.loc[:, 'Age':].values.astype('float')
# Target vector: Series.values is already a 1-D array, so no ravel() is needed
# (Series.ravel() is deprecated in pandas >= 2.2)
y = train_df['Survived'].values
print(X.shape, y.shape)

(891, 32) (891,)


In [5]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Shapes of the feature matrices, then of the label vectors
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(712, 32) (179, 32)
(712,) (179,)


In [6]:
# Mean of the Survived labels in each split (both splits should look alike)
train_survival_rate = np.mean(y_train)
test_survival_rate = np.mean(y_test)
print('mean survival in train : {0:.3f}'.format(train_survival_rate))
print('mean survival in test : {0:.3f}'.format(test_survival_rate))

mean survival in train : 0.383
mean survival in test : 0.385


### Build a baseline model

In [7]:
# Record the installed scikit-learn version alongside the results
import sklearn
sklearn.__version__

'0.20.2'

In [8]:
from sklearn.dummy import DummyClassifier

In [9]:
# create baseline model: the 'most_frequent' strategy always predicts the
# majority class of the training labels, ignoring the features
baseline_model = DummyClassifier(strategy='most_frequent', random_state=0)

In [10]:
# train the model on the training split only
baseline_model.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [11]:
# check score on the held-out split (for a classifier, .score() is the mean accuracy)
baseline_score = baseline_model.score(X_test, y_test)
print('The baseline score is: {0:.2f}'.format(baseline_score))

The baseline score is: 0.61


In [12]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [13]:
# baseline model accuracy, computed explicitly from the held-out predictions
baseline_predictions = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print('The baseline accuracy is: {0:.2f}'.format(baseline_accuracy))

The baseline accuracy is: 0.61


In [14]:
# confusion matrix (scikit-learn layout: rows are true labels, columns are predicted labels)
baseline_confusion_matrix = confusion_matrix(y_test, baseline_model.predict(X_test))
print('The baseline confusion matrix is: \n{0}'.format(baseline_confusion_matrix))

The baseline confusion matrix is: 
[[110   0]
 [ 69   0]]


In [None]:
# Precision and recall score
# NOTE(review): the baseline confusion matrix shows no predicted positives, so
# precision is ill-defined here; scikit-learn is expected to warn and report 0.
# Confirm when this cell is actually executed.
baseline_test_predictions = baseline_model.predict(X_test)
baseline_precision = precision_score(y_test, baseline_test_predictions)
baseline_recall = recall_score(y_test, baseline_test_predictions)
print('The baseline precision score is: {0:.2f}'.format(baseline_precision))
print('The baseline recall score is: {0:.2f}'.format(baseline_recall))

### Kaggle submission

In [16]:
# Convert the test frame to a float matrix (test_df has no Survived column to strip)
test_X = test_df.values.astype('float')

In [17]:
# get test predictions from the baseline model
test_predictions = baseline_model.predict(test_X)

In [18]:
# Assemble the submission frame: one row per test passenger with its predicted label
submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': test_predictions})
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [19]:
# get path and write result to csv (under ../data/external)
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

# index=False: PassengerId is already a regular column, so skip the positional index
submission_df.to_csv(submission_file_path, index=False)

In [20]:
# Define a function to get submission file
def get_submission_file(model, filename, scaler=None):
    """Predict on the Kaggle test set and write a submission CSV.

    Reads the notebook-level ``test_df`` and writes the result to
    ``../data/external/<filename>``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    filename : str
        Name of the CSV file to create in the submission directory.
    scaler : fitted transformer, optional
        Object exposing ``transform``. When given, the test features are
        transformed with it before prediction; pass the scaler the model was
        trained with. Leave as ``None`` for models trained on unscaled features.
    """
    # Convert to matrix
    test_X = test_df.values.astype('float')
    # Apply the same feature scaling the model saw during training, if any
    if scaler is not None:
        test_X = scaler.transform(test_X)
    # get test predictions
    test_predictions = model.predict(test_X)
    submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': test_predictions})
    # get path and write result to csv
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)

    submission_df.to_csv(submission_file_path, index=False)

In [21]:
# Regenerate the baseline submission through the helper (rewrites ../data/external/01_dummy.csv)
get_submission_file(baseline_model, '01_dummy.csv')

### Building a logistic regression model

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
# Logistic regression with default hyperparameters, liblinear solver and a fixed seed
model_lr_1 = LogisticRegression(random_state=0, solver='liblinear')

In [24]:
# train model on the (unscaled) training split
model_lr_1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
# check score on the held-out split (mean accuracy)
model_lr_1_score = model_lr_1.score(X_test, y_test)
print('The model_lr_1 vrsion 1 score is: {0:.2f}'.format(model_lr_1_score))

The model_lr_1 vrsion 1 score is: 0.83


In [26]:
# performance metrics for model_lr_1 on the held-out split
# predict once and reuse the result for every metric
model_lr_1_predictions = model_lr_1.predict(X_test)
# model accuracy
print('The model_lr_1 vrsion 1 accuracy is: {0:.2f}'.format(accuracy_score(y_test, model_lr_1_predictions)))
# confusion matrix
print('The model_lr_1 vrsion 1 confusion matrix is: \n{0}'.format(confusion_matrix(y_test, model_lr_1_predictions)))
# Precision and recall score
# (these labels previously said "baseline" although the values belong to model_lr_1)
print('The model_lr_1 precision score is: {0:.2f}'.format(precision_score(y_test, model_lr_1_predictions)))
print('The model_lr_1 recall score is: {0:.2f}'.format(recall_score(y_test, model_lr_1_predictions)))

The model_lr_1 vrsion 1 accuracy is: 0.83
The model_lr_1 vrsion 1 confusion matrix is: 
[[95 15]
 [15 54]]
The baseline precision score is: 0.78
The baseline recall score is: 0.78


In [27]:
# model coefficients: one learned weight per input feature
model_lr_1.coef_

array([[-0.02827099,  0.00455908, -0.5037446 ,  0.67717745, -0.80602385,
         0.12651351, -0.17332933, -0.39103988,  0.52416029,  1.09828192,
         0.40381112, -0.18652561, -0.30075444,  0.96237481,  0.48458056,
        -0.34583779,  0.28247594,  1.22342274,  0.57560209, -1.44467049,
         1.04906341, -0.11269531, -0.47208079,  0.36744371,  0.73367388,
         0.16170627,  0.24113193,  0.28229584,  0.41598355,  0.49150916,
         0.46115935,  0.14844908]])

### Kaggle submission

In [28]:
# Write the Kaggle submission for the first logistic regression model
get_submission_file(model_lr_1, '01_lr_1.csv')

### Model optimization
- Overfitting and underfitting.
- Regularization.
- Hyperparameter optimization using grid search and k-fold cross-validation.
- Feature normalization.


In [29]:
# Fresh, unfitted estimator to hand to the grid search
model_lr_2 = LogisticRegression(random_state=0, solver='liblinear')

In [30]:
# Hyperparameter optimization using a grid search
from sklearn.model_selection import GridSearchCV
# Candidate values: inverse regularization strength C and the penalty type
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty':['l1', 'l2']}

In [31]:
# Exhaustive search over `parameters`, scored with 3-fold cross-validation
clf = GridSearchCV(model_lr_2, param_grid=parameters, cv=3)

In [32]:
# Run the grid search on the training split
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best score using gridsearch : {0:.2f}".format(clf.best_score_))

Best score using gridsearch : 0.83


In [34]:
# Parameter combination that achieved best_score_
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [35]:
# test on the held-out split; the search was built with refit=True, so
# clf.score() uses the estimator refit with the best parameters
print("Test score for best model: {0:.2f}".format(clf.score(X_test, y_test)))

Test score for best model: 0.83


In [36]:
# make a file to submit on kaggle using the tuned (grid-searched) model
get_submission_file(clf, '01_lr_2.csv')

### Feature Normalization and Standardization

In [37]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### feature normalization

In [38]:
# Fit the min-max scaler on the training split only, then apply the same
# fitted transform to the held-out split so no test information leaks in
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
# Sanity check: the first scaled training feature should span exactly [0, 1]
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

### Create model with standardization

In [40]:
# Train on the min-max scaled features computed above. The fitted `scaler` is
# persisted next to this model and applied before prediction, so the model
# must be trained on scaled inputs as well (it was previously fit on X_train).
model_lr_3 = LogisticRegression(random_state=0, solver='liblinear')
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty':['l1', 'l2']}
clf2 = GridSearchCV(model_lr_3, param_grid=parameters, cv=3)
clf2.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [41]:
print("Best score using gridsearch : {0:.2f}".format(clf2.best_score_))

Best score using gridsearch : 0.83


In [42]:
# make a file to submit on kaggle
# clf2 expects scaled features, so transform the Kaggle test set with the same fitted scaler
test_X_scaled = scaler.transform(test_df.values.astype('float'))
submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': clf2.predict(test_X_scaled)})
submission_df.to_csv(os.path.join(os.path.pardir, 'data', 'external', '01_lr_4.csv'), index=False)

### Model Persistence

In [43]:
import pickle

In [44]:
# Destination paths for the pickled model and scaler (../models)
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [45]:
# Persist the fitted grid search and the fitted scaler.
# The `with` blocks close each file even if pickling raises.
with open(model_file_path, 'wb') as model_file_pickle:
    pickle.dump(clf2, model_file_pickle)

with open(scaler_file_path, 'wb') as scaler_file_pickle:
    pickle.dump(scaler, scaler_file_pickle)

### Load the pickled model and scaler

In [46]:
# Read the persisted model and scaler back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open(model_file_path, 'rb') as model_file_pickle:
    clf_loaded = pickle.load(model_file_pickle)

with open(scaler_file_path, 'rb') as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [47]:
# Inspect the reloaded grid-search object
clf_loaded

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
# Inspect the reloaded scaler
scaler_loaded

MinMaxScaler(copy=True, feature_range=(0, 1))

In [49]:
# The reloaded object still carries the fitted search results
print("Best score using gridsearch : {0:.2f}".format(clf_loaded.best_score_))

Best score using gridsearch : 0.83


In [50]:
# Target path of the standalone script written by the %%writefile cell
submit_load_data_script_file = os.path.join(os.path.pardir, 'src', 'models', 'load_submit_data.py')

In [51]:
%%writefile $submit_load_data_script_file

import os, pickle
import pandas as pd
import numpy as np

def load_model_scaler():
    """Load the pickled model and scaler from the ``models`` directory.

    Both paths are relative to the current working directory.

    Returns
    -------
    tuple
        ``(clf_loaded, scaler_loaded)``: the object unpickled from
        ``models/lr_model.pkl`` followed by the object unpickled from
        ``models/lr_scaler.pkl``.

    Raises
    ------
    OSError
        If either file cannot be opened (e.g. ``FileNotFoundError``).
    """
    model_file_path = os.path.join('models', 'lr_model.pkl')
    scaler_file_path = os.path.join('models', 'lr_scaler.pkl')

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The `with` blocks close each file even if unpickling raises.
    with open(model_file_path, 'rb') as model_file_pickle:
        clf_loaded = pickle.load(model_file_pickle)

    with open(scaler_file_path, 'rb') as scaler_file_pickle:
        scaler_loaded = pickle.load(scaler_file_pickle)

    return (clf_loaded, scaler_loaded)


# Define a function to get submission file
def get_submission_file(model, filename, scaler):
    """Scale the processed test set, predict with ``model`` and write a submission CSV.

    Reads ``data/processed/test.csv`` and writes ``data/external/<filename>``;
    both paths are relative to the current working directory.

    Parameters
    ----------
    model : object exposing ``predict``
    filename : str
        Name of the CSV file to create.
    scaler : object exposing ``transform``
        Applied to the test features before prediction.
    """
    # Load the processed test features, indexed by passenger id
    test_df = pd.read_csv(
        os.path.join('data', 'processed', 'test.csv'),
        index_col='PassengerId',
    )

    # Float feature matrix -> scaled matrix -> predicted labels
    features = test_df.values.astype('float')
    predictions = model.predict(scaler.transform(features))

    # One row per passenger: id plus predicted label
    submission_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

    # Write without the frame's positional index
    submission_df.to_csv(os.path.join('data', 'external', filename), index=False)


Overwriting ../src/models/load_submit_data.py
