In [1]:

import pandas as pd
import os
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas_profiling

### Baseline Model

In [2]:
# Locations of the processed train/test CSV files, resolved relative to the
# parent of the current working directory.
processed_data_path = os.path.join(os.path.pardir, "data", "processed")
train_file_path, test_file_path = [
    os.path.join(processed_data_path, csv_name)
    for csv_name in ("train.csv", "test.csv")
]

In [3]:
# Load both processed splits; PassengerId becomes the row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (train_file_path, test_file_path)
)

## Data preparation

In [4]:
# Feature matrix: every column from "Age" onward, cast to float.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and later
# removed from pandas.
prepped_train_data = train_df.loc[:, "Age":].values.astype("float")
# Label vector: the "Survived" column as a flat 1-D array. Despite the
# variable name, this holds the training labels, not test data.
prepped_train_test_data = train_df["Survived"].values.ravel()

  """Entry point for launching an IPython kernel.


In [5]:
# Sanity check: feature-matrix shape on the first line, label-vector shape on the second.
print(prepped_train_data.shape, prepped_train_test_data.shape, sep="\n")

(891, 32)
(891,)


In [6]:


# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# Naming: prepped_* are feature matrices, test_* are the matching label vectors.
(prepped_train, prepped_test, test_train, test_test) = train_test_split(
    prepped_train_data,
    prepped_train_test_data,
    test_size=0.2,
    random_state=0,
)
# Shapes are printed one per line: train features, train labels, held-out features, held-out labels.
print(
    prepped_train.shape,
    test_train.shape,
    prepped_test.shape,
    test_test.shape,
    sep="\n",
)

(712, 32)
(712,)
(179, 32)
(179,)


In [7]:
# Baseline classifier configured with the "most_frequent" strategy, used as a
# reference point for the real models below.
model_dummy = DummyClassifier(strategy="most_frequent", random_state=0)

In [8]:
# Fit the baseline on the training features (prepped_train) and training labels (test_train).
model_dummy.fit(prepped_train, test_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [9]:
# Score the baseline on the held-out features and labels.
model_dummy.score(prepped_test, test_test)

0.6145251396648045

In [10]:
# Confusion matrix for the baseline on the held-out split
# (true labels are passed first, predictions second).
confusion_matrix(test_test, model_dummy.predict(prepped_test))

array([[110,   0],
       [ 69,   0]])

In [11]:
# Logistic regression with default regularization settings.
# The solver is pinned to "liblinear": it is the solver shown in the recorded
# model repr, it supports both "l1" and "l2" penalties (needed when this
# estimator is reused in the grid search), and newer scikit-learn releases
# default to "lbfgs", which does not support "l1".
model_logreg_1 = LogisticRegression(random_state=0, solver="liblinear")

In [12]:
# Fit the logistic regression on the training features and training labels.
model_logreg_1.fit(prepped_train, test_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Score the logistic regression on the held-out split, for comparison with the baseline score above.
model_logreg_1.score(prepped_test, test_test)

0.8268156424581006

In [14]:
# Confusion matrix for the logistic regression on the held-out split
# (true labels are passed first, predictions second).
confusion_matrix(test_test, model_logreg_1.predict(prepped_test))

array([[95, 15],
       [16, 53]])

In [15]:
def get_subbmission_file(model, filename, data=None, output_dir=None):
    """Predict survival for a feature frame and write a submission CSV.

    The CSV has two columns, "PassengerId" (taken from the frame's index) and
    "Survived" (the model's predictions), and is written without a row index.

    Args:
        model: Fitted estimator exposing ``predict``.
        filename: Name of the CSV file to create inside ``output_dir``.
        data: Feature DataFrame to predict on. Defaults to the notebook-level
            ``test_df`` when omitted, which matches the original behavior.
        output_dir: Directory to write into. Defaults to ``../data/external``;
            it is created if it does not exist yet.

    Returns:
        None. The result is the file written to disk.
    """
    features_df = test_df if data is None else data
    # `.values` replaces `DataFrame.as_matrix()`, which was removed from pandas.
    test_X = features_df.values.astype("float")
    # make prediction
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame(
        {"PassengerId": features_df.index, "Survived": predictions}
    )
    # submission file location
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, "data", "external")
    # Avoid failing on a fresh checkout where the folder is missing.
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)
    # write to the file
    df_submission.to_csv(submission_file_path, index=False)

In [16]:
# Write the predictions of the first logistic-regression model to 02_lr.csv.
get_subbmission_file(model_logreg_1, "02_lr.csv")

  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
# Inspect the fitted coefficients of the logistic-regression model.
model_logreg_1.coef_

array([[-0.02923553,  0.00451899, -0.50687734,  0.66961288,  0.        ,
         0.10012052, -0.18832707, -0.41398124,  0.51092837,  1.08121044,
         0.38318919, -0.19260299, -0.33615771,  0.89986506,  0.43281204,
        -0.38829759,  0.37731842,  0.91161511,  0.98600282, -1.81358953,
         1.46349644, -0.3356932 , -0.64477056,  0.1166403 ,  0.19947119,
         0.24640572,  0.3818623 ,  0.44346438,  0.40641003,  0.0945051 ,
         0.30654211,  0.63783741]])

In [18]:
# Search grid: five values of C crossed with both penalty types.
parameters = dict(
    C=[1.0, 10.0, 50.0, 100.0, 1000.0],
    penalty=["l1", "l2"],
)
# Grid search with 3-fold cross-validation over the logistic-regression estimator.
clf = GridSearchCV(estimator=model_logreg_1, param_grid=parameters, cv=3)

In [19]:
# Run the grid search on the (unscaled) training features and labels.
clf.fit(prepped_train, test_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
# Parameter combination selected by the grid search.
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [21]:
# Cross-validated score of the selected parameter combination (computed on training folds, not the held-out split).
clf.best_score_

0.8300561797752809

In [22]:
# Score the grid-searched model on the held-out split.
clf.score(prepped_test, test_test)

0.8268156424581006

In [23]:
# Write the grid-searched model's predictions to 03_lr.csv.
get_subbmission_file(clf, "03_lr.csv")

  This is separate from the ipykernel package so we can avoid doing imports until


### Feature Normalization and Standardization

#### Feature Normalization 

In [24]:
# Feature normalization: fit a MinMaxScaler on the training features and transform them in one step.
# NOTE(review): the name `scaler` is rebound to a StandardScaler in the
# standardization section further down, so this object is only valid until then.
scaler = MinMaxScaler()
x_trained_scaled = scaler.fit_transform(prepped_train)

In [25]:
# Check the minimum and maximum of the first scaled feature column.
x_trained_scaled[:,0].min(), x_trained_scaled[:,0].max()

(0.0, 1.0)

In [26]:
# Apply the already-fitted scaler to the held-out features (transform only, no refit).
x_test_scaled = scaler.transform(prepped_test)

#### Feature Standardization

In [28]:
# Feature standardization: fit a StandardScaler on the training features,
# then reuse the fitted scaler for the held-out features.
# NOTE(review): this rebinds `scaler` and `x_test_scaled`, replacing the
# min-max versions created in the normalization cells above.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(prepped_train)
x_test_scaled = scaler.transform(prepped_test)

#### Create model after standardization

In [29]:
# Base model. The solver is pinned to "liblinear" because the grid below
# includes the "l1" penalty: liblinear supports both "l1" and "l2" and is the
# solver shown in the recorded output, whereas newer scikit-learn releases
# default to "lbfgs", which does not support "l1".
model_lr = LogisticRegression(random_state=0, solver="liblinear")
# Search grid: four values of C crossed with both penalty types.
parameteres = {"C": [1.0, 10.0, 100.0, 1000.0], "penalty": ["l1", "l2"]}
# 3-fold grid search on the standardized training features.
# Note: this rebinds `clf`, replacing the grid search fitted on unscaled features.
clf = GridSearchCV(model_lr, param_grid=parameteres, cv=3)
clf.fit(x_train_scaled, test_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
# Cross-validated score of the best candidate from the grid search on standardized features.
clf.best_score_

0.8146067415730337

## Basic Profiling

In [31]:
# Build a profiling report for the whole training frame, including the Survived column.
pandas_profiling.ProfileReport(train_df)

0,1
Number of variables,34
Number of observations,891
Total Missing (%),0.0%
Total size in memory,236.8 KiB
Average record size in memory,272.1 B

0,1
Numeric,4
Categorical,0
Boolean,29
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,88
Unique (%),9.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,29.304
Minimum,0.42
Maximum,80
Zeros (%),0.0%

0,1
Minimum,0.42
5-th percentile,5.0
Q1,22.0
Median,29.0
Q3,35.0
95-th percentile,54.0
Maximum,80.0
Range,79.58
Interquartile range,13.0

0,1
Standard deviation,13.243
Coef of variation,0.45191
Kurtosis,0.81209
Mean,29.304
MAD,9.6321
Skewness,0.46587
Sum,26110
Variance,175.37
Memory size,7.0 KiB

Value,Count,Frequency (%),Unnamed: 3
29.0,139,15.6%,
22.0,63,7.1%,
35.0,35,3.9%,
24.0,30,3.4%,
18.0,26,2.9%,
28.0,25,2.8%,
30.0,25,2.8%,
19.0,25,2.8%,
21.0,24,2.7%,
25.0,23,2.6%,

Value,Count,Frequency (%),Unnamed: 3
0.42,1,0.1%,
0.67,1,0.1%,
0.75,2,0.2%,
0.83,2,0.2%,
0.92,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
70.0,2,0.2%,
70.5,1,0.1%,
71.0,2,0.2%,
74.0,1,0.1%,
80.0,1,0.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.86869

0,1
1,774
0,117

Value,Count,Frequency (%),Unnamed: 3
1,774,86.9%,
0,117,13.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.13131

0,1
0,774
1,117

Value,Count,Frequency (%),Unnamed: 3
0,774,86.9%,
1,117,13.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.016835

0,1
0,876
1,15

Value,Count,Frequency (%),Unnamed: 3
0,876,98.3%,
1,15,1.7%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.05275

0,1
0,844
1,47

Value,Count,Frequency (%),Unnamed: 3
0,844,94.7%,
1,47,5.3%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.066218

0,1
0,832
1,59

Value,Count,Frequency (%),Unnamed: 3
0,832,93.4%,
1,59,6.6%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.037037

0,1
0,858
1,33

Value,Count,Frequency (%),Unnamed: 3
0,858,96.3%,
1,33,3.7%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.035915

0,1
0,859
1,32

Value,Count,Frequency (%),Unnamed: 3
0,859,96.4%,
1,32,3.6%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.01459

0,1
0,878
1,13

Value,Count,Frequency (%),Unnamed: 3
0,878,98.5%,
1,13,1.5%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.0044893

0,1
0,887
1,4

Value,Count,Frequency (%),Unnamed: 3
0,887,99.6%,
1,4,0.4%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.77217

0,1
1,688
0,203

Value,Count,Frequency (%),Unnamed: 3
1,688,77.2%,
0,203,22.8%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.1908

0,1
0,721
1,170

Value,Count,Frequency (%),Unnamed: 3
0,721,80.9%,
1,170,19.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.08642

0,1
0,814
1,77

Value,Count,Frequency (%),Unnamed: 3
0,814,91.4%,
1,77,8.6%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.72278

0,1
1,644
0,247

Value,Count,Frequency (%),Unnamed: 3
1,644,72.3%,
0,247,27.7%,

0,1
Distinct count,9
Unique (%),1.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.9046
Minimum,1
Maximum,11
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,1
Median,1
Q3,2
95-th percentile,6
Maximum,11
Range,10
Interquartile range,1

0,1
Standard deviation,1.6135
Coef of variation,0.84714
Kurtosis,9.1597
Mean,1.9046
MAD,1.0904
Skewness,2.7274
Sum,1697
Variance,2.6032
Memory size,7.0 KiB

Value,Count,Frequency (%),Unnamed: 3
1,537,60.3%,
2,161,18.1%,
3,102,11.4%,
4,29,3.3%,
6,22,2.5%,
5,15,1.7%,
7,12,1.3%,
11,7,0.8%,
8,6,0.7%,

Value,Count,Frequency (%),Unnamed: 3
1,537,60.3%,
2,161,18.1%,
3,102,11.4%,
4,29,3.3%,
5,15,1.7%,

Value,Count,Frequency (%),Unnamed: 3
5,15,1.7%,
6,22,2.5%,
7,12,1.3%,
8,6,0.7%,
11,7,0.8%,

0,1
Distinct count,248
Unique (%),27.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,32.204
Minimum,0
Maximum,512.33
Zeros (%),1.7%

0,1
Minimum,0.0
5-th percentile,7.225
Q1,7.9104
Median,14.454
Q3,31.0
95-th percentile,112.08
Maximum,512.33
Range,512.33
Interquartile range,23.09

0,1
Standard deviation,49.693
Coef of variation,1.5431
Kurtosis,33.398
Mean,32.204
MAD,28.164
Skewness,4.7873
Sum,28694
Variance,2469.4
Memory size,7.0 KiB

Value,Count,Frequency (%),Unnamed: 3
8.05,43,4.8%,
13.0,42,4.7%,
7.8958,38,4.3%,
7.75,34,3.8%,
26.0,31,3.5%,
10.5,24,2.7%,
7.925,18,2.0%,
7.775,16,1.8%,
26.55,15,1.7%,
0.0,15,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0.0,15,1.7%,
4.0125,1,0.1%,
5.0,1,0.1%,
6.2375,1,0.1%,
6.4375,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
227.525,4,0.4%,
247.5208,2,0.2%,
262.375,2,0.2%,
263.0,4,0.4%,
512.3292,3,0.3%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.25701

0,1
0,662
1,229

Value,Count,Frequency (%),Unnamed: 3
0,662,74.3%,
1,229,25.7%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.2514

0,1
0,667
1,224

Value,Count,Frequency (%),Unnamed: 3
0,667,74.9%,
1,224,25.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.2413

0,1
0,676
1,215

Value,Count,Frequency (%),Unnamed: 3
0,676,75.9%,
1,215,24.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.25028

0,1
0,668
1,223

Value,Count,Frequency (%),Unnamed: 3
0,668,75.0%,
1,223,25.0%,

0,1
Distinct count,891
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,446
Minimum,1
Maximum,891
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,45.5
Q1,223.5
Median,446.0
Q3,668.5
95-th percentile,846.5
Maximum,891.0
Range,890.0
Interquartile range,445.0

0,1
Standard deviation,257.35
Coef of variation,0.57703
Kurtosis,-1.2
Mean,446
MAD,222.75
Skewness,0
Sum,397386
Variance,66231
Memory size,7.0 KiB

Value,Count,Frequency (%),Unnamed: 3
891,1,0.1%,
293,1,0.1%,
304,1,0.1%,
303,1,0.1%,
302,1,0.1%,
301,1,0.1%,
300,1,0.1%,
299,1,0.1%,
298,1,0.1%,
297,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.1%,
2,1,0.1%,
3,1,0.1%,
4,1,0.1%,
5,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
887,1,0.1%,
888,1,0.1%,
889,1,0.1%,
890,1,0.1%,
891,1,0.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24242

0,1
0,675
1,216

Value,Count,Frequency (%),Unnamed: 3
0,675,75.8%,
1,216,24.2%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.20651

0,1
0,707
1,184

Value,Count,Frequency (%),Unnamed: 3
0,707,79.3%,
1,184,20.7%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.55107

0,1
1,491
0,400

Value,Count,Frequency (%),Unnamed: 3
1,491,55.1%,
0,400,44.9%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.38384

0,1
0,549
1,342

Value,Count,Frequency (%),Unnamed: 3
0,549,61.6%,
1,342,38.4%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.0022447

0,1
0,889
1,2

Value,Count,Frequency (%),Unnamed: 3
0,889,99.8%,
1,2,0.2%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.044893

0,1
0,851
1,40

Value,Count,Frequency (%),Unnamed: 3
0,851,95.5%,
1,40,4.5%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.20651

0,1
0,707
1,184

Value,Count,Frequency (%),Unnamed: 3
0,707,79.3%,
1,184,20.7%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.58025

0,1
1,517
0,374

Value,Count,Frequency (%),Unnamed: 3
1,517,58.0%,
0,374,42.0%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.14254

0,1
0,764
1,127

Value,Count,Frequency (%),Unnamed: 3
0,764,85.7%,
1,127,14.3%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.013468

0,1
0,879
1,12

Value,Count,Frequency (%),Unnamed: 3
0,879,98.7%,
1,12,1.3%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.010101

0,1
0,882
1,9

Value,Count,Frequency (%),Unnamed: 3
0,882,99.0%,
1,9,1.0%,

0,1
Constant value,0

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.061728

0,1
0,836
1,55

Value,Count,Frequency (%),Unnamed: 3
0,836,93.8%,
1,55,6.2%,

Unnamed: 0_level_0,Survived,Age,Fare,FamilySize,isMother,isMale,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Z,Pclass_1,Pclass_2,Pclass_3,Title_Lady,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Sir,Fare_Bin_very_low,Fare_Bin_low,Fare_Bin_high,Fare_Bin_very_high,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
1,0,22.0,7.25,2,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0
2,1,38.0,71.2833,2,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0
3,1,26.0,7.925,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0
4,1,35.0,53.1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0
5,0,35.0,8.05,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0


## Model Persistence

In [43]:
import pickle

In [44]:
# Pickle destinations for the fitted model and its scaler, both under ../models.
model_file_path, scaler_file_path = (
    os.path.join(os.path.pardir, "models", pickle_name)
    for pickle_name in ("lr_model.pkl", "lr_scaler.pkl")
)

In [45]:
# Open both destination files in binary write mode ("wb").
# The handles stay open across cells: a later cell writes to them and
# another one closes them, so these cells must be run in order.
model_file_pickle = open(model_file_path, "wb")
scaler_file_pickle = open(scaler_file_path, "wb")

In [46]:
# Persist whatever `clf` and `scaler` are bound to when this cell runs.
# NOTE(review): both names are rebound earlier in the notebook; in
# top-to-bottom order they are the grid search fitted on standardized
# features and the StandardScaler. Confirm the run order before relying on the pickles.
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [47]:
# Close both write handles opened earlier so the pickled data is flushed to disk.
model_file_pickle.close()
scaler_file_pickle.close()

### Load the persisted files

In [50]:
# Reload the persisted model and scaler from disk.
# Each `with` block closes its file even if unpickling raises, so no handle
# is leaked on failure.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
with open(model_file_path, "rb") as model_file_pickle:
    clf_loaded = pickle.load(model_file_pickle)

with open(scaler_file_path, "rb") as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [51]:
# Display the reloaded model to confirm it round-tripped through pickle.
clf_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
# Display the reloaded scaler to confirm it round-tripped through pickle.
scaler_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [54]:
# End-to-end check of the persisted artifacts: scale the held-out features with
# the reloaded scaler, then score the reloaded model on them.
# Note the capital X: this is a separate name from `x_test_scaled` used earlier.
X_test_scaled = scaler_loaded.transform(prepped_test)
clf_loaded.score(X_test_scaled, test_test)

0.8435754189944135