In [1]:
# import all the libraries
import pandas as pd
import numpy as np
import os

#### Import Data

In [2]:
# Locate the processed train/test CSV files relative to the parent directory.
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Load both processed splits, using the PassengerId column as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [4]:
# Summarise the training frame: index range, column names, non-null counts and dtypes.
df_train.info()
# 'Survived' is the output (target) column; the remaining columns are the inputs.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null float64
Deck_B                891 non-null float64
Deck_C                891 non-null float64
Deck_D                891 non-null float64
Deck_E                891 non-null float64
Deck_F                891 non-null float64
Deck_G                891 non-null float64
Deck_Z                891 non-null float64
Pclass_1              891 non-null float64
Pclass_2              891 non-null float64
Pclass_3              891 non-null float64
Title_Lady            891 non-null float64
Title_Master          891 non-null float64
Title_Miss            891 non-null float64
Title_Mr              891 non-null float64


In [5]:
# Same summary for the test frame, to compare its columns with the training frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 33 columns):
Survived              418 non-null int64
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null float64
Deck_B                418 non-null float64
Deck_C                418 non-null float64
Deck_D                418 non-null float64
Deck_E                418 non-null float64
Deck_F                418 non-null float64
Deck_G                418 non-null float64
Deck_Z                418 non-null float64
Pclass_1              418 non-null float64
Pclass_2              418 non-null float64
Pclass_3              418 non-null float64
Title_Lady            418 non-null float64
Title_Master          418 non-null float64
Title_Miss            418 non-null float64
Title_Mr              418 non-null float

#### Data preparation

Machine learning algorithms expect numerical arrays.

We will separate the output feature from the input features of the training data.
The output feature is the 'Survived' column.
The input features are all the columns after the 'Survived' column.

In [6]:
# Split the training frame into a float feature matrix and a 1-D target array.
# `.values` replaces `DataFrame.as_matrix()`, which pandas deprecated and later removed.
X = df_train.loc[:, 'Age':].values.astype('float')
# Take the underlying NumPy array first so this does not rely on `Series.ravel()`;
# ravel() then guarantees a one-dimensional array.
y = df_train['Survived'].values.ravel()

In [7]:
# Report the shapes of the feature matrix and the target array.
# Formatting into a single string prints the same text on Python 2 and Python 3.
print('{0} {1}'.format(X.shape, y.shape))

(891, 32) (891,)


In [8]:
!pip install sklearn

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
[33mYou are using pip version 19.0.2, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [10]:
# Divide the labelled data into training and test splits:
# 20% is held out for testing, with a fixed seed so the split is reproducible.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Single-string formatting prints the same text on Python 2 and Python 3.
print('{0} {1}'.format(X_train.shape, y_train.shape))
print('{0} {1}'.format(X_test.shape, y_test.shape))

(712, 32) (712,)
(179, 32) (179,)


In [11]:
# Compare the share of positive labels in both splits to check they are similarly balanced.
print('Mean survival in train: {0:.3f}'.format(np.mean(y_train)))
print('Mean survival in test: {0: .3f}'.format(np.mean(y_test)))

Mean survival in train: 0.383
Mean survival in test:  0.385


#### Check the scikit-learn version: the baseline model uses DummyClassifier, which is available after version 0.19

In [14]:
import sklearn

In [15]:
# Show the installed scikit-learn version.
sklearn.__version__

'0.20.3'

#### Baseline Model

In [16]:
# import function
from sklearn.dummy import DummyClassifier

In [17]:
# Baseline model: uses the 'most_frequent' strategy, with a fixed seed.
model_dummy = DummyClassifier(random_state=0, strategy='most_frequent')

In [19]:
# Train the model.
model_dummy.fit(X_train,y_train) # fit takes two arguments: 1) the input data, 2) the output data

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

#### Pass test data to evaluate the performance
The model will first predict the output for X_test and then compare the predicted output with the original output (y_test).
For a classification problem, this score is called accuracy.

In [21]:
 print 'score for baseline model : {0:.2f}'.format(model_dummy.score(X_test, y_test))

score for baseline model : 0.61


The above score means that, without any machine learning algorithm, if we always predict 0 (did not survive), we get 61% accuracy.

#### Performance Metrics

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

##### The method signature is the same for all of these metrics

first parameter = actual output

second parameter = predicted output

In [23]:
# Accuracy score: actual labels first, predicted labels second.
print('accuracy for baseline model: {0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

accuracy for baseline model: 0.61


In [25]:
# Confusion matrix: actual labels first, predicted labels second.
print('confusion matrix for baseline model: \n{0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

confusion matrix for baseline model: 
[[110   0]
 [ 69   0]]


In [26]:
# Precision and recall scores of the baseline on the held-out split.
# Predict once and reuse the result for both metrics.
baseline_pred = model_dummy.predict(X_test)
print('precision for baseline model: {0:.2f}'.format(precision_score(y_test, baseline_pred)))
print('recall for baseline model: {0:.2f}'.format(recall_score(y_test, baseline_pred)))

precision for baseline model: 0.00
recall for baseline model: 0.00


  'precision', 'predicted', average, warn_for)


### Logistic Regression Model

In [27]:
# import function
from sklearn.linear_model import LogisticRegression

In [28]:
# Create the model.
# The solver is pinned explicitly: leaving it unset relies on a library default that
# scikit-learn 0.20 flags with a FutureWarning and that differs in later releases,
# which would make the results below depend on the installed version.
model_lr_initial = LogisticRegression(solver='liblinear', random_state=0)

In [29]:
# Train the initial logistic regression model on the training split.
model_lr_initial.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
# Evaluate the model on the held-out split.
print("Score for logistic model :{0:.2f}".format(model_lr_initial.score(X_test, y_test)))

Score for logistic model :0.83


#### Performance metrics

In [32]:
# Accuracy of the initial logistic regression model on the held-out split.
print('accuracy for logistic regression - initial version: {0:.2f}'.format(accuracy_score(y_test, model_lr_initial.predict(X_test))))

accuracy for logistic regression - initial version: 0.83


In [35]:
# Confusion matrix for the initial logistic regression model.
print('confusion matrix for logistic regression -initial version: \n {0}'.format(confusion_matrix(y_test, model_lr_initial.predict(X_test))))

confusion matrix for logistic regression -initial version: 
 [[95 15]
 [15 54]]


In [36]:
# Precision of the initial logistic regression model.
print('precision for logistic regression - initial version : {0:.2f}'.format(precision_score(y_test, model_lr_initial.predict(X_test))))

precision for logistic regression - initial version : 0.78


In [38]:
# Recall of the initial logistic regression model.
print('recall for logistic regression {0: .2f}'.format(recall_score(y_test, model_lr_initial.predict(X_test))))

recall for logistic regression  0.78


##### Model coefficients

In [40]:
# Learned model coefficients (also called model weights or parameters).
model_lr_initial.coef_

array([[-0.02842272,  0.00455452, -0.50009091,  0.61781316, -0.8139233 ,
         0.12845079, -0.17281789, -0.39317833,  0.52159977,  1.09941224,
         0.40341217, -0.18345051, -0.30036042,  0.96533486,  0.48256743,
        -0.34483447,  0.28089593,  1.21761327,  0.56363966, -1.44586304,
         1.07245552, -0.11273706, -0.47293647,  0.16255646,  0.24716929,
         0.28009437,  0.4132477 ,  0.49183529,  0.46198829,  0.14924424,
         0.37283517,  0.73023265]])

### Logistic regression model with hyperparameter optimization

In [65]:
# Base logistic regression estimator for the grid search.
# The solver is pinned to liblinear because the grid searches over both the 'l1' and
# 'l2' penalties; the default solver of newer scikit-learn releases does not support
# 'l1', so relying on the default would make the search fail there.
model_lr = LogisticRegression(solver='liblinear', random_state=0)

#### Using the grid search technique for hyperparameter optimization

In [66]:
# importing GridSearchCV
from sklearn.model_selection import GridSearchCV

In [73]:
# Hyperparameter grid: list as many candidate values for C as you like,
# together with the penalty types to try.
parameters = dict(
    C=[1.0, 2.0, 90.0, 200.0, 500, 800.0, 950.0],
    penalty=['l1', 'l2']
)


In [74]:
# Build the grid-search object from the base model, the parameter grid and the
# number of cross-validation folds (cv=3 means 3-fold cross validation).
clf = GridSearchCV(estimator=model_lr, param_grid=parameters, cv=3)


In [75]:
# Train the model: run the grid search on the training split.
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1.0, 2.0, 90.0, 200.0, 500, 800.0, 950.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [76]:
# best_params_ gives the best parameter combination found by the grid search.
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [77]:
# Best score found by the grid search.
print('best score : {0:.2f}'.format(clf.best_score_))

best score : 0.83


In [78]:
# Evaluate the tuned model on the held-out split.
print('evaluate the model: {0:.2f}'.format(clf.score(X_test, y_test)))

evaluate the model: 0.83


#### Feature Normalization and Standardization

In [81]:
# MinMaxScaler for normalization
# StandardScaler function for standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler

##### Feature normalization
Feature normalization means putting all the features in the data on the same scale.
There are two types of scaling:

Scale Type 1: 0 to 1

Scale Type 2: -1 to 1

By default, features are scaled to the range 0 to 1.

In [82]:
# Fit a min-max scaler on the training features, then apply it to those same features.
scalar = MinMaxScaler().fit(X_train)
X_train_scale = scalar.transform(X_train)

In [83]:
# Minimum and maximum of the first feature column after scaling.
first_feature = X_train_scale[:, 0]
(first_feature.min(), first_feature.max())

(0.0, 1.0)

#### We also need to scale the test data, because the model is trained on scaled data

In [84]:
# Normalize the test data with the scaler object created above (transform only).
X_test_scale = scalar.transform(X_test)

##### Feature Standardization
Standardize the features in such a way that all of them have:

* Mean = 0
* Variance = 1

In [85]:
# Scaler object used for feature standardization.
scaler_stand = StandardScaler()

In [86]:
# Standardize the features: fit on the training split only, then transform both splits.
# This reassigns X_train_scale / X_test_scale, replacing the min-max scaled arrays.
scaler_stand.fit(X_train)
X_train_scale, X_test_scale = (
    scaler_stand.transform(split) for split in (X_train, X_test)
)

##### Create model after feature scaling and standardization

In [87]:
# Repeat the grid search, this time on the standardized features.
# The solver is pinned to liblinear because the grid includes the 'l1' penalty; the
# default solver of newer scikit-learn releases does not support 'l1'.
model_lr = LogisticRegression(solver='liblinear', random_state=0)
# List as many candidate values for C as you like, together with the penalty types.
parameters = {'C': [1.0, 2.0, 90.0, 200.0, 500, 800.0, 950.0], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(model_lr, parameters, cv=3)
# NOTE(review): X_train_scale was produced by a scaler fitted on all of X_train, so every
# cross-validation fold's validation part contributed to the scaling statistics.
clf.fit(X_train_scale, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1.0, 2.0, 90.0, 200.0, 500, 800.0, 950.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [89]:
# Best score from the grid search on standardized features.
print('Best score {0:.5f}'.format(clf.best_score_))

Best score 0.81320


In [90]:
# Score on the held-out split, transformed with the same scaler as the training data.
print('Score for logistic regression {0:.5f}'.format(clf.score(X_test_scale, y_test)))

Score for logistic regression 0.84358


#### Normally, for logistic regression, feature standardization does not add much value: the score is more or less the same as the one we obtained above without standardization. But for algorithms like neural networks, it is always suggested to normalize the features before feeding the data to the model.
