In [1]:
#imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

processed_data_path = os.path.join(os.getcwd(), 'data', 'processed')
read_train_path = os.path.join(processed_data_path, 'train.csv')
read_test_path = os.path.join(processed_data_path, 'test.csv')

train_df = pd.read_csv(read_train_path, index_col='PassengerId')
test_df = pd.read_csv(read_test_path, index_col='PassengerId')
df = pd.concat([train_df, test_df], axis=0)

# this takes all rows and all columns except Survived (i.e. from Age onwards) and creates a matrix of float values
X = train_df.loc[:, 'Age':].as_matrix().astype('float')

# this creates a flattened one dimensional array (or vector) of floats from the outputs using a numpy function
y = train_df['Survived'].ravel()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

def get_submission_file(model, filename):
    # converting to the matrix
    test_X = test_df.as_matrix().astype('float')
    # get predictions
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    # submission file
    submission_data_path = os.path.join(os.getcwd(), 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    #write to file
    df_submission.to_csv(submission_file_path, index=False)

(712, 32) (712,)
(179, 32) (179,)


# Model Tuning

## Underfitting vs Overfitting

The model needs to be able to make predictions using inputs it hasn't seen before.

Underfitting is a model that has been unable to learn a pattern even with the training data and makes many misclassifications, resulting in poor performance. The model is too simple and a more complex model needs to be chosen.

However, you have to be careful when chosing a more complext model. If it creates an incredibly complex decision boundary that perfectly separates your classes, it may simply be doing that by memorising the training data and it won't be able to make accurate predictions with new, unseen inputs. This is called overfitting the model because it has poor generalisation.

Generalisation is very important to machine learning, because the model needs to be accurate on unseen data. That means you don't want it to have perfect performance on the training data, because it won't be accurate on unseen data. It's a balancing act!

## Regularisation

This is a way of tackling overfitting by reducing the complexity of a model in order to make it more balanced.

When we created the logicistic regression model, the we set the random_state parameter but didn't touch any of the others, which can all be used to tune the model in various ways. The C parameter is the regularisation parameter. The default is C=1.0.

If you increase the value of C a lot, the model will be more complex and the model will be overfitted. If you decrease the value of C, the model will be simpler and may be underfitted.

Penalty is another term for regularisation, and the default is penalty=l2. The common settings for penalty ate L1 and L2 and these can be changed to over or underfit your model.

The various parameter that can be set in a model to fine-tune it are often called **hyperparameters**.

## Hyperparameter Optimisation

The most popular hyperparameter optimisation technique is called grid search. First create a grid of values for hyperparameters you want to try (for example, if you want to try three different settings for one parameter and two different settings for another, you'd have a 3x2 grid with 6 cells, then evaluate the performance for each combination in a model. You can then select the best combination to use in your model.

To evaluate the performance, you can use the train-test split as before, but for hyperparameter optmimisation there is a technique called **cross-validation** that can be used instead.

## Cross-validation

The data is split into three different parts, instead of two: training, test, cross-validation.

Pass the training data to the model to get the trained model. Then evaluate the performance of the model on the cross-validation dataset.

So for each hyperparameter set, you build a model, train it, and evaluate on the cross-validation data. Then put the score into the grid and find the model with the best score.

Finally, pass the test data to the best model to evaluate the final score for the model. In other words, the training and cross-validation data is used multiple times to get the best model, and the test data is used only once to evaluate the final model.

## K-fold Cross-validation

This is a more advanced method of getting cross-validation data than was described in the previous section, where the split is always the same. 3-Fold cross-validation is a common use.

Split the training data equally into three portions. For ease, we'll call them portions 1, 2, and 3. In the first iteration, portions 2 and 3 are used for training and portion 1 is the cross-validation data. The score is recorded. In the second iteration, portions 1 and 3 are used for training and portion 2 is the cross-validation. The third iteration uses 1 and 2 for training and 3 for validation. 

At the end, you'll have three scores and you'll take the mean of those scores as your overall score for this iteration of the model and its hyperparameters. It's useful to take the standard deviation, too, to make sure there isn't an unacceptable deviation in the scores that might indicate a problem.

# Tuning in practice

## Hyperparameter optimisation

There is a scikit learn function, thankfully.

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
# base model
model_lr = LogisticRegression(random_state=0)

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
# create a dictionary listing out the hyperparameters that will be tried
parameters = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}

# create the grid search, setting cv to 3 performs the 3-fold crossvalidation
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

In [6]:
# now the grid search is set up, the model can be trained with the different hyperparameter combinations
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# now find which was the best combination
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [8]:
print('best score: {0:.2f}'.format(clf.best_score_))

best score: 0.83


This is starting to get close to the maximum we will see from a logistic regression model, so the improvement is only very small over the original logistic regression model from Part 1.

In [22]:
# evaluate model on test data, internally it will use the best model to get the best score
print('score for logistic regression model - version 2: {0:.5f}'.format(clf.score(X_test, y_test)))

score for logistic regression model - version 2: 0.82682


In [10]:
# get submission file
get_submission_file(clf, '03_lr.csv')

# Feature Normalisation and Standardisation

For many ML algorithms, model performance is improved if the features are on the same scale. For the kind of logistic regression model used in the course it may not make much difference, but it's something to explore with other models.

For example, in the Titantic dataset there are Age, Fare, and FamilySize features which all have very different ranges and scales. Ideally, they should all have the same scale.

Scales of 0 to 1 or -1 to +1 are common, but don't use scales with minus values for features with negative values make no sense. So for Age, Fare, and FamilySize, a scale of 0 to 1 makes sense. The idea is to map the data for each feature to a scale like this (use the same scale for all features, obviously).

The other technique is feature standardisation, which is about the distribution of the data in a feature. So for Age, Fare, and FamilySize, they might all have different means and standard distributions. The aim is to get the features to have a mean of 0.0 and a variance of 1.0

Thankfully, scikit-learn also provides functions for doing this!

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

### Normalisation in practice

In [12]:
# feature normalisation
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [13]:
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [14]:
#normalise test data
X_test_scaled = scaler.fit_transform(X_test)

### Standardisation in practice

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Create a model with standardisation and hyperparameter optimisation

In [29]:
# base model
model_lr2 = LogisticRegression(random_state=0)

# create a dictionary listing out the hyperparameters that will be tried
parameters = {'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}

# create the grid search, setting cv to 3 performs the 3-fold crossvalidation
clf2 = GridSearchCV(model_lr2, param_grid=parameters, cv=3)

# now the grid search is set up, the model can be trained with the different hyperparameter combinations
clf2.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
# now find which was the best combination
clf2.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [31]:
print('best score: {0:.6f}'.format(clf2.best_score_))

best score: 0.813202


In [32]:
# evaluate model on test data, internally it will use the best model to get the best score
print('score for logistic regression model - version 3: {0:.5f}'.format(clf2.score(X_test_scaled, y_test)))

score for logistic regression model - version 3: 0.81564


This is not an improvement, so I'm not going to submit this to Kaggle!

# Model persistence

This is the process of saving the trained model to the disk so it can be reloaded and reused without needing to be retrained.

This means you can share the model with others without having to share the training steps and data. You can also use the persistence to create a machine learning API on top of it.

## Pickle!

The pickle library can be used to save the trained model to disk.

In [33]:
import pickle

In [35]:
# create the model paths
model_lr_file_path = os.path.join(os.getcwd(), 'models', 'lr_model.pkl')
model_lr2_file_path = os.path.join(os.getcwd(), 'models', 'lr2_model.pkl')
scalar_file_path = os.path.join(os.getcwd(), 'models', 'scalar.pkl')

In [36]:
# open the files to write
model_lr_pickle = open(model_lr_file_path, 'wb')
model_lr2_pickle = open(model_lr2_file_path, 'wb')
scalar_pickle = open(scalar_file_path, 'wb')

In [37]:
# persist the data
pickle.dump(clf, model_lr_pickle)
pickle.dump(clf2, model_lr2_pickle)
pickle.dump(scaler, scalar_pickle)

In [38]:
# close the files
model_lr_pickle.close()
model_lr2_pickle.close()
scalar_pickle.close()

### Load the persisted files

In [40]:
# open the files
model_lr_pickle = open(model_lr_file_path, 'rb')
model_lr2_pickle = open(model_lr2_file_path, 'rb')
scalar_pickle = open(scalar_file_path, 'rb')
#load files
clf_loaded = pickle.load(model_lr_pickle)
clf2_loaded = pickle.load(model_lr2_pickle)
scalar_loaded = pickle.load(scalar_pickle)
# close the files
model_lr_pickle.close()
model_lr2_pickle.close()
scalar_pickle.close()

In [41]:
clf_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
clf2_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
scalar_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [44]:
# transform the test data using the loaded scalar
X_test_scaled = scalar_loaded.transform(X_test)
# calculate the score using the loaded model
print('score for logistic regression model 3: {0:5f}'.format(clf2_loaded.score(X_test_scaled, y_test)))

score for logistic regression model 3: 0.815642
