## Logistic Regression: Fit and evaluate a model

Using the Titanic dataset from [this](https://www.kaggle.com/c/titanic/overview) Kaggle competition.

In this section, we will fit and evaluate a simple Logistic Regression model.

### Read in Data

In [49]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

tr_features = pd.read_csv('../train_features.csv')
tr_labels = pd.read_csv('../train_labels.csv', header=None)

### Hyperparameter tuning

In [50]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [55]:
lr = LogisticRegression()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(tr_features, tr_labels)

print_results(cv)

BEST PARAMS: {'C': 1}

0.678 (+/-0.092) for {'C': 0.001}
0.704 (+/-0.099) for {'C': 0.01}
0.796 (+/-0.13) for {'C': 0.1}
0.798 (+/-0.123) for {'C': 1}
0.794 (+/-0.118) for {'C': 10}
0.794 (+/-0.118) for {'C': 100}
0.794 (+/-0.118) for {'C': 1000}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [52]:
cv.cv_results_



{'mean_fit_time': array([0.00316734, 0.00257254, 0.0024642 , 0.00226369, 0.0022295 ]),
 'mean_score_time': array([0.00079961, 0.00071874, 0.00061779, 0.00057936, 0.00059919]),
 'mean_test_score': array([0.70411985, 0.79588015, 0.79775281, 0.79400749, 0.79400749]),
 'mean_train_score': array([0.70785242, 0.80289519, 0.80944602, 0.8066368 , 0.80757248]),
 'param_C': masked_array(data=[0.01, 0.1, 1, 10, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.01}, {'C': 0.1}, {'C': 1}, {'C': 10}, {'C': 100}],
 'rank_test_score': array([5, 2, 1, 3, 3], dtype=int32),
 'split0_test_score': array([0.75925926, 0.86111111, 0.86111111, 0.85185185, 0.85185185]),
 'split0_train_score': array([0.69248826, 0.78403756, 0.79342723, 0.78873239, 0.78873239]),
 'split1_test_score': array([0.73831776, 0.8317757 , 0.8317757 , 0.82242991, 0.82242991]),
 'split1_train_score': array([0.70023419, 0.80093677, 0.80327869, 0.80327869, 0.

In [53]:
cv.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Write out pickled model

In [54]:
joblib.dump(cv.best_estimator_, '../LR_model.pkl')

['../LR_model.pkl']