In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn.apionly import load_dataset
import statsmodels.formula.api as smf

from sklearn.model_selection import KFold, LeaveOneOut, train_test_split
from sklearn import metrics

## Load the data

In [2]:
data = load_dataset('car_crashes')
data.head(3)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ


## Train test split
Splits in this case two thirds for train and one third for test

In [3]:
scores = []
n_splits = 5
for _ in range(n_splits):
    train, test = train_test_split(data, test_size=.33)
    model = smf.ols("total ~ speeding + alcohol", train).fit()
    y_pred = model.predict(test)
    #score = metrics.r2_score(y_true=test['total'], y_pred=y_pred)
    score = metrics.mean_squared_error(y_true=test['total'], y_pred=y_pred)
    scores.append(score)
np.array(scores).mean()

5.0588109320862964

## Leave one out

In [4]:
loo = LeaveOneOut()
for train_index, test_index in loo.split(data):
    train, test = data.iloc[train_index], data.iloc[test_index]
    model = smf.ols("total ~ speeding + alcohol", train).fit()
    y_pred = model.predict(test)
    #score = metrics.r2_score(y_true=test['total'], y_pred=y_pred)
    score = metrics.mean_squared_error(y_true=test['total'], y_pred=y_pred)
    scores.append(score)
np.array(scores).mean()

5.1594090109870132

## K-Folds cross-validator

In [5]:
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(data):
    train, test = data.iloc[train_index], data.iloc[test_index]
    model = smf.ols("total ~ speeding + alcohol", train).fit()
    y_pred = model.predict(test)
    #score = metrics.r2_score(y_true=test['total'], y_pred=y_pred)
    score = metrics.mean_squared_error(y_true=test['total'], y_pred=y_pred)
    scores.append(score)
np.array(scores).mean()

5.1691041576354095

## Functions for feature selection
Because of formulas like: "total ~ speeding + alcohol" are hard to work with for feature selection will use the regual sm api without formula and will transform the categorical features to dummy vecotr

In [6]:
data = load_dataset('tips')
targets = data['tip']
del data['tip']
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [7]:
import statsmodels.api as sm

categorical_columns = data.columns[data.dtypes=='category']

features = {}
for column in data.columns:
    if column in categorical_columns:
        features[column] = pd.get_dummies(data[[column]]).astype(np.float32)
    else:
        features[column] = data[[column]].astype(np.float32)

## Here is a regression with all the features

In [8]:
x = pd.concat(features.values(), axis=1)
x['intercept'] = 1
model = sm.OLS(targets, x).fit()
model.summary()

0,1,2,3
Dep. Variable:,tip,R-squared:,0.47
Model:,OLS,Adj. R-squared:,0.452
Method:,Least Squares,F-statistic:,26.06
Date:,"Thu, 27 Apr 2017",Prob (F-statistic):,1.1999999999999999e-28
Time:,15:44:01,Log-Likelihood:,-347.48
No. Observations:,244,AIC:,713.0
Df Residuals:,235,BIC:,744.4
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
size,0.1760,0.090,1.966,0.051,-0.000,0.352
time_Lunch,0.1615,0.237,0.682,0.496,-0.305,0.628
time_Dinner,0.0934,0.214,0.437,0.662,-0.328,0.514
total_bill,0.0945,0.010,9.841,0.000,0.076,0.113
day_Thur,-0.0212,0.308,-0.069,0.945,-0.627,0.585
day_Fri,0.1410,0.194,0.727,0.468,-0.241,0.523
day_Sat,0.0196,0.191,0.103,0.918,-0.356,0.395
day_Sun,0.1156,0.198,0.584,0.560,-0.274,0.506
sex_Male,0.1113,0.080,1.392,0.165,-0.046,0.269

0,1,2,3
Omnibus:,27.86,Durbin-Watson:,2.096
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.555
Skew:,0.607,Prob(JB):,3.87e-12
Kurtosis:,4.923,Cond. No.,3.35e+17


In [13]:
# Bigger is better
def score_fn(features, targets=targets, n_splits=20):
    if not len(features): return -np.inf
    scores = []
    X = pd.concat(features.values(), axis=1)
    assert X.index.equals(targets.index)
    X['intercept'] = 1
    
    for _ in range(n_splits):
        train_index, test_index = train_test_split(X.index, test_size=.33)

        X_train, y_train = X.loc[train_index], targets.loc[train_index]
        X_test, y_true = X.loc[test_index], targets.loc[test_index]

        model = sm.OLS(y_train, X_train).fit()
        
        y_pred = model.predict(X_test)
        score = metrics.r2_score(y_true=y_true, y_pred=y_pred)
        scores.append(score)
    return np.array(scores).mean()


def forward_step(selected_features, test_features, targets, score_fn):
    max_score = score_fn(selected_features)
    max_score_feature = None
    for feature in test_features.keys():
        score_features = {**selected_features, **dict(feature=test_features[feature])}
        new_score = score_fn(score_features, targets=targets)
        if new_score > max_score:
            max_score = new_score
            max_score_feature = feature
    return max_score_feature
    
def forward_feature_selection(features, targets=targets, score_fn=score_fn):
    test_features = features.copy()
    selected_features = {}
    while True:
        add_key = forward_step(selected_features, test_features, targets, score_fn)
        if add_key is None:
            return selected_features
        else:
            del test_features[add_key]
            selected_features[add_key] = features[add_key]

def backward_step(features, targets, score_fn):
    max_score = score_fn(features)
    max_score_feature = None
    for feature in features.keys():
        test_features = features.copy()
        del test_features[feature]
        new_score = score_fn(test_features)
        if new_score >= max_score:
            max_score = new_score
            max_score_feature = feature
    return max_score_feature

def backward_feature_selection(features, targets=targets, score_fn=score_fn):
    selected_features = features.copy()
    while True:
        remove_feature = backward_step(selected_features, targets, score_fn)
        if remove_feature is None:
            return selected_features
        else:
            del selected_features[remove_feature]

def stepwise_feature_selection(features, targets=targets, score_fn=score_fn):
    test_features = features.copy()
    selected_features = {}
    while True:
        # forward step
        add_feature = forward_step(selected_features, test_features, targets, score_fn)
        if add_feature is not None:
            selected_features[add_feature] = test_features[add_feature]
            del test_features[add_feature]
        # backward step
        remove_feature = backward_step(selected_features, targets, score_fn)
        if remove_feature is not None:
            test_features[remove_feature] = selected_features[remove_feature]
            del selected_features[remove_feature]
        # nothing more to do, just return
        if add_feature is None and remove_feature is None:
            return selected_features

In [10]:
selected_features = forward_feature_selection(features, targets=targets, score_fn=score_fn)
print('Final cross-validated score:', score_fn(selected_features))
selected_features.keys()

Final cross-validated score: 0.429700080737


dict_keys(['time', 'total_bill', 'day'])

In [11]:
selected_features = backward_feature_selection(features, targets=targets, score_fn=score_fn)
print('Final cross-validated score:', score_fn(selected_features))
selected_features.keys()

Final cross-validated score: 0.443424953038


dict_keys(['total_bill', 'smoker'])

In [12]:
selected_features = stepwise_feature_selection(features, targets=targets, score_fn=score_fn)
print('Final cross-validated score:', score_fn(selected_features))
selected_features.keys()

Final cross-validated score: 0.429951960977


dict_keys(['total_bill'])