In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from utils import columns, clean_df, numerics, summary
from pycaret.regression import *

%config Completer.use_jedi = False
seed = 1

# Agenda
1. Problem
2. EDA
3. Solution
    - Base Model (multivariate linear regression)
    - Advanced Framework
    - Production Scenario
4. Conclusion

# 1. Problem
- Simple regression model
- Target: **normalized losses** ~ "*relative average loss payment per insured vehicle year.*" ([UCI](https://archive.ics.uci.edu/ml/datasets/Automobile))
- Evaluate with MSE

# 2. EDA

This looks nice: https://medium.com/analytics-vidhya/implementing-linear-regression-using-sklearn-76264a3c073c

In [2]:
df = pd.read_csv("data/imports-85.data", header=None, names=columns)
df = clean_df(df)
df.head(3)

164 rows (80.0%) left after preprocessing


Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0


In [3]:
# histogram target + log(target)
#pd.Series(np.log(y)).plot.hist()
#y.plot.hist()





# fancy stuff to display relations





# 3. Solution
## 3.1 Base Model (multivariate linear regression)
In our simple approach we'll consider continuous variables only.

In [4]:
# TODO: Check for linearity etc (use scripts from previous notebook)

cols = list(df.columns)
target, num_features = cols[0], [c for c in cols[1:] if c in numerics]

X,y = df[num_features], np.log(df[target]) 

#X,y = pd.get_dummies(df[cols[1:]]),np.log(df[target])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

model = LinearRegression().fit(X_train, y_train)
summary(model, X_train, y_train)

Call:
lm(formula = normalized-losses ~ wheel-base + length + width + height + curb-weight + engine-size + bore + stroke + compression-ratio + horsepower + peak-rpm + city-mpg + highway-mpg + price) 

                       Coefficients  Std. Error  t value  Pr(>|t|)
0         (Intercept)        8.4932       1.600    5.310     0.000
1          wheel-base       -0.0036       0.010   -0.365     0.715
2              length        0.0031       0.005    0.618     0.538
3               width       -0.0146       0.025   -0.582     0.562
4              height       -0.0584       0.013   -4.441     0.000
5         curb-weight        0.0002       0.000    0.951     0.343
6         engine-size        0.0019       0.002    1.064     0.289
7                bore       -0.2206       0.116   -1.904     0.059
8              stroke       -0.1412       0.085   -1.653     0.101
9   compression-ratio        0.0063       0.008    0.791     0.430
10         horsepower       -0.0012       0.002   -0.687     0.

In [5]:
MSE = mean_squared_error(y_test, model.predict(X_test))
RMSE = mean_squared_error(y_test, model.predict(X_test), squared=False)

print(np.exp(MSE),np.exp(RMSE)) # reversing log via exp for interpretability of MSE

1.0234624573711244 1.1644948359274552


Doesn't look too bad considering a terrible R squared of 0.37.

## 3.2 Advanced Framework
- https://github.com/pycaret/pycaret/blob/master/tutorials/Regression%20Tutorial%20Level%20Beginner%20-%20REG101.ipynb
- https://github.com/pycaret/pycaret/blob/master/tutorials/Regression%20Tutorial%20Level%20Intermediate%20-%20REG102.ipynb

In [7]:
data = df.sample(frac=0.9, random_state=seed)
data_unseen = df.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (148, 25)
Unseen Data For Predictions: (16, 25)


In [24]:
params = {
    "data": data,
    "target": "normalized-losses",
    "train_size": 0.9,
    "test_data": data_unseen,
    "categorical_features": [c for c in cols[1:] if c not in numerics],
    "remove_multicollinearity": True,
    "verbose":False,
    "silent": True
}

regr = setup(**params)

In [25]:
#help(setup)

## 3.3 Production Scenario

# Conclusion

### Sources
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_predict.html#sphx-glr-auto-examples-model-selection-plot-cv-predict-py
- https://boostedml.com/2019/06/linear-regression-in-r-interpreting-summarylm.html
- https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
- 