In [15]:
import pandas as pd
import statsmodels.formula.api as smf

# Read the multivariate cars data; a '?' entry in the CSV means "missing".
cars = pd.read_csv('DATA/cars_multivariate.csv', na_values=['?'])
# Keep only the rows that have a horsepower value.
cars = cars.dropna(subset=['horsepower'])
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null float64
acceleration    392 non-null float64
model           392 non-null int64
origin          392 non-null int64
car_name        392 non-null object
dtypes: float64(5), int64(3), object(1)
memory usage: 30.6+ KB


In [16]:
# Baseline from cars.ipynb: every predictor enters the regression as a plain
# numeric term, including cylinders, model and origin.
formula = 'mpg ~ cylinders + displacement + horsepower + weight + acceleration + model + origin'

fitted_model = smf.ols(formula, cars).fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.821
Model:,OLS,Adj. R-squared:,0.818
Method:,Least Squares,F-statistic:,252.4
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,2.04e-139
Time:,10:47:08,Log-Likelihood:,-1023.5
No. Observations:,392,AIC:,2063.0
Df Residuals:,384,BIC:,2095.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-17.2184,4.644,-3.707,0.000,-26.350,-8.087
cylinders,-0.4934,0.323,-1.526,0.128,-1.129,0.142
displacement,0.0199,0.008,2.647,0.008,0.005,0.035
horsepower,-0.0170,0.014,-1.230,0.220,-0.044,0.010
weight,-0.0065,0.001,-9.929,0.000,-0.008,-0.005
acceleration,0.0806,0.099,0.815,0.415,-0.114,0.275
model,0.7508,0.051,14.729,0.000,0.651,0.851
origin,1.4261,0.278,5.127,0.000,0.879,1.973

0,1,2,3
Omnibus:,31.906,Durbin-Watson:,1.309
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.1
Skew:,0.529,Prob(JB):,2.95e-12
Kurtosis:,4.46,Cond. No.,85900.0


## We as data analysts know that 'model', 'origin', and 'cylinders' are actually *categorical* variables,
## BUT we did not explicitly mark them as such for statsmodels, so it treated them as ordinary numeric predictors. This is unsatisfactory.
## Let's fix this mistake now and see how the linear regression changes.

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of cars.
cars.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0,1.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0,2.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,3.0


In [20]:
# Tally how many cars there are for each distinct cylinder count.
cars['cylinders'].value_counts()

4    199
8    103
6     83
3      4
5      3
Name: cylinders, dtype: int64

In [17]:
# Refit with cylinders, model and origin wrapped in C(...) so that they enter
# the regression as categorical terms instead of numeric ones.
formula = 'mpg ~ C(cylinders) + displacement + horsepower + weight + acceleration + C(model) + C(origin)'

fitted_model = smf.ols(formula, data=cars).fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.874
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,116.8
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,2.64e-151
Time:,10:47:12,Log-Likelihood:,-954.59
No. Observations:,392,AIC:,1955.0
Df Residuals:,369,BIC:,2047.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,30.9168,2.361,13.095,0.000,26.274,35.559
C(cylinders)[T.4],6.9399,1.537,4.516,0.000,3.918,9.962
C(cylinders)[T.5],6.6377,2.337,2.840,0.005,2.042,11.234
C(cylinders)[T.6],4.2973,1.706,2.519,0.012,0.943,7.652
C(cylinders)[T.8],6.3668,1.969,3.234,0.001,2.495,10.238
C(model)[T.71],0.9104,0.816,1.116,0.265,-0.693,2.514
C(model)[T.72],-0.4903,0.804,-0.610,0.542,-2.071,1.090
C(model)[T.73],-0.5529,0.721,-0.766,0.444,-1.972,0.866
C(model)[T.74],1.2420,0.855,1.453,0.147,-0.439,2.923

0,1,2,3
Omnibus:,32.56,Durbin-Watson:,1.574
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.829
Skew:,0.528,Prob(JB):,7.53e-13
Kurtosis:,4.518,Cond. No.,79500.0


## Note that when you tell statsmodels that something is a categorical variable, it makes multiple dummy variables out of it!
## Why might we want that?
## Why didn't we have to do this with the 'sex' variable from insects.ipynb?

In [12]:
# Continue the analysis without acceleration: its p-value in the categorical
# fit was 96.7%, so the term is dropped from the formula.
formula = 'mpg ~ C(cylinders) + displacement + horsepower + weight + C(model) + C(origin)'

fitted_model = smf.ols(formula, data=cars).fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.874
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,122.6
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,2.36e-152
Time:,09:41:23,Log-Likelihood:,-954.59
No. Observations:,392,AIC:,1953.0
Df Residuals:,370,BIC:,2041.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,30.9707,1.970,15.718,0.000,27.096,34.845
C(cylinders)[T.4],6.9490,1.519,4.575,0.000,3.962,9.936
C(cylinders)[T.5],6.6467,2.324,2.860,0.004,2.077,11.217
C(cylinders)[T.6],4.3051,1.693,2.542,0.011,0.975,7.635
C(cylinders)[T.8],6.3723,1.962,3.249,0.001,2.515,10.230
C(model)[T.71],0.9057,0.807,1.123,0.262,-0.680,2.492
C(model)[T.72],-0.4921,0.802,-0.614,0.540,-2.068,1.084
C(model)[T.73],-0.5551,0.719,-0.772,0.440,-1.968,0.858
C(model)[T.74],1.2376,0.847,1.461,0.145,-0.428,2.903

0,1,2,3
Omnibus:,32.741,Durbin-Watson:,1.574
Prob(Omnibus):,0.0,Jarque-Bera (JB):,56.27
Skew:,0.53,Prob(JB):,6.04e-13
Kurtosis:,4.524,Cond. No.,79400.0


In [13]:
# We shouldn't take out any of the individual dummy variables: the dummies of one
# categorical variable are kept or dropped together as a group, so we are pretty much done.

In [14]:
#BONUS How could we incorporate weight^2 in the linear regression model above?