In [1]:
# !conda install scikit-learn -y

In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import sklearn

In [3]:
insects = pd.read_csv('DATA/insects.csv', header=1, sep='\s+')
insects.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,42.0,42.0,42.0,42.0
mean,0.52381,44.6,864.52381,0.5
std,0.505487,5.637592,52.276581,0.506061
min,0.0,35.5,789.0,0.0
25%,0.0,40.7,812.5,0.0
50%,1.0,45.0,872.0,0.5
75%,1.0,48.8,914.5,1.0
max,1.0,56.1,944.0,1.0


In [4]:
insects.head()

Unnamed: 0,continent,latitude,wingsize,sex
0,1,35.5,901,0
1,1,37.0,896,0
2,1,38.6,906,0
3,1,40.7,907,0
4,1,40.9,898,0


In [5]:
# train/test split is done using sklearn
from sklearn.model_selection import train_test_split

In [6]:
X = insects[['continent', 'latitude','sex']]

In [7]:
y = insects['wingsize']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [32]:
pd.concat([X_train, y_train], axis=1)

Unnamed: 0,continent,latitude,sex,wingsize
0,1,35.5,0,901
32,0,36.4,1,789
10,1,50.8,0,930
12,0,39.3,0,889
8,1,48.8,0,927
40,0,52.1,1,819
20,0,56.1,0,934
7,1,46.8,0,915
39,0,50.4,1,842
37,0,47.3,1,815


In [33]:
insects_train = pd.concat([X_train, y_train], axis = 1)
insects_train

Unnamed: 0,continent,latitude,sex,wingsize
0,1,35.5,0,901
32,0,36.4,1,789
10,1,50.8,0,930
12,0,39.3,0,889
8,1,48.8,0,927
40,0,52.1,1,819
20,0,56.1,0,934
7,1,46.8,0,915
39,0,50.4,1,842
37,0,47.3,1,815


In [34]:
insects_test = pd.concat([X_test,y_test], axis = 1)
insects_test

Unnamed: 0,continent,latitude,sex,wingsize
2,1,38.6,0,906
31,1,50.8,1,814
33,0,39.3,1,803
19,0,52.1,0,920
36,0,45.5,1,808
29,1,48.8,1,800
26,1,42.4,1,809
6,1,45.0,0,913
34,0,41.3,1,812


In [35]:
linear_model = smf.ols(formula='wingsize ~ latitude + sex', data=insects_train).fit()

linear_model.summary()

0,1,2,3
Dep. Variable:,wingsize,R-squared:,0.956
Model:,OLS,Adj. R-squared:,0.953
Method:,Least Squares,F-statistic:,322.5
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,5.21e-21
Time:,15:12:30,Log-Likelihood:,-124.88
No. Observations:,33,AIC:,255.8
Df Residuals:,30,BIC:,260.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,819.8114,15.148,54.121,0.000,788.875,850.747
latitude,2.1194,0.335,6.321,0.000,1.435,2.804
sex,-96.1975,3.905,-24.637,0.000,-104.172,-88.223

0,1,2,3
Omnibus:,2.447,Durbin-Watson:,2.509
Prob(Omnibus):,0.294,Jarque-Bera (JB):,1.382
Skew:,0.464,Prob(JB):,0.501
Kurtosis:,3.379,Cond. No.,350.0


## Let's check the r -squared measure of goodness of fit

In [12]:
from sklearn.metrics import r2_score

In [36]:
linear_model.params

Intercept    819.811366
latitude       2.119360
sex          -96.197509
dtype: float64

In [37]:
## THIS IS MODEL PREDICTION ON NEW DATA THE MODEL HAS NEVER SEEN!! NOTICE WE ARE USING THE TEST SET
y_test_pred = linear_model.predict(X_test)


In [38]:
y_test_pred  ## THIS IS MODEL PREDICTION ON NEW DATA!! NOTICE WE ARE USING THE TEST SET

2     901.618661
31    831.277344
33    806.904704
19    930.230021
36    820.044736
29    827.038624
26    813.474720
6     915.182565
34    811.143424
dtype: float64

In [39]:
y_test

2     906
31    814
33    803
19    920
36    808
29    800
26    809
6     913
34    812
Name: wingsize, dtype: int64

In [40]:
# THIS IS TESTING THE MODEL ON THE DATA THE MODEL HAS NEVER SEEN!!

r2_score(y_test, y_test_pred)  #### notice the decrease in R2

0.94028922812579

In [18]:
from sklearn import linear_model
sklearn_model = linear_model.LinearRegression().fit(X_train, y_train)

In [19]:
y_train_pred = sklearn_model.predict(X_train)

In [20]:
# THIS IS TESTING THE MODEL ON THE DATA WE USED TO BUILD THE MODEL!!

r2_train = r2_score(y_train, y_train_pred) 
r2_train

0.9595810746709353

In [21]:
y_test_pred = sklearn_model.predict(X_test)

In [22]:
# THIS IS TESTING THE MODEL ON THE DATA IT HAS NEVER SEEN!!
r2_test = r2_score(y_test, y_test_pred) 
r2_test

0.9470607886733824

In [23]:
train_test_split??

[0;31mSignature:[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0marrays[0m[0;34m,[0m [0;34m**[0m[0moptions[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0marrays[0m[0;34m,[0m [0;34m**[0m[0moptions[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Split arrays or matrices into random train and test subsets[0m
[0;34m[0m
[0;34m    Quick utility that wraps input validation and[0m
[0;34m    ``next(ShuffleSplit().split(X, y))`` and application to input data[0m
[0;34m    into a single call for splitting (and optionally subsampling) data in a[0m
[0;34m    oneliner.[0m
[0;34m[0m
[0;34m    Read more in the :ref:`User Guide <cross_validation>`.[0m
[0;34m[0m
[0;34m    Parameters[0m
[0;34m    ----------[0m
[0;34m    *arrays : sequence of indexables with same length / shape[0][0m
[0;34m        Allowed inputs are lists, numpy arrays, scipy-sparse[0m
[0;34m   

## ASSIGNMENT: DO THE SAME FOR THE CARS DATASET

In [24]:
import statsmodels.formula.api as smf

cars = pd.read_csv('DATA/cars_multivariate.csv',na_values=['?'])
cars = cars[cars.horsepower.notna()]
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model         392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car_name      392 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 30.6+ KB


In [56]:
idx = insects.index
idx

RangeIndex(start=0, stop=42, step=1)

In [57]:
train = np.random.choice(idx, size=int(len(idx)*0.8), replace=False)
train

array([38, 10,  5, 41, 39, 22, 35, 29, 30,  0,  7, 24, 25, 36, 13,  6, 19,
       33, 18, 28, 23, 11, 32, 27,  1, 14, 31, 12,  4,  8, 21,  2,  9])

In [59]:
insects.iloc(18)

ValueError: No axis named 18 for object type <class 'pandas.core.frame.DataFrame'>