In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the whitespace-delimited insects table; header=1 uses row 1 (0-indexed)
# of the file as the column names.
# The separator is a regular expression, so it is written as a raw string:
# '\s' inside a normal string literal is an invalid escape sequence.
insects = pd.read_csv('DATA/insects.csv', header=1, sep=r'\s+')
insects.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,42.0,42.0,42.0,42.0
mean,0.52381,44.6,864.52381,0.5
std,0.505487,5.637592,52.276581,0.506061
min,0.0,35.5,789.0,0.0
25%,0.0,40.7,812.5,0.0
50%,1.0,45.0,872.0,0.5
75%,1.0,48.8,914.5,1.0
max,1.0,56.1,944.0,1.0


In [3]:
#version 2

def statsmodels_train_test_split(df, test_frac = 0.2, random_state=None):
    '''
    Randomly split a dataframe into a training part and a test part.

    input: 
        dataframe df
        float test_frac, with 0.0 < test_frac < 1.0
        int random_state (optional): seed for the row selection, so the
            split is reproducible. When None, numpy's global random state
            is used and the split differs from call to call.
    
    output: 
        tuple containing:
            training dataframe with len(df) - int(test_frac * len(df)) rows,
                kept in their original row order, and
            test dataframe with int(test_frac * len(df)) rows, in the order
                they were drawn
            
    usage: 
        insects_train, insects_test = statsmodels_train_test_split(insects)
        
    alternate usage:
        insects_train, insects_test = statsmodels_train_test_split(insects, test_frac = 0.1)

    reproducible usage:
        insects_train, insects_test = statsmodels_train_test_split(insects, random_state=42)
    
    raises:
        AssertionError if test_frac is not strictly between 0.0 and 1.0
    '''
    assert 0.0 < test_frac < 1.0, 'test_frac must satisfy 0.0 < test_frac < 1.0'

    # A private RandomState seeded with random_state draws the same sample as
    # np.random.seed(random_state) followed by np.random.choice, but does not
    # reseed numpy's global generator as a side effect of calling this function.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_rows = len(df.index)
    test_size = int(test_frac * n_rows)

    # Sample row *positions*, not index labels: the rows are selected with
    # .iloc below, which is positional. Sampling labels and feeding them to
    # .iloc only works when the index happens to be 0..n-1, and building a set
    # of labels silently drops rows when labels repeat.
    test_pos = rng.choice(n_rows, size=test_size, replace=False)

    # Everything not drawn for the test set, in ascending (original) order.
    train_pos = np.setdiff1d(np.arange(n_rows), test_pos)

    return df.iloc[train_pos], df.iloc[test_pos]

In [14]:
# Hold out the default 20% of rows for testing; the fixed seed keeps the split repeatable.
insects_train, insects_test = statsmodels_train_test_split(df=insects, random_state=42)


In [15]:
# Summary statistics of the training split, for comparison with the full data set.
insects_train.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,34.0,34.0,34.0,34.0
mean,0.5,44.341176,864.147059,0.5
std,0.507519,5.85823,52.649287,0.507519
min,0.0,35.5,789.0,0.0
25%,0.0,39.3,812.5,0.0
50%,0.5,45.0,872.0,0.5
75%,1.0,48.5,911.5,1.0
max,1.0,56.1,944.0,1.0


In [16]:
# Summary statistics of the held-out test split.
insects_test.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,8.0,8.0,8.0,8.0
mean,0.625,45.7,866.125,0.5
std,0.517549,4.759952,54.178112,0.534522
min,0.0,40.9,800.0,0.0
25%,0.0,41.2,815.75,0.0
50%,1.0,45.6,870.0,0.5
75%,1.0,49.2,916.25,1.0
max,1.0,52.1,927.0,1.0


In [17]:
# Ordinary least squares fitted on the training split only:
# wingsize modelled as a linear function of latitude and sex.
formula = 'wingsize ~ latitude + sex'
linear_model = smf.ols(formula, insects_train).fit()

linear_model.summary()

0,1,2,3
Dep. Variable:,wingsize,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,368.2
Date:,"Tue, 08 Sep 2020",Prob (F-statistic):,2.5000000000000002e-22
Time:,12:36:59,Log-Likelihood:,-127.95
No. Observations:,34,AIC:,261.9
Df Residuals:,31,BIC:,266.5
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,827.6143,14.621,56.604,0.000,797.794,857.434
latitude,1.9423,0.324,5.987,0.000,1.281,2.604
sex,-99.1862,3.745,-26.487,0.000,-106.824,-91.549

0,1,2,3
Omnibus:,7.937,Durbin-Watson:,2.547
Prob(Omnibus):,0.019,Jarque-Bera (JB):,6.997
Skew:,0.769,Prob(JB):,0.0302
Kurtosis:,4.604,Cond. No.,350.0


In [18]:
# Predicted wingsize for each held-out row (displayed, not stored).
linear_model.predict(exog=insects_test)

25    807.870004
13    907.833143
8     922.400736
26    810.783523
4     907.056205
39    826.322289
19    928.810477
29    823.214536
dtype: float64

In [19]:
# Keep the test-set predictions for the metric cells that follow.
y_test_pred = linear_model.predict(exog=insects_test)

In [20]:
# Out-of-sample R^2: observed wingsize versus the model's predictions on the test split.
r2_score(y_true=insects_test['wingsize'], y_pred=y_test_pred)

0.9453610597019787

In [21]:
# Out-of-sample mean squared error, in squared wingsize units.
mean_squared_error(y_true=insects_test['wingsize'], y_pred=y_test_pred)

140.33243455448826