In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score

In [2]:
# Load the whitespace-delimited insects table; header=1 takes the column
# names from the second line of the file. The separator is written as a raw
# string so that '\s+' reaches pandas as a regex instead of relying on Python
# passing an invalid string escape through unchanged.
insects = pd.read_csv('DATA/insects.csv', header=1, sep=r'\s+')
insects.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,42.0,42.0,42.0,42.0
mean,0.52381,44.6,864.52381,0.5
std,0.505487,5.637592,52.276581,0.506061
min,0.0,35.5,789.0,0.0
25%,0.0,40.7,812.5,0.0
50%,1.0,45.0,872.0,0.5
75%,1.0,48.8,914.5,1.0
max,1.0,56.1,944.0,1.0


In [16]:
def statsmodels_train_test_split(df, test_frac = 0.2):
    '''
    Randomly split a dataframe into a training part and a test part.

    input: 
        dataframe df (any index: default, string labels, filtered, ...)
        float test_frac, with 0.0 < test_frac < 1.0
    
    output: 
        tuple (train, test) containing:
            dataframe with the remaining len(df) - int(test_frac * len(df)) rows,
                in their original order, and
            dataframe with int(test_frac * len(df)) rows, in the order drawn
        The test dataframe is empty when test_frac * len(df) < 1.

    raises:
        AssertionError if test_frac is not strictly between 0.0 and 1.0

    note:
        Rows are drawn with numpy's global random generator, so call
        np.random.seed(...) beforehand for a reproducible split.
            
    usage: 
        insects_train, insects_test = statsmodels_train_test_split(insects)
        
    alternate usage:
        insects_train, insects_test = statsmodels_train_test_split(insects, test_frac = 0.1)
    
    '''
    assert (test_frac < 1.0 and test_frac > 0.0), \
        'test_frac must satisfy 0.0 < test_frac < 1.0'  #fail if test_frac is not valid
    
    n_rows = len(df.index)
    test_size = int(test_frac * n_rows)

    # Sample row *positions* rather than index labels: .iloc is positional, so
    # feeding it labels only works when the index happens to be 0..n-1.
    test_positions = np.random.choice(n_rows, size=test_size, replace=False)

    # Boolean mask keeps the training rows in their original order.
    is_train = np.ones(n_rows, dtype=bool)
    is_train[test_positions] = False
    
    return df.iloc[is_train], df.iloc[test_positions]

In [17]:
# Random split with the default test_frac of 0.2. No seed is set before the
# call, so the rows that land in each part change from run to run.
insects_train, insects_test = statsmodels_train_test_split(insects)


In [18]:
# Summary statistics of the held-out test rows.
insects_test.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,8.0,8.0,8.0,8.0
mean,0.75,44.0625,878.125,0.375
std,0.46291,5.259532,53.193548,0.517549
min,0.0,37.0,812.0,0.0
25%,0.75,40.325,817.25,0.0
50%,1.0,44.05,905.5,0.0
75%,1.0,47.925,917.25,1.0
max,1.0,50.8,930.0,1.0


In [15]:
# Summary statistics of the training rows.
insects_train.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,34.0,34.0,34.0,34.0
mean,0.470588,44.761765,864.264706,0.5
std,0.50664,5.823568,51.858379,0.507519
min,0.0,35.5,789.0,0.0
25%,0.0,40.75,812.75,0.0
50%,0.0,45.25,872.0,0.5
75%,1.0,48.8,914.5,1.0
max,1.0,56.1,934.0,1.0


In [7]:
# Ordinary least squares fit of wingsize on latitude and sex, using the
# training rows only.
linear_model = smf.ols(formula='wingsize ~ latitude + sex', data=insects_train).fit()

# Display the fitted model's summary tables.
linear_model.summary()

0,1,2,3
Dep. Variable:,wingsize,R-squared:,0.958
Model:,OLS,Adj. R-squared:,0.955
Method:,Least Squares,F-statistic:,354.0
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,4.49e-22
Time:,23:47:19,Log-Likelihood:,-129.42
No. Observations:,34,AIC:,264.8
Df Residuals:,31,BIC:,269.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,827.8628,16.597,49.880,0.000,794.013,861.713
latitude,1.9531,0.361,5.412,0.000,1.217,2.689
sex,-99.1898,3.954,-25.089,0.000,-107.253,-91.127

0,1,2,3
Omnibus:,4.414,Durbin-Watson:,2.043
Prob(Omnibus):,0.11,Jarque-Bera (JB):,3.182
Skew:,0.438,Prob(JB):,0.204
Kurtosis:,4.216,Cond. No.,383.0


In [9]:
# Model predictions for the held-out test rows.
linear_model.predict(insects_test)

41    838.240636
11    898.954851
5     910.673318
4     907.743701
24    808.163237
25    808.553853
36    817.538011
31    827.889324
dtype: float64

In [11]:
# Out-of-sample fit quality: R^2 between the observed test wingsize values
# and the model's predictions for the same rows.
y_test_pred = linear_model.predict(insects_test)

r2_score(insects_test['wingsize'], y_test_pred)

0.9379239574216208

In [None]:
# Version 2: adds an optional random_seed for reproducible splits.
# NOTE: running this cell redefines (shadows) any earlier function of the same name.

def statsmodels_train_test_split(df, test_frac = 0.2, random_seed=None):
    '''
    Randomly split a dataframe into a training part and a test part.

    input: 
        dataframe df (any index: default, string labels, filtered, ...)
        float test_frac, with 0.0 < test_frac < 1.0
        random_seed: None, or a seed accepted by np.random.RandomState.
            When given, the split is reproducible and numpy's global random
            state is left untouched. When None, rows are drawn with numpy's
            global random generator.
    
    output: 
        tuple (train, test) containing:
            dataframe with the remaining len(df) - int(test_frac * len(df)) rows,
                in their original order, and
            dataframe with int(test_frac * len(df)) rows, in the order drawn
        The test dataframe is empty when test_frac * len(df) < 1.

    raises:
        AssertionError if test_frac is not strictly between 0.0 and 1.0
            
    usage: 
        insects_train, insects_test = statsmodels_train_test_split(insects)
        
    alternate usage:
        insects_train, insects_test = statsmodels_train_test_split(insects, test_frac = 0.1)

    reproducible usage:
        insects_train, insects_test = statsmodels_train_test_split(insects, random_seed = 42)
    
    '''
    assert (test_frac < 1.0 and test_frac > 0.0), \
        'test_frac must satisfy 0.0 < test_frac < 1.0'  #fail if test_frac is not valid

    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, without reseeding the global generator
    # that other code may rely on.
    if random_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_seed)
    
    n_rows = len(df.index)
    test_size = int(test_frac * n_rows)

    # Sample row *positions* rather than index labels: .iloc is positional, so
    # feeding it labels only works when the index happens to be 0..n-1.
    test_positions = rng.choice(n_rows, size=test_size, replace=False)

    # Boolean mask keeps the training rows in their original order.
    is_train = np.ones(n_rows, dtype=bool)
    is_train[test_positions] = False
    
    return df.iloc[is_train], df.iloc[test_positions]