In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the whitespace-delimited insects table.
# - sep is a regular expression, so it is written as a raw string: in a normal
#   string literal `\s` is an invalid escape sequence (a warning today, slated
#   to become an error in a future Python release).
# - header=1 takes the column names from the second line of the file.
insects = pd.read_csv('DATA/insects.csv', header=1, sep=r'\s+')
insects.describe()

Unnamed: 0,continent,latitude,wingsize,sex
count,42.0,42.0,42.0,42.0
mean,0.52381,44.6,864.52381,0.5
std,0.505487,5.637592,52.276581,0.506061
min,0.0,35.5,789.0,0.0
25%,0.0,40.7,812.5,0.0
50%,1.0,45.0,872.0,0.5
75%,1.0,48.8,914.5,1.0
max,1.0,56.1,944.0,1.0


In [3]:
insects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   continent  42 non-null     int64  
 1   latitude   42 non-null     float64
 2   wingsize   42 non-null     int64  
 3   sex        42 non-null     int64  
dtypes: float64(1), int64(3)
memory usage: 1.4 KB


In [4]:
insects.head()

Unnamed: 0,continent,latitude,wingsize,sex
0,1,35.5,901,0
1,1,37.0,896,0
2,1,38.6,906,0
3,1,40.7,907,0
4,1,40.9,898,0


In [5]:
#version 3 includes stratify
from sklearn.model_selection import train_test_split

def statsmodels_train_test_split(df, stratify=None, **kwargs):
    """Split a DataFrame row-wise into a train frame and a test frame.

    The response and the predictors are kept together in each returned frame,
    which is the layout the statsmodels formula API (``smf.ols(formula,
    data=...)``) works with.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set. Every column appears, in its original position,
        in both returned frames.
    stratify : array-like, optional
        One label per row of ``df``. When given, the split is stratified on
        these labels. It may be a column of ``df`` (e.g. ``df['sex']``), a
        derived Series, or a plain array/list; it is only used to choose the
        rows and never replaces or removes a column of ``df``.
    **kwargs
        Passed through unchanged to
        ``sklearn.model_selection.train_test_split`` (``test_size``,
        ``random_state``, ``shuffle``, ...).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.
    """
    # Split the frame as a whole instead of peeling off a "y" column and
    # concatenating it back afterwards: the selected rows are the same for a
    # given random_state, but column order is preserved and `stratify` no
    # longer needs a `.name` that matches a column of `df`.
    train, test = train_test_split(df, stratify=stratify, **kwargs)
    return train, test
    

In [6]:
insects_train, insects_test = statsmodels_train_test_split(insects, random_state=42, stratify=insects.sex)

In [None]:
insects.describe()

In [7]:
insects['sex'].value_counts()

1    21
0    21
Name: sex, dtype: int64

In [None]:
insects_train.describe()

In [8]:
insects_train['sex'].value_counts()

1    16
0    15
Name: sex, dtype: int64

In [None]:
insects_test

In [None]:
insects_test.describe()

In [None]:
insects_test['sex'].value_counts()

In [None]:
insects_train, insects_test = statsmodels_train_test_split(insects, random_state=42)


In [None]:
insects_train.describe()

In [None]:
insects_test.describe()

In [9]:
# Ordinary least squares on the training split only.
# Formula: wingsize is the response; latitude enters as a numeric predictor and
# C(sex) marks sex as categorical, so it is dummy-coded (the summary reports it
# as the C(sex)[T.1] term).
formula='wingsize ~ latitude + C(sex)'
linear_model = smf.ols(formula=formula, data=insects_train).fit()

# Last expression of the cell: renders the regression summary tables.
linear_model.summary()

0,1,2,3
Dep. Variable:,wingsize,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,253.0
Date:,"Tue, 08 Sep 2020",Prob (F-statistic):,1.18e-18
Time:,15:06:37,Log-Likelihood:,-119.08
No. Observations:,31,AIC:,244.2
Df Residuals:,28,BIC:,248.5
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,833.3410,17.544,47.501,0.000,797.405,869.277
C(sex)[T.1],-98.2395,4.368,-22.492,0.000,-107.186,-89.293
latitude,1.7896,0.399,4.480,0.000,0.971,2.608

0,1,2,3
Omnibus:,4.747,Durbin-Watson:,2.192
Prob(Omnibus):,0.093,Jarque-Bera (JB):,3.302
Skew:,0.515,Prob(JB):,0.192
Kurtosis:,4.223,Cond. No.,369.0


In [10]:
linear_model.predict(insects_test)

14    911.007732
7     917.092222
39    825.295129
9     922.460890
37    819.747506
10    924.250445
19    926.576868
32    800.241347
22    801.315080
21    798.630747
6     913.871021
dtype: float64

In [12]:
y_test_pred = linear_model.predict(insects_test)

In [13]:
r2_score(insects_test['wingsize'], y_test_pred)

0.9752690369015538

In [14]:
mean_squared_error(insects_test['wingsize'], y_test_pred)

81.55372322118527

In [15]:
mean_squared_error??

[0;31mSignature:[0m
[0mmean_squared_error[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0my_true[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my_pred[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmultioutput[0m[0;34m=[0m[0;34m'uniform_average'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msquared[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;34m@[0m[0m_deprecate_positional_args[0m[0;34m[0m
[0;34m[0m[0;32mdef[0m [0mmean_squared_error[0m[0;34m([0m[0my_true[0m[0;34m,[0m [0my_pred[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m                       [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m                       [0mmultioutput[0m[0;34m=[0m[0;34m'uniform_average'[0m[0;34m,[0m [0msquar

In [None]:
import seaborn as sns


In [None]:
iris = sns.load_dataset('iris')
iris.head()

In [None]:
iris_train, iris_test = statsmodels_train_test_split(iris, stratify=iris.species, random_state=42)

In [None]:
iris_train

In [None]:
iris_train['species'].value_counts()

In [None]:
iris_test['species'].describe()

In [None]:
iris.describe()