In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# Notebook-wide pandas display settings: wrap wide frames at 100 characters.
pd.set_option('display.width', 100)
# Use the fully-qualified key: the short pattern 'precision' matches more than
# one option on pandas >= 1.4 and raises OptionError there.
pd.set_option('display.precision', 4)
import statsmodels.api as sm
from statsmodels.stats.stattools import durbin_watson
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import mean_squared_error, r2_score
from linearmodels import PanelOLS

# Linear regression on each feature

In [8]:
# Load the incident-level records; OCCURRED_ON_DATE is parsed as a datetime column.
crime = pd.read_csv('data/crime.csv', encoding='unicode_escape', parse_dates=['OCCURRED_ON_DATE'])
# Average the numeric columns per reporting area. numeric_only=True is explicit
# because pandas >= 2.0 raises TypeError on non-numeric columns instead of
# silently dropping them.
data = crime.groupby('REPORTING_AREA').mean(numeric_only=True)
# Number of incidents per reporting area; assignment aligns on the REPORTING_AREA index.
data['NUM'] = crime.REPORTING_AREA.value_counts()
# remove outliers: keep only areas where every column is within 3 standard
# deviations of that column's mean
data = data[(np.abs(stats.zscore(data, axis=0)) < 3).all(axis=1)]
# Candidate explanatory features, each regressed against NUM separately below.
cols = ['MONTH', 'HOUR', 'Lat', 'Long']
y = data['NUM']

In [9]:
# Return the coefficient, standard error, t-statistic, p-value and R2
# of the regression y ~ data[colname], for every colname in cols.
# If add_const is True, a constant (intercept) term is added next to the single feature.

def single_feature_reg(y, data, cols, add_const = True):
    """Regress ``y`` on each column of ``data`` separately with OLS.

    Parameters
    ----------
    y : array-like
        Response values, one per row of ``data``.
    data : pandas.DataFrame
        Frame holding the candidate feature columns.
    cols : iterable of column labels
        Columns of ``data`` to use, one regression per label.
    add_const : bool, default True
        If True, a constant column is added next to the single feature
        via ``sm.add_constant``; otherwise the feature is the only regressor.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cols`` (used as the index) with columns
        ``coef``, ``std_err``, ``t-value``, ``p-value`` and ``R2``.
        Empty (zero rows, same columns) when ``cols`` is empty.
    """
    stat_names = ['coef', 'std_err', 't-value', 'p-value', 'R2']
    cols = list(cols)
    y_values = np.asarray(y)

    rows = []
    for colname in cols:
        X_ = data[[colname]]
        if add_const:
            X_ = sm.add_constant(X_)

        res = sm.OLS(y_values, np.asarray(X_)).fit()
        # The feature is always the LAST design column: position 1 when a
        # constant was prepended, position 0 when it is the only regressor.
        # A fixed index of 1 would raise IndexError for add_const=False.
        rows.append([res.params[-1], res.bse[-1], res.tvalues[-1],
                     res.pvalues[-1], res.rsquared])

    # Build the frame once rather than concatenating inside the loop; this
    # also yields a well-formed empty frame when cols is empty.
    return pd.DataFrame(rows, index=cols, columns=stat_names, dtype=float)

In [10]:
# One regression (with a constant term) per candidate feature; display the summary table.
res = single_feature_reg(y, data, cols, add_const=True)
res

Unnamed: 0,coef,std_err,t-value,p-value,R2
MONTH,-15.1268,33.6046,-0.4501,0.65273,0.0002
HOUR,60.7837,14.6261,4.1558,3.5746e-05,0.0202
Lat,99.8472,128.3889,0.7777,0.43697,0.0007
Long,219.1702,81.2787,2.6965,0.0071476,0.0086


In [7]:
# Render the summary table as LaTeX source. index=False leaves out the
# feature-name index, so the rows carry no labels.
res.to_latex(index=False)

'\\begin{tabular}{rrrrr}\n\\toprule\n     coef &   std\\_err &  t-value &     p-value &      R2 \\\\\n\\midrule\n -15.1268 &   33.6046 &  -0.4501 &  6.5273e-01 &  0.0002 \\\\\n  60.7837 &   14.6261 &   4.1558 &  3.5746e-05 &  0.0202 \\\\\n  99.8472 &  128.3889 &   0.7777 &  4.3697e-01 &  0.0007 \\\\\n 219.1702 &   81.2787 &   2.6965 &  7.1476e-03 &  0.0086 \\\\\n\\bottomrule\n\\end{tabular}\n'