In [59]:
import pandas
import statsmodels.formula.api as sm
import numpy as np 

from code import data_clean
import datetime

In [57]:
# Re-execute the data_clean module so that edits made to it since the first
# import are picked up without restarting the kernel.
# NOTE(review): bare `reload` is a builtin only on Python 2; on Python 3 it has
# to be imported from importlib before this cell can run.
reload(data_clean)

<module 'code.data_clean' from 'code/data_clean.py'>

In [2]:
# Read the gzipped CSV into the working frame `hn_top`; pandas infers the
# compression from the .gz suffix.
top_csv_path = 'hackernews_march_2015_top.csv.gz'
hn_top = pandas.read_csv(top_csv_path)

In [3]:
# Summarise the freshly loaded frame: row count, per-column non-null counts,
# dtypes and memory footprint.
hn_top.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 424865 entries, 0 to 424864
Data columns (total 6 columns):
id              424865 non-null int64
position        424865 non-null int64
score           424865 non-null int64
age_in_hours    424865 non-null float64
numComments     424865 non-null int64
timeScraped     424865 non-null object
dtypes: float64(1), int64(4), object(1)
memory usage: 22.7+ MB


## Transform features

In [46]:
# Pass the frame through the project's feature-building helper and rebind
# `hn_top` to whatever it returns.
# NOTE(review): presumably this adds the columns used further down
# (e.g. dateScraped, score_delta) — confirm in code/data_clean.py.
# NOTE(review): because the input name is overwritten, re-running this cell
# feeds already-transformed data back into the helper.
hn_top = data_clean.create_features(hn_top)

## Clean data

In [49]:
# Apply the project's invalid-observation filter and rebind `hn_top` to the result.
# NOTE(review): what counts as "invalid" is defined in code/data_clean.py and is
# not visible here — confirm the criteria there.
hn_top = data_clean.remove_invalid_observations(hn_top)

## Remove articles with few observations

In [56]:
# Apply the project's low-observation-article filter and rebind `hn_top` to the result.
# NOTE(review): the minimum-observation threshold is not passed here, so it is
# presumably a default inside code/data_clean.py — confirm its value.
hn_top = data_clean.remove_articles_low_observations(hn_top)

## Remove observations by time of day

In [58]:
# Apply the project's time-of-day filter with arguments 8 and 20 and rebind
# `hn_top` to the result.
# NOTE(review): presumably 8 and 20 are hour-of-day bounds; whether observations
# inside or outside that window are the ones removed is not visible here —
# confirm in code/data_clean.py.
hn_top = data_clean.remove_observations_time_of_day(hn_top, 8, 20)

## Take subsets for tractability

In [102]:
# Keep only rows scraped on or after the cutoff date so the model below is
# fitted on a smaller frame.
# NOTE(review): the CSV filename mentions March 2015 while this cutoff is
# 6 June 2015 — confirm the actual date range of the data.
subset_start = datetime.date(2015, 6, 6)
scraped_on_or_after_start = hn_top.dateScraped >= subset_start
temp = hn_top[scraped_on_or_after_start]

# Fit the basic model

In [104]:
%time model = sm.poisson('score_delta ~ C(id) + C(position)', data=temp)

CPU times: user 28.5 s, sys: 1.11 s, total: 29.6 s
Wall time: 20.9 s


In [105]:
%time fittedModel = model.fit(method='lbfgs', maxiter=5000)

CPU times: user 3min 55s, sys: 2.83 s, total: 3min 58s
Wall time: 2min 22s


In [68]:
# Peek at the first five fitted coefficients (the intercept and the first
# article-id terms).
fittedModel.params.head()

Intercept           1.675757
C(id)[T.9631580]   -2.251638
C(id)[T.9632604]    0.887215
C(id)[T.9632928]    0.174629
C(id)[T.9633038]   -5.955184
dtype: float64

In [88]:
# Display the coefficient Series held on the fitted results object.
# NOTE(review): the values recorded below are exactly 5 larger than those shown
# by the earlier .head() cell (In [68]), so `params` had been modified in place
# between the two executions; this output is not the raw fit.
fittedModel.params

Intercept            6.675757
C(id)[T.9631580]     2.748362
C(id)[T.9632604]     5.887215
C(id)[T.9632928]     5.174629
C(id)[T.9633038]    -0.955184
C(id)[T.9633051]     5.971427
C(id)[T.9633170]    -0.156288
C(id)[T.9633251]    -0.329970
C(id)[T.9633487]     5.504299
C(id)[T.9633534]    -0.540006
C(id)[T.9633740]     6.815654
C(id)[T.9633834]    -0.672816
C(id)[T.9633884]     0.016382
C(id)[T.9633921]     6.195081
C(id)[T.9633951]    -0.888968
C(id)[T.9633960]    -0.005967
C(id)[T.9634082]     5.075738
C(id)[T.9634138]     0.550259
C(id)[T.9634462]     0.007994
C(id)[T.9634468]    -0.334719
C(id)[T.9634491]     4.872484
C(id)[T.9634506]    -2.398467
C(id)[T.9634549]     5.305253
C(id)[T.9634557]     4.365942
C(id)[T.9634561]    -0.548577
C(id)[T.9634610]     5.771585
C(id)[T.9634611]     5.978492
C(id)[T.9634634]    -1.596578
C(id)[T.9634729]    -1.454426
C(id)[T.9634742]    -1.157009
                       ...   
C(position)[T.61]    0.840940
C(position)[T.62]    0.540386
C(position

In [98]:
fittedModel.params = fittedModel.params - 6

In [99]:
fittedModel.model.predict(fittedModel.params)

array([ 1.52200806,  0.37437582,  0.31081368, ...,  0.03592436,
        0.01584194,  0.0164471 ])

In [None]:
# NOTE(review): bare attribute reference in a never-executed cell (prompt is
# In [None]); it does not call predict and looks like leftover exploration.
fittedModel.model.predict

In [101]:
# IPython help lookup (trailing `?`) for the results object's predict method.
# NOTE(review): interactive leftover with no recorded output — candidate for
# removal from the finished notebook.
fittedModel.predict?

In [90]:
# Display the coefficient Series held on the fitted results object.
# NOTE(review): the values recorded below are exactly 15 larger than those shown
# by the .head() cell (In [68]), so `params` had been modified in place before
# this execution; this output is not the raw fit.
fittedModel.params

Intercept            16.675757
C(id)[T.9631580]     12.748362
C(id)[T.9632604]     15.887215
C(id)[T.9632928]     15.174629
C(id)[T.9633038]      9.044816
C(id)[T.9633051]     15.971427
C(id)[T.9633170]      9.843712
C(id)[T.9633251]      9.670030
C(id)[T.9633487]     15.504299
C(id)[T.9633534]      9.459994
C(id)[T.9633740]     16.815654
C(id)[T.9633834]      9.327184
C(id)[T.9633884]     10.016382
C(id)[T.9633921]     16.195081
C(id)[T.9633951]      9.111032
C(id)[T.9633960]      9.994033
C(id)[T.9634082]     15.075738
C(id)[T.9634138]     10.550259
C(id)[T.9634462]     10.007994
C(id)[T.9634468]      9.665281
C(id)[T.9634491]     14.872484
C(id)[T.9634506]      7.601533
C(id)[T.9634549]     15.305253
C(id)[T.9634557]     14.365942
C(id)[T.9634561]      9.451423
C(id)[T.9634610]     15.771585
C(id)[T.9634611]     15.978492
C(id)[T.9634634]      8.403422
C(id)[T.9634729]      8.545574
C(id)[T.9634742]      8.842991
                       ...    
C(position)[T.61]    10.840940
C(positi