In [None]:
"""
This notebook takes in n-gram time series data produced by a MapReduce (MR) job.
The assumed format is:

2	(2013-01-01 ,#dead)

i.e. count \t (date, term)

We parse this data and fit a linear trend model to each term
Through clever use of interaction terms in statsmodels we can do this in 1 shot

"""
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import gzip
import string
import re
from time import time
import os.path
from datetime import datetime

## Load the stopword file: one entry per line, surrounding whitespace trimmed
## and any double-quote characters removed.
stopwords = []
with open('stopwords.txt', 'r') as stop_fn:
    stopwords.extend(entry.strip().replace('"', '') for entry in stop_fn)
print("read in {} stopwords".format(len(stopwords)))
print(stopwords)

In [None]:
## Parse the raw time-series file into [day_index, term, count] rows.
## Each line is expected to look like "<count>\t(<YYYY-MM-DD> ,<term>)".
## Lines that cannot be parsed are reported and skipped; lines whose term is a
## stopword or whose date does not look like 20YY-MM-DD are dropped silently.
t0=time()
line_num=0
data_for_df=[]
# Only feeds the run-time extrapolation printed at the end; that estimate is
# meaningful only when the early-exit check inside the loop is enabled.
# Defined before the loop so it exists even if the input file is empty.
num_test_recs=1000
# Loop invariants hoisted out of the per-line work: compile the date check
# once, and use a set so the stopword test does not scan a list on every line.
pattern=re.compile('20[0-9]{2}-[0-9]{2}-[0-9]{2}')
stopword_set=set(stopwords)
with open('/mnt/snelson/word_time_series/unzipped_word_time_series.txt', 'r') as f:
    for line_num, line in enumerate(f, 1):
        try:
            #if line_num>num_test_recs:
            #    break
            data=line.split('\t')
            count=int(data[0])
            # Split "(date ,term)" on the first comma only, then delete every
            # parenthesis. str.replace is used instead of
            # str.translate(None, ...) because the latter only exists on
            # Python 2 byte strings.
            mytuple=[x.replace('(', '').replace(')', '').strip() for x in data[1].split(',', 1)]
            # Unpacking raises ValueError when there is no comma in the field.
            dt, term=mytuple
            ## check if date is correct
            if term not in stopword_set and len(dt)==10 and pattern.search(dt):
                dt_list = dt.split("-")
                # Cheap day index: (month-1)*31 + day. The year is ignored and
                # every month is treated as 31 days long, so this is only an
                # approximation of the day of the year.
                guess_date = (int(dt_list[1])-1)*31 + int(dt_list[2])
                data_for_df.append([guess_date, term, count])
        except (IndexError, ValueError):
            # IndexError: no tab-separated second field.
            # ValueError: non-integer count or a field without a comma.
            print("error on line {}: {}".format(line_num, line))
            continue

t1=time()
tot=(t1-t0)/60
print("time to run test data:{} minutes".format(tot))
print("estimated time to run total:{} minutes".format(tot*(15061489/num_test_recs) ))

In [None]:
## Turn the parsed rows into a DataFrame, drop exact duplicate rows, then add
## up the counts for every (dt, term) pair.
df = (
    pd.DataFrame(data_for_df, columns=['dt', 'term', 'count'])
    .drop_duplicates()
    .groupby(['dt', 'term'])['count']
    .sum()
    .reset_index()
)
print(df)

In [None]:
#pickle data for later use
#df.to_pickle('df.pkl')

In [None]:
## Structural overview of df: column labels, first five rows, shape, dtypes
for summary in (df.columns, df.head(n=5), df.shape, df.dtypes):
    print(summary)

In [None]:
# Distinct terms present in df, followed by how many there are
unique_terms = df['term'].unique()
for item in (unique_terms, len(unique_terms)):
    print(item)


In [None]:
# Total count per term, then a look at the terms whose total exceeds 500:
# how many there are, their totals, and the first five term labels.
term_count = df.groupby(['term'])['count'].sum()
above_500 = term_count[term_count > 500]
print(len(above_500))
print(above_500)
print(above_500.index[:5].values)

In [None]:
## Keep only the rows whose term has a total count above 500
## (original note: ~4000 unique terms, 1.3M rows of data)
kept_terms = term_count[term_count > 500].index.values
df2 = df[df['term'].isin(kept_terms)]
print(len(df2))
print(len(df2['term'].unique()))


In [None]:
## Count the rows each term has in df2 and order terms from most to fewest rows.
## NOTE: no filtering happens here; the "more than 30 days of data" cut-off from
## the original comment is not applied anywhere in this cell.
## (presumably df2 holds one row per (dt, term), making this a day count -- confirm)
## would be nice if this was consecutive days
# sort_values returns a new Series; the in-place Series.sort(ascending=...)
# used previously is not available in current pandas releases.
num_days_by_term=df2.groupby(['term']).size().sort_values(ascending=False)
print(num_days_by_term)



In [None]:
## Subset df2 to the leading term(s) of num_days_by_term.
## NOTE: the original comment mentions the top 100 terms (because of memory
## issues), but the slice below keeps only the first entry; widen it to
## include more terms.
df3=df2[df2.term.isin(num_days_by_term.iloc[:1].index.values)]
print(df3.shape)
# print() call rather than the Python 2 print statement, matching every other
# cell in this notebook and keeping the cell valid on Python 3.
print(df3)


In [None]:
# Display the first index label of num_days_by_term
# (presumably the term with the most rows, if the series was sorted in
# descending order beforehand -- confirm).
num_days_by_term.index[0]

In [None]:
## Fit one OLS trend model per term and keep every fitted result in `results`,
## keyed by the term's position in num_days_by_term.index.
## NOTE: despite the interaction-term formula, each fit only sees the rows of a
## single term, so the models are estimated one at a time, not jointly.
## (presumably C(term) adds no columns when only one term is present, leaving
## an intercept and a `dt` slope -- confirm against the fitted params)
import statsmodels.api as sm
import statsmodels.formula.api as smf
t0=time()
results={}
# Group once up front; the previous boolean mask re-scanned all of df2 for
# every term (terms x rows comparisons).
rows_by_term=df2.groupby('term')
for i, term_name in enumerate(num_days_by_term.index):
    # Same rows, in the same order, as df2[df2.term==term_name]
    df3=rows_by_term.get_group(term_name)
    results[i] = smf.ols(formula='count ~ C(term) + C(term)*dt', data=df3).fit()
t1=time()
print("running time:{}".format(t1-t0))

In [None]:
## Collect the fitted `dt` coefficient of every model and rank terms by it,
## largest coefficient first.
# Walk the result keys in sorted order so the coefficients line up with the
# term order in num_days_by_term.index regardless of dict iteration order.
a=[results[i].params['dt'] for i in sorted(results)]
b=num_days_by_term.index.values
# Materialise the pairs: a bare zip() is a one-shot iterator on Python 3.
c=pd.DataFrame(list(zip(b,a)), columns=['term', 'coef'])
# DataFrame.sort(...) is not available in current pandas; sort_values is the
# equivalent call.
c=c.sort_values(['coef'], ascending=False)
print(c)

In [None]:
#test plot
from matplotlib.backends.backend_pdf import PdfPages
def plotit(i, term):
    """Add a line chart of one term's counts as a new page of the open PDF.

    The data comes from the module-level ``df2`` and the page is written
    through the module-level ``pdf`` object, so this must be called while
    ``pdf`` is bound (e.g. inside the ``with PdfPages(...) as pdf`` block).

    i    -- position of the term in the caller's loop; not used in the body.
    term -- value compared against ``df2.term`` to select the rows to plot.
    """
    rows = df2[df2.term == term]
    series = pd.Series(data=rows['count'].values, index=rows['dt'])
    # A fresh figure per term; Series.plot draws onto the current figure.
    fig = plt.figure()
    ax = series.plot(title=term)
    ax.set_xlabel("day")
    ax.set_ylabel("count/day")
    pdf.savefig(fig)
    # Release the figure so long loops do not accumulate open figures.
    plt.close(fig)
# Write one page per term, for the first 200 rows of c in their current order,
# into a single multi-page PDF. plotit reads the name `pdf` bound here.
with PdfPages('multipage_pdf.pdf') as pdf:
    top_terms = c['term'].values[:200]
    for i in range(len(top_terms)):
        term = top_terms[i]
        plotit(i, term)