In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.stem import SnowballStemmer,PorterStemmer
from stemming import lovins,porter2,paicehusk
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import stop_words
import pprint
import re
import time
import spacy
import datetime
import requests
import BeautifulSoup
%matplotlib inline

We begin by loading in the scraped reddit information from json to df. We put these dfs into a list for simplicity.

Additionally, we convert the comment text using regex.

In [110]:
news=pd.read_json('../r_news.json')
worldnews=pd.read_json('../r_worldnews.json')
tech=pd.read_json('../r_technology.json')
stock=pd.read_json('../r_StockMarket.json')
politics=pd.read_json('../r_politics.json')
inthenews=pd.read_json('../r_inthenews.json')
futurology=pd.read_json('../r_Futurology.json')
dfs=[news,worldnews,tech,stock,politics,inthenews,futurology]
for df in dfs:
    for i in range(len(df.columns)):
        all_words=pprint.pformat(df.iloc[4,i])
        all_words=all_words.replace("\n"," ").replace("\\n"," ").replace("{u'comment': ","")\
                .replace(",","").replace("ucomment: ","")
        all_words=re.sub(r"[\[\]]+"," ",all_words)
        all_words=re.sub(r"\Wu\'id\': \w\'\w{7}\'","",all_words).replace("{","").replace("}","").replace("      "," ")\
                    .replace("\\\'","\'").replace("' u\'"," ").replace(" u\'","")
        all_words=re.sub(r"\\u20..","",all_words).replace("\'","").replace(' u"',"")
        all_words=re.sub(r"[A-Za-z]*[0-9]+[A-Za-z]*","",all_words).replace("*","")
        df.iloc[4,i]=all_words

Below is a word vectorizer that takes in a df and the row to vectorize. This function can go through multiple columns.

In [82]:
def vectorize_words(df,row_num):
    cvec = TfidfVectorizer()
    newtemp=pd.DataFrame()
    stoppers=stopwords.words('english')
    df_list=[]
    print "Starting Word Vectorization:",time.ctime()
    for i in range(len(df.columns)):
        all_words=df.iloc[row_num,i]
        if len(all_words)>5:
            new_words=[porter2.stem(word) for word in all_words.split(" ") if word not in stoppers]
            new_words=[word for word in new_words]# if word not in stoppers
            temp_string=""
            for word in new_words:
                temp_string+=word+" "
            temp=pd.DataFrame(cvec.fit_transform([temp_string]).todense(),columns=cvec.get_feature_names())
        else:
            temp=pd.DataFrame([1.0],columns=["TO_DELETE"])
        df_list.append(temp)
    print "Finished Word Vectorization:",time.ctime()
    print 
    print "Starting Concatenation:", time.ctime()
    newtemp=pd.concat(df_list,ignore_index=True)
    print "Finished Concatenation:", time.ctime()
    print
    return newtemp

Here is where we use the above defined function. We also add in a column called "Mean", which contains the mean occurence of any given word. 

We will use this columns to sort our data.

In [42]:
for i in range(len(dfs)):
    print "Starting dfs #%i\n"%(i+1)
    newtemp=vectorize_words(dfs[i],4)
    print "Finished dfs #%i\n"%(i+1)
    dfs[i]=newtemp.T.sort_values([0],ascending=False).T.copy()
    dfs[i]=dfs[i].fillna(0)
    cols_to_keep=[]
    j=0
    while j<len(dfs[i].columns):
        if max(dfs[i].iloc[:,j])>0.0052:
            cols_to_keep.append(dfs[i].columns[j])
        j+=1
    dfs[i]=dfs[i][cols_to_keep].T
    dfs[i]["Mean"]=dfs[i].T.mean()

Starting dfs #1

Starting Word Vectorization: Wed Nov 16 09:43:33 2016
Finished Word Vectorization: Wed Nov 16 09:46:08 2016

Starting Concatenation: Wed Nov 16 09:46:08 2016
Finished Concatenation: Wed Nov 16 09:54:20 2016

Finished dfs #1

Starting dfs #2

Starting Word Vectorization: Wed Nov 16 09:54:29 2016
Finished Word Vectorization: Wed Nov 16 09:56:51 2016

Starting Concatenation: Wed Nov 16 09:56:51 2016
Finished Concatenation: Wed Nov 16 10:06:10 2016

Finished dfs #2

Starting dfs #3

Starting Word Vectorization: Wed Nov 16 10:06:20 2016
Finished Word Vectorization: Wed Nov 16 10:09:01 2016

Starting Concatenation: Wed Nov 16 10:09:01 2016
Finished Concatenation: Wed Nov 16 10:16:47 2016

Finished dfs #3

Starting dfs #4

Starting Word Vectorization: Wed Nov 16 10:16:54 2016
Finished Word Vectorization: Wed Nov 16 10:17:01 2016

Starting Concatenation: Wed Nov 16 10:17:01 2016
Finished Concatenation: Wed Nov 16 10:18:13 2016

Finished dfs #4

Starting dfs #5

Starting Word V

Here, we sort the data.

In [77]:
new_dfs=[]
for df in dfs:
    df=df.sort_values("Mean",ascending=False).T
    new_dfs.append(df)

Here, we save our word vectorized dfs to variable names for simplicity.

In [79]:
news=new_dfs[0]
worldnews=new_dfs[1]
tech=new_dfs[2]
stock=new_dfs[3]
politics=new_dfs[4]
inthenews=new_dfs[5]
futurology=new_dfs[6]

Here we save our dfs to jsons for time saving. Getting them into a word vectorized state took 70 minutes; this process cuts the time down to just 10. 

In [85]:
news.to_json('../vectorized_r_news.json')
worldnews.to_json('../vectorized_r_worldnews.json')
tech.to_json('../vectorized_r_technology.json')
stock.to_json('../vectorized_r_StockMarket.json')
politics.to_json('../vectorized_r_politics.json')
inthenews.to_json('../vectorized_r_inthenews.json')
futurology.to_json('../vectorized_r_Futurology.json')

Here we load in the vectorized jsons and reformat them to how they were before we saved them as a json. This is a 10 minute process.

In [168]:
### THIS CODE LOADS THE JSONS OF THE VECTORIZED REDDIT COMMENTS, AND GETS IT TO ITS PRIOR USABLE STATE

jsons=[]
json_dfs=[]
print "Starting JSON Loading"
start_time=time.time()
for link in ['../vectorized_r_news.json','../vectorized_r_worldnews.json','../vectorized_r_technology.json',\
            '../vectorized_r_StockMarket.json','../vectorized_r_politics.json','../vectorized_r_inthenews.json',\
            '../vectorized_r_Futurology.json']:
    print "Loading JSON at Time: %s Minutes"%((time.time()-start_time)/60.)
    jsons.append(pd.read_json(link))
    
print "Finished JSON Loading"
print "Took: %s Minues"%((time.time()-start_time)/60.)
print
print "Starting DF reformatting"
start_time=time.time()
for i in range(len(jsons)):
    temp_df=jsons[i]
    temp_df=temp_df.T.sort_values("Mean",ascending=False)
    del temp_df["Mean"]
    new_cols=[]
    for col in temp_df.columns:
        new_cols.append(int(col))
    temp_df.columns=new_cols
    temp_df["Mean"]=temp_df.T.mean()
    temp_df=temp_df.sort_values("Mean",ascending=False).T.sort_index()
    json_dfs.append(temp_df)
print "Finished DF reformatting"
print "Took: %s Seconds"%(time.time()-start_time)

Starting JSON Loading
Loading JSON at Time: 2.08218892415e-06
Loading JSON at Time: 1.24375556707
Loading JSON at Time: 2.83682558139
Loading JSON at Time: 4.60352323055
Loading JSON at Time: 5.01098876397
Loading JSON at Time: 6.43430181742
Loading JSON at Time: 8.50360263189
Finished JSON Loading
Took: 9.68986428181

Starting DF reformatting
Finished DF reformatting
Took: 0.947946071625


In [166]:
json_dfs[0]

Unnamed: 0,like,get,peopl,the,it,would,that,dont,one,go,...,tabid,quadrants,moranius,kadno,bulkshit,mea,shoelac,caryophyllene,panasiuk,epperson
0,0.136128,0.153144,0.144636,0.148890,0.108477,0.119112,0.108477,0.116985,0.125493,0.102096,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.162488,0.195942,0.138593,0.169657,0.160099,0.095581,0.109919,0.086023,0.078855,0.114698,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.186811,0.117510,0.138602,0.099432,0.117510,0.105458,0.108471,0.153667,0.096419,0.099432,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.137787,0.127348,0.129436,0.104384,0.131524,0.125261,0.112735,0.164927,0.093946,0.125261,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.148811,0.059524,0.181278,0.181278,0.108226,0.143399,0.108226,0.086581,0.154222,0.078464,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.231894,0.157357,0.168399,0.140793,0.151835,0.121468,0.143553,0.096622,0.102144,0.107665,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.208964,0.226377,0.148016,0.153820,0.150918,0.104482,0.197354,0.089970,0.121895,0.121895,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.194210,0.139503,0.355596,0.114885,0.114885,0.164121,0.134032,0.144974,0.095737,0.123091,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.171185,0.171185,0.166150,0.188807,0.138459,0.098180,0.090628,0.090628,0.143494,0.146011,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.103985,0.163976,0.135980,0.133980,0.127981,0.131981,0.103985,0.091987,0.071989,0.065990,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [1]:
fed=pd.read_json('../fed_info.json')
del fed['f2015043']
del fed['f2015042']
cols=[]
for col in fed.columns:
    if col[0]!='f':
        name=col[:4]+'-'+col[4:6]+'-'+col[6:]
        cols.append(name)
    else: print col
cols=pd.to_datetime(cols)
fed.columns=cols
# We eliminated columns before the 119th because we want to focus in on a 1 year period. We state the end of the 
# period is the last date in our data set
fed=fed.iloc[:,119:]

NameError: name 'pd' is not defined

Group the documents by month for word vectorization

In [94]:
#'News' will be a compiled list with eache element containing every fed article for a given month
#'index_names' will be the corresponding month of each grouping of news; it will be used in the vectorized words matrix

news=[]
index_names=[]
for year in [2015,2016]:
    for month in range(1,13):
        news.append([])
        words=""
        state=True
        for col in fed.columns:
            if col.year==year:
                if col.month==month:
                    for paragraph in fed.loc[2,col]:
                        words+=paragraph+" "
                        if state:
                            index_names.append(str(year)+'-'+str(month))
                            state=False
        news[-1].append(words)
index_names.append("Mean")
news_df=[]
for stuff in news:
    if len(str(stuff))>4:
        news_df.append(stuff)
news_df=pd.DataFrame(news_df).T

In [95]:
news_df=vectorize_words(news_df,0)
news_df=news_df.T.sort_values([0],ascending=False).T.copy()
news_df=news_df.fillna(0)
cols_to_keep=[]
j=0
while j<len(news_df.columns):
    if max(news_df.iloc[:,j])>0.0052:
        cols_to_keep.append(news_df.columns[j])
    j+=1
news_df=news_df[cols_to_keep].T
news_df["Mean"]=news_df.T.mean()
news_df=news_df.sort_values("Mean",ascending=False).T
news_df.index=index_names

Starting Word Vectorization: Thu Nov 17 08:15:21 2016
Finished Word Vectorization: Thu Nov 17 08:15:22 2016

Starting Concatenation: Thu Nov 17 08:15:22 2016
Finished Concatenation: Thu Nov 17 08:15:24 2016



In [97]:
news_df

Unnamed: 0,the,feder,reserv,board,bank,financi,2016,for,committe,market,...,strengthened,capacities,acquisitions,generat,preced,actuari,goodland,keycorp,newport,zion
2015-11,0.390653,0.245553,0.200907,0.312523,0.156261,0.044646,0.033485,0.122777,0.055808,0.044646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-12,0.28853,0.271557,0.229126,0.246099,0.280043,0.093348,0.033945,0.118806,0.016972,0.033945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-1,0.331783,0.288973,0.27827,0.139135,0.246162,0.16054,0.181946,0.085622,0.11773,0.128432,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-2,0.418775,0.235561,0.209387,0.340255,0.157041,0.091607,0.130867,0.104694,0.052347,0.03926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-3,0.316727,0.345521,0.316727,0.165562,0.230347,0.165562,0.100777,0.093578,0.08638,0.08638,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-4,0.401434,0.347909,0.24086,0.178415,0.178415,0.11597,0.133811,0.080287,0.133811,0.089208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-5,0.359114,0.283511,0.24571,0.302412,0.056702,0.264611,0.122855,0.122855,0.037802,0.037802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-6,0.360858,0.301701,0.20705,0.147893,0.212966,0.195218,0.130146,0.076904,0.065073,0.053241,...,0.005916,0.005916,0.005916,0.005916,0.005916,0.005916,0.005916,0.005916,0.005916,0.005916
2016-7,0.394306,0.304691,0.17923,0.259883,0.161307,0.044807,0.107538,0.098576,0.134422,0.098576,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-8,0.368824,0.295059,0.239736,0.331942,0.175191,0.036882,0.119868,0.110647,0.036882,0.184412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
fed.head()

Unnamed: 0,2015-11-10 00:00:00,2015-11-12 00:00:00,2015-11-13 00:00:00,2015-11-16 00:00:00,2015-11-18 00:00:00,2015-11-19 00:00:00,2015-11-24 00:00:00,2015-11-25 00:00:00,2015-11-30 00:00:00,2015-12-02 00:00:00,...,2016-10-21 00:00:00,2016-10-24 00:00:00,2016-10-25 00:00:00,2016-10-27 00:00:00,2016-10-28 00:00:00,2016-10-31 00:00:00,2016-11-01 00:00:00,2016-11-02 00:00:00,2016-11-03 00:00:00,2016-11-10 00:00:00
0,Federal Reserve Board issues enforcement actions,Federal Reserve Board issues enforcement actio...,Agencies announce final EGRPRA outreach meeting,Federal Reserve Board announces approval of ap...,"Minutes of the Federal Open Market Committee, ...",Federal Reserve announces TDF test operation,Federal Reserve Board issues enforcement actio...,Federal Reserve Board approves final rule to m...,Federal Reserve Board approves final rule spec...,Federal Reserve announces College Fed Challeng...,...,Federal Reserve Board announces plans to enter...,Federal Reserve Board announces it has voted t...,Federal Reserve Board approves fee schedule fo...,Federal Reserve Board announces annual indexin...,Federal Reserve Board announces approval of ap...,Agencies request comment on proposed private f...,Federal Reserve Board issues enforcement actio...,Federal Reserve issues FOMC statement,Federal Reserve Board issues enforcement actio...,Federal Reserve Board announces approval of ap...
1,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...,https://www.federalreserve.gov/newsevents/pres...
2,[The Federal Reserve Board on Tuesday announce...,[The Federal Reserve Board on Thursday announc...,[The meeting is the sixth and final in a serie...,[The Federal Reserve Board on Monday announced...,[The Federal Reserve Board and the Federal Ope...,[The Federal Reserve plans to continue its pre...,[The Federal Reserve Board on Tuesday announce...,[The Federal Reserve Board on Wednesday approv...,[The Federal Reserve Board on Monday approved ...,[Pace University won the 12th annual national ...,...,[The Federal Reserve Board announced Friday th...,[The Federal Reserve Board announced on Monday...,[The Federal Reserve Board on Tuesday announce...,[The Federal Reserve Board on Thursday announc...,[The Federal Reserve Board on Friday announced...,[The federal flood insurance statutes require ...,[The Federal Reserve Board on Tuesday announce...,[Information received since the Federal Open M...,[The Federal Reserve Board on Thursday announc...,"[The Federal Reserve Board on November 10, 201..."


In [142]:
news.head().iloc[0,0]

u"A pilot who ordered an emergency evacuation after smoke was detected coming from one of the jet's engines is suing Allegiant Air for firing him."

In [5]:
# Get a list of all US companies

import requests
import BeautifulSoup
r=requests.get('https://en.wikipedia.org/wiki/List_of_companies_of_the_United_States')
soup=BeautifulSoup.BeautifulSoup(r.text)
soup=soup.findAll('div',{'class':"div-col columns column-width"})[1:26]
companies=[]
for i in range (len(soup)):
    companies.extend([x.text for x in soup[i].findAll('li')])

In [6]:
# List of some relevant terms
stock_terms=['inflation','fraud','fed','federal','funds','fund','rates','rate','ffr','demographics','liquidity',\
            'buyout','sentiment','sold','stock','options','contract','ib','investment','bank',"$",'thousand',\
            'banker','misled','mislead','crash','scammer','scamming','purchase','bought','bribes','accepted',\
             'investor','investors','scheme','global','commodity','commodities','tax','million','billion','trillion'\
            'oil','crude','gold','prices','metals','grain','cattle','soybeans','rice','ecb','central bank',\
            'bail','bailout','finance','refinance','debt','loans','debts','loan','repay','repaid','owe','owed',\
             'owes','dividend','pay','paid','bankrupt','bankruptcy','stimulus','trade','deal','corporate',\
            'price','exchange','nyse','bonds','bond','yield','stocks','trader','traders',\
            'embezzle','embezzled','stole','holder','holders','yielded','yields','economy','economic',\
            'monetary','money','usd','dollars','stimulus','insider','trading','defraud','defrauded','deceived',\
            'deception','chief','health','science','business','businessman','businessmen','shareholder','equity']
stock_terms.sort()

In [33]:
# Get government words
url='http://www.sparknotes.com/us-government-and-politics/glossary/terms.html'
r=requests.get(url)
soup=BeautifulSoup.BeautifulSoup(r.text)
poli_words=soup.findAll('meta',{"name":'keywords'})[0]['content'].split(", ")
p_words2=['president','secretary','department','state','nation','federal','judge','judicial','court','government',\
         'fcc','federal','commission']
poli_words.extend(p_words2)

In [8]:
# All government agencies in the US
gov_agencies=[]
baseurl="https://www.usa.gov/federal-agencies/"
for letter in 'abcdefghijklmnopqrstuvwxyz':
    url=baseurl+letter
    r=requests.get(url)
    soup=BeautifulSoup.BeautifulSoup(r.text)
    l=soup.findAll('ul', {'class':"one_column_bullet"})
    if len(str(l))>10:
        l=l[0].findAll('a', {'class':"url"})
        l=[x.text for x in l]
        gov_agencies.extend(l)

In [9]:
# Countries and Capital Cities
locations=[]
url='https://www.countries-ofthe-world.com/capitals-of-the-world.html'
r=requests.get(url)
soup=BeautifulSoup.BeautifulSoup(r.text)
locations=[x.findAll('td') for x in soup.findAll('tr',{'class':'grey'})]
locs=[]
for x in locations:
    locs.append([])
    for y in x:
        locs[-1].append(y.text)
locations=locs

In [10]:
# Cities and States of the US
locations=[]
url='https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
r=requests.get(url)
soup=BeautifulSoup.BeautifulSoup(r.text)
soup=soup.findAll('table')[3]
locs=[]
for a in soup.findAll('tr'):
    locs.append([])
    for b in a.findAll('th'):
        locs[-1].append(b.text)
    for b in a.findAll('td'):
        locs[-1].append(b.text)
cities=pd.DataFrame(locs).iloc[:,1].values
states=pd.DataFrame(locs).iloc[:,2].values
city=[]
for name in cities:
    city.append(name.split('[')[0])
state=[]
for name in states:
    state.append(name.split('[')[0])

In [11]:
words=pd.read_csv('../subjectivity of words.txt')
word_list=['type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative'.split(" ")]
words.columns=['col']
for row in words['col']:
    word_list.append(row.split(" "))
words=[]
for x in word_list:
    words.append([])
    for y in x:
        words[-1].append(y.split("=")[-1])
words=pd.DataFrame(words)
words.columns=['type','len','word','pos','stemmed','priorpolarity','N/A']

In [37]:
key_words=[['Companies',x] for x in companies]
key_words.extend([["Finance",x] for x in stock_terms])
key_words.extend([['Political',x] for x in poli_words])
key_words.extend([['Gov Agencies',x] for x in gov_agencies])
key_words.extend([["Countries and Capitals",x] for x in locations])
key_words.extend([['US Cities',x] for x in city])
key_words.extend([['US States',x] for x in states])
key_terms_df=pd.DataFrame(key_words)
key_terms_df.to_json('../key_terms.json')

In [69]:
temp=pd.read_json('../key_terms.json').reset_index()
temp['index']=temp['index'].astype(int)
temp=temp.sort_values('index')
temp.index=temp['index']
del temp['index']