In [1]:
import pandas as pd
import numpy as np
import json
from polyglot.text import Text
from polyglot.downloader import downloader
from textblob import TextBlob
import nltk
import timestring
from datetime import datetime

In [3]:
# tweets=pd.read_csv('datasets/pres_tweets.csv')
tweets = pd.read_csv('datasets/speeches/2016_all_entities.csv')

In [12]:
def get_entity_sentiment(e):
    try:
        pos=e.positive_sentiment
        neg=e.negative_sentiment
        return pos,neg
    except:
        return '',''
def get_sentiment(t):
    blob = TextBlob(str(t).decode('utf-8'))
    return map(lambda s:s.sentiment.polarity,blob.sentences)
def get_noun_phrases(t):
    blob = TextBlob(str(t).decode('utf-8'))
    return list(blob.noun_phrases)
def map_entity(e):
    pos,neg=get_entity_sentiment(e)
    return {'tag':e.tag,'entity':" ".join(e),'pos':pos,'neg':neg}
def get_entites(text):
    try:
        polyglot_text=Text(text)
        entities=polyglot_text.entities
        return map(map_entity,entities)
    except:
        return []
     
def map_to_data(tweet):
    user_name=tweet['user']['name']
    text=tweet['text']
    return {'text':text,'user':user_name,'entities':get_entites(text),'sentiment':get_sentiment(text),'noun_phrases':get_noun_phrases(text)}

In [5]:
# tweets['entities']=tweets['body'].map(get_entites)

In [6]:
tweets.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'url', u'Date', u'Title', u'Link',
       u'Actor', u'Texts', u'entities'],
      dtype='object')

In [20]:
tweets['Date_parsed'] = tweets.Date.apply(lambda x: datetime.strptime(x, '%B %d, %Y'))

In [22]:
tweets.Date_parsed.head()

0   2015-04-12
1   2015-04-14
2   2015-04-20
3   2015-04-29
4   2015-05-05
Name: Date_parsed, dtype: datetime64[ns]

In [16]:
tweets.Date.head()

0    April 12, 2015
1    April 14, 2015
2    April 20, 2015
3    April 29, 2015
4       May 5, 2015
Name: Date, dtype: object

In [24]:
tweets['displayName'] = tweets.Actor

In [7]:
body_col = 'Texts'

In [11]:
tweets['sentiment']=tweets[body_col].map(get_sentiment)

In [13]:
tweets['noun_phrases']=tweets[body_col].map(get_noun_phrases)

In [23]:
tweets['mean_sentiment']=tweets['sentiment'].map(lambda s:reduce(lambda x, y: x + y, s) / len(s))
tweets['postedTime_datetime']=tweets.Date_parsed.map(lambda x:pd.to_datetime(str(timestring.Date(x))))
tweets['postedTime_datetime']=pd.DatetimeIndex(tweets.Date_parsed)

In [25]:
tweets['displayName']=tweets['displayName'].replace('Senator Ted Cruz','Ted Cruz')
tweets['day']=tweets['postedTime_datetime'].map(lambda x:x.date())
tweets['week']=tweets['day'].map(lambda x:x.isocalendar()[1])

In [26]:
tweets.to_csv('datasets/speech_with_desc.csv')

In [36]:
dataToCorelation={}
candidates=['Bernie Sanders', 'Donald Trump', 'Hillary Clinton']
for candidate in candidates:
    candidateTweets=tweets[tweets['displayName']==candidate]
    
    dataToCorelation[candidate]=candidateTweets[['mean_sentiment','day']].groupby(['day']).mean().reset_index()

In [39]:
dataToCorelation

{'Bernie Sanders':             day  mean_sentiment  Bernie Sanders
 0    2015-04-30        0.078392        0.078392
 1    2015-05-01        0.129071        0.129071
 2    2015-05-02        0.082386        0.082386
 3    2015-05-10        0.071941        0.071941
 4    2015-05-14        0.043780        0.043780
 5    2015-05-15        0.138889        0.138889
 6    2015-05-19        0.106782        0.106782
 7    2015-05-20        0.073246        0.073246
 8    2015-05-22        0.000000        0.000000
 9    2015-05-26        0.057375        0.057375
 10   2015-05-28        0.161668        0.161668
 11   2015-05-29        0.179001        0.179001
 12   2015-05-30        0.109498        0.109498
 13   2015-05-31        0.098294        0.098294
 14   2015-06-01        0.055195        0.055195
 15   2015-06-02       -0.039299       -0.039299
 16   2015-06-06        0.136257        0.136257
 17   2015-06-08        0.160977        0.160977
 18   2015-06-13        0.148750        0.148750
 1

In [37]:
cols=[]
headers=[]
dates=[]
for k,v in dataToCorelation.iteritems():
    v[k]=v['mean_sentiment']
    dates.append(list(v['day']))
    cols.append(list(v[k]))
    headers.append(k+' speeches')

In [38]:
import pickle
pickle.dump((cols,headers,dates),open('corelation_speeches.p','wb'))

In [None]:
cand_col = ''

In [27]:
candidates=set(tweets['displayName'])
candidates

{'Bernie Sanders', 'Donald Trump', 'Hillary Clinton'}

In [29]:
data={}
for candidate in candidates:
    candidateTweets=tweets[tweets['displayName']==candidate]
    sentimentData=candidateTweets[['postedTime_datetime','mean_sentiment','day','week']]
    data[candidate]=sentimentData
#     sentimentData.set_index(sentimentData['postedTime_datetime'],inplace=True)
#     sentimentData=sentimentData.groupby(pd.TimeGrouper("D")).mean()
#     print candidate+': '+str(sentimentData['mean_sentiment'].std())
    sentimentData.to_csv('speech_sentiment/'+candidate+'.csv')
    

Unnamed: 0,postedTime_datetime,mean_sentiment,day,week,postedTime_datetime.1,mean_sentiment.1,day.1,week.1,postedTime_datetime.2,mean_sentiment.2,day.2,week.2,postedTime_datetime.3,mean_sentiment.3,day.3,week.3,postedTime_datetime.4,mean_sentiment.4,day.4,week.4
0,NaT,,,,NaT,,,,2015-06-04 23:32:23,0.000000,2015-06-04,23.0,NaT,,,,NaT,,,
1,NaT,,,,NaT,,,,2015-06-04 23:06:04,0.200000,2015-06-04,23.0,NaT,,,,NaT,,,
2,NaT,,,,2015-06-04 22:47:32,0.422222,2015-06-04,23.0,NaT,,,,NaT,,,,NaT,,,
3,NaT,,,,2015-06-04 22:44:17,0.103125,2015-06-04,23.0,NaT,,,,NaT,,,,NaT,,,
4,NaT,,,,NaT,,,,NaT,,,,2015-06-04 22:41:33,0.300000,2015-06-04,23.0,NaT,,,
5,NaT,,,,NaT,,,,NaT,,,,NaT,,,,2015-06-04 22:35:37,0.30,2015-06-04,23.0
6,NaT,,,,NaT,,,,NaT,,,,NaT,,,,2015-06-04 22:32:26,0.25,2015-06-04,23.0
7,NaT,,,,2015-06-04 22:28:37,0.083333,2015-06-04,23.0,NaT,,,,NaT,,,,NaT,,,
8,NaT,,,,NaT,,,,2015-06-04 22:05:05,0.400000,2015-06-04,23.0,NaT,,,,NaT,,,
9,2015-06-04 21:53:05,0.800000,2015-06-04,23.0,NaT,,,,NaT,,,,NaT,,,,NaT,,,


In [95]:
downloader.download('ner2.sco',download_dir='/root/polyglot_data')

[polyglot_data] Error loading ner2.sco: Package u'ner2.sco' not found
[polyglot_data]     in index


False

NameError: name 'tweet' is not defined

In [139]:
entities_count = {}

In [126]:
# {for k, v tweets['entities']}

In [140]:
for row_id, tweet in tweets.iterrows():
    for ent in tweet['entities']:
        e = ent['entity']
        if e not in entities_count:
            entities_count.update({e: 1})
        else:
            entities_count[e] += 1

In [181]:
entities_count = {k.strip().encode('ascii','ignore'):v for k, v in entities_count.iteritems() if len(k) > 2 and v > 4}

In [182]:
entities_count

{': New Hampshire': 6,
 'ABC': 47,
 'ACA': 6,
 'AMERICA': 32,
 'AMERICA SAFE': 5,
 'Aberdeen': 6,
 'Adam': 16,
 'Affordable': 12,
 'Afghanistan': 10,
 'Aiken': 5,
 'Alabama': 34,
 'Alaska': 10,
 'Alberto Gonzales': 5,
 'America': 757,
 "America's": 7,
 'American': 60,
 'American Airlines Center': 5,
 'Ames': 16,
 'Ames , Iowa': 6,
 'Amir Hekmati': 5,
 'Amnesty': 6,
 'Anderson': 8,
 'Antonin Scalia': 7,
 'Apple': 7,
 'Arizona': 49,
 'Arkansas': 12,
 'Arnold': 5,
 'Assad': 6,
 'Atlanta': 24,
 'Austin': 5,
 'BET': 8,
 'BUSH': 7,
 'Baltimore': 11,
 'Barack Obama': 18,
 'Barbara Walters': 5,
 'Beijing': 8,
 'Ben': 11,
 'Ben Carson': 22,
 'Benghazi': 17,
 'Bern': 31,
 'Bernie': 314,
 'Bernie Sanders': 331,
 "Bernie Sanders's": 7,
 'BernieSanders': 25,
 'Biden': 9,
 'Big Pharma': 9,
 'Bill': 12,
 'Bill Clinton': 8,
 "Bill O'Reilly": 9,
 'Birmingham': 8,
 'Bloomberg': 8,
 'Bluffs , Iowa': 5,
 'Bob': 8,
 'Bob Vander Plaats': 7,
 'Bobby Knight': 7,
 'Boehner': 7,
 'Boston': 17,
 'Boulder': 5,
 '

In [183]:
len(entities_count)

689

In [136]:
for row_id, tweet in tweets.iterrows():
    for ent in tweet['entities']:
        if ent['entity'] not in entities:
            entities[ent['entity']].update({})

In [148]:
!pip2 install wordcloud

Collecting wordcloud
  Using cached wordcloud-1.2.1.tar.gz
Building wheels for collected packages: wordcloud
  Running setup.py bdist_wheel for wordcloud ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/29/9a/a9/86dcbbd5a7b6ace25887e4351a0136ea6dfcc0dd7de0a51357
Successfully built wordcloud
Installing collected packages: wordcloud
Successfully installed wordcloud-1.2.1
[33mYou are using pip version 8.1.0, however version 8.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [149]:
from wordcloud import WordCloud

In [184]:
wordcloud = WordCloud(background_color="white").generate_from_frequencies(entities_count)

ValueError: could not convert string to float: y

In [None]:
# def draw_wc_filtered(df, top_n=10, save=False, agg_col=u'Quantity'): 
#     df = pd.DataFrame(df.groupby(u'Group').sum()[agg_col]).sort(
#         agg_col, ascending=False).ix[:top_n].reset_index()
#     df.Group = df.Group.apply(lambda x: x.split(' - ')[1])
#     counts = [((x[1]['Group'].decode('utf-8'), x[1][agg_col])) for x in df.iterrows()]
# #     print counts
wordcloud = WordCloud(background_color="white").generate_from_frequencies(counts)
plt.figure(figsize=(9, 12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#     if save:
#         plt.savefig('wc/{}.png'.format(save))