In [None]:
import cPickle
import requests
import json
import csv
import copy
import pandas as pd

# County names come from the first row of the CSV; any further rows are ignored.
with open('NYCountyNames.csv', 'r') as f:
    csv_rows = list(csv.reader(f))
county_names = csv_rows[0]

# Search terms for which a pickle of tweets may exist per county.
q = [
    'feelthebern', 'sanders', 'berniesanders', 'bernie2016',
    'clinton', 'hillaryclinton', 'imwithher', 'hillary2016',
    'tedcruz', 'choosecruz', 'cruzcrew', 'unitewithcruz', 'cruz2016',
    'trump2016', 'donaldtrump', '#trump', 'donaldtrump2016',
    'kasichcan', 'johnkasich', 'kasich2016', 'kasich4us',
]

# big_dict[county][hashtag] -> unpickled contents of
# Tweets/<county>/<hashtag>FULL.p. Pairs whose file cannot be opened simply
# have no entry.
big_dict = {}
for county in county_names:
    tweets_by_hashtag = big_dict[county] = {}
    for hashtag in q:
        try:
            pickle_path = ''.join(['Tweets/', county, '/', hashtag, 'FULL.p'])
            with open(pickle_path, 'rb') as f:
                tweets_by_hashtag[hashtag] = cPickle.load(f)
        except IOError:
            # No readable pickle for this county/hashtag pair: leave the key absent.
            pass

In [None]:
# Candidates to their associated hashtags.
# Every value is a list of search terms that also appears in the `q` list.
hashtag_dict = {
    'Bernie Sanders': [
        'feelthebern',
        'sanders',
        'berniesanders',
        'bernie2016',
    ],
    'Hillary Clinton': [
        'clinton',
        'hillaryclinton',
        'imwithher',
        'hillary2016',
    ],
    'Ted Cruz': [
        'tedcruz',
        'choosecruz',
        'cruzcrew',
        'unitewithcruz',
        'cruz2016',
    ],
    'Donald Trump': [
        'trump2016',
        'donaldtrump',
        '#trump',
        'donaldtrump2016',
    ],
    'John Kasich': [
        'kasichcan',
        'johnkasich',
        'kasich2016',
        'kasich4us',
    ],
}

In [None]:
# Remove retweets and not-geocoded tweets.
# The filtering runs on a deep copy, so big_dict keeps the unfiltered statuses.
no_retweets = copy.deepcopy(big_dict)
for county in county_names:
    county_tweets = no_retweets[county]
    for hashtag in county_tweets.keys():
        kept = []
        for status in county_tweets[hashtag]:
            # A status carrying 'retweeted_status' is a retweet: drop it.
            if 'retweeted_status' in status:
                continue
            # Keep only statuses whose 'coordinates' value is truthy.
            if status['coordinates']:
                kept.append(status)
        county_tweets[hashtag] = kept

In [None]:
# Returns county given latitude and longitude
def findCounty(lat, lon):
    """Reverse-geocode a point to a county name with the Google Geocoding API.

    Args:
        lat: Latitude of the point (anything ``str()`` renders as a number).
        lon: Longitude of the point.

    Returns:
        The ``long_name`` of the first ``administrative_area_level_2`` address
        component found in the response, with the substring ' County'
        removed, or '' when no result carries such a component.

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or a non-2xx HTTP response.
        RuntimeError: when the API reports a status other than 'OK' or
            'ZERO_RESULTS' (e.g. quota exhausted or request denied).
            Returning '' in that case would make callers treat the point as
            lying outside every known county.
    """
    # The geocoding endpoint is a plain GET URL; let requests build and encode
    # the query string, and bound the wait so a stalled lookup cannot hang a
    # long loop of lookups.
    response = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        params={'sensor': 'false', 'latlng': str(lat) + ',' + str(lon)},
        timeout=10
    )
    response.raise_for_status()
    payload = response.json()

    # A payload without a 'status' field is treated as successful.
    api_status = payload.get('status', 'OK')
    if api_status not in ('OK', 'ZERO_RESULTS'):
        raise RuntimeError(
            'Geocoding lookup failed for ' + str(lat) + ',' + str(lon) + ': ' + str(api_status)
        )

    for result in payload.get('results', []):
        for comp in result['address_components']:
            if 'administrative_area_level_2' in comp['types']:
                return comp['long_name'].replace(' County', '')
    return ''

In [None]:
# Bookkeeping dictionaries, each keyed by county and then by hashtag:
#   removed - statuses taken out of the county they were collected under
#   actual  - statuses that geocode to this county but were collected elsewhere
#   problem - initialised with empty lists; nothing in this cell fills it
removed = dict()
actual = dict()
problem = dict()

for county in county_names:
    removed[county] = dict()
    actual[county] = dict()
    problem[county] = dict()
    for hashtag in q:
        removed[county][hashtag] = []
        actual[county][hashtag] = []
        problem[county][hashtag] = []

# Remove statuses whose counties are incorrect, then add them to the correct county if applicable
for county in county_names:
    print(county)
    for hashtag in no_retweets[county].keys():
        statuses = no_retweets[county][hashtag]
        # Collect the survivors in a separate list: removing from `statuses`
        # while iterating over it would skip the element after every removal,
        # leaving those statuses unchecked.
        kept = []
        for status in statuses:
            coords = status['coordinates']['coordinates']
            # coords[1] is passed as latitude and coords[0] as longitude.
            actual_county = findCounty(coords[1], coords[0])
            if actual_county == county:
                kept.append(status)
                continue
            removed[county][hashtag].append(status)
            if actual_county in county_names:
                # Belongs to another known county: queue it for that county.
                actual[actual_county][hashtag].append(status)
            else:
                print(str(coords) + ' is not in New York')
        # Update the list in place so `no_retweets` sees the filtered statuses.
        statuses[:] = kept

# Second pass: hand the relocated statuses to the county they geocode to.
for county in county_names:
    for hashtag in actual[county].keys():
        relocated = actual[county][hashtag]
        if not relocated:
            continue
        # The destination county may have had no list for this hashtag at all.
        destination = no_retweets[county].setdefault(hashtag, [])
        for status in relocated:
            if status not in destination:
                destination.append(status)

In [None]:
# Interesting features:
# Average favorites for a status with a certain hashtag
# Unique posters for statuses with certain hashtags
# Average number of retweets for a status with a certain hashtag
# Average number of followers for the posters of statuses with certain hashtags
# Total posts about certain hashtags
#
# Features are aggregated per county and per candidate over all of the
# candidate's hashtags. feature_dict[county] maps '<candidate> <feature name>'
# to its value, and feature_df holds one row per county.

feature_dict = dict()

for county in county_names:
    feature_dict[county] = dict()
    for candidate in hashtag_dict.keys():
        total_favorites = 0
        posters = set()
        total_retweets = 0
        total_followers = 0
        total_posts = 0
        for hashtag in hashtag_dict[candidate]:
            statuses = no_retweets[county].get(hashtag)
            if not statuses:
                continue
            total_posts += len(statuses)
            for status in statuses:
                total_favorites += status['favorite_count']
                total_retweets += status['retweet_count']
                poster = status['user']
                # Count each poster's followers once, on the first status seen.
                if poster['id'] not in posters:
                    total_followers += poster['followers_count']
                    posters.add(poster['id'])
        # float() forces true division: under Python 2 int / int floors, which
        # would truncate every average to a whole number.
        feature_dict[county][candidate + ' Average Favorite Count'] = float(total_favorites) / total_posts if total_posts else 0
        feature_dict[county][candidate + ' Unique Posters'] = len(posters)
        feature_dict[county][candidate + ' Average Retweet Count'] = float(total_retweets) / total_posts if total_posts else 0
        feature_dict[county][candidate + ' Average Follower Count'] = float(total_followers) / len(posters) if posters else 0
        feature_dict[county][candidate + ' Total Post Count'] = total_posts
feature_df = pd.DataFrame.from_dict(feature_dict, orient='index')
feature_df

In [None]:
# Add sentiment features
#
# For every candidate, add two columns to feature_df: the proportion of the
# candidate's posts in each county counted as positive and as negative.

# NOTE(review): the app id embeds a personal e-mail address in the notebook;
# consider loading it from configuration before sharing.
url = 'http://www.sentiment140.com/api/bulkClassifyJson?appid=sbalanovich@college.harvard.edu'

# Assume these hashtags are positive: statuses found under them are counted as
# positive without being sent to the classifier.
positive_hashtags = ['feelthebern', 'imwithher', 'choosecruz', 'cruzcrew', 'unitewithcruz', 'kasich4us', 'kasichcan']

for candidate in hashtag_dict.keys():
    total_col = candidate + ' Total Post Count'
    pos_dict = {county: 0 for county in county_names}
    neg_dict = {county: 0 for county in county_names}
    for county in county_names:
        for hashtag in hashtag_dict[candidate]:
            statuses = no_retweets[county].get(hashtag)
            if not statuses:
                continue
            if hashtag in positive_hashtags:
                pos_dict[county] += len(statuses)
                continue
            tweet_data = [{'text': status['text'],
                           'query': candidate,
                           'id': status['id']} for status in statuses]
            r = requests.post(url, data=json.dumps({'data': tweet_data}), timeout=60)
            # Decode the response once and reuse it for both tallies.
            results = r.json()['data']
            # Polarity 0 is tallied as negative and 4 as positive; any other
            # value is counted in neither.
            neg_dict[county] += sum(1 for result in results if result['polarity'] == 0)
            pos_dict[county] += sum(1 for result in results if result['polarity'] == 4)
    # Turn the raw counts into proportions of the candidate's total posts in
    # each county. Counties with no posts keep their count (0) unchanged.
    for county in list(pos_dict):
        total = feature_df.loc[county, total_col]
        if total:
            pos_dict[county] = float(pos_dict[county]) / float(total)
            neg_dict[county] = float(neg_dict[county]) / float(total)
    # A Series keyed by county aligns with feature_df's index on assignment.
    feature_df[candidate + ' Positive Tweet Proportion'] = pd.Series(pos_dict)
    feature_df[candidate + ' Negative Tweet Proportion'] = pd.Series(neg_dict)
feature_df.head()

In [None]:
# Pickle for future use
def _pickle_to(path, obj):
    """Write `obj` to `path` with cPickle, replacing any existing file."""
    with open(path, 'wb') as out:
        cPickle.dump(obj, out)


# The cleaned statuses first, then the feature table.
_pickle_to('cleaned_no_rt.p', no_retweets)
_pickle_to('clean_feature_df.p', feature_df)