In [1]:
import json
import os
import re
import string

import branca
import folium
import numpy as np
import pandas as pd
import pymongo
from bson import json_util, ObjectId
from pandas import json_normalize
from pymongo import MongoClient
from tqdm import tqdm

# Do not truncate long string values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
# Helper for loading query results, then MongoDB client setup

def mongo_to_dataframe(mongo_data):
    """Convert MongoDB query results into a flat pandas DataFrame.

    Parameters
    ----------
    mongo_data
        Documents to convert, e.g. the cursor returned by ``collection.find``.

    Returns
    -------
    pandas.DataFrame
        The result of ``json_normalize`` on the documents; nested fields end
        up in dotted columns such as ``place.full_name``.
    """
    # Round-trip through bson's JSON serializer so the documents become
    # plain JSON-compatible Python objects before normalizing.
    plain_records = json.loads(json_util.dumps(mongo_data))
    return pd.DataFrame(json_normalize(plain_records))
  
    
# Connect to the local MongoDB server and pick the collection holding the tweets.
client = MongoClient(host="localhost", port=27017)
db = client["TwitterData"]
collection = db["Tweets"]

In [None]:
# Match tweets whose place has country code "US"; fetch only the tweet text
# and the two place fields used below.
query = {"place.country_code": "US"}
projection = {
    "text": 1,
    "place.place_type": 1,
    "place.full_name": 1,
    "_id": 0,
}

# Get data from Mongo to Pandas
cursor = collection.find(filter=query, projection=projection)
df = mongo_to_dataframe(cursor)
len(df)

In [None]:
df.head()

In [None]:
# Get state in a separate column: keep whatever follows the last ", " in the
# place name (the whole value if there is no ", ").
df["state"] = df["place.full_name"].map(
    lambda full_name: str(full_name).rsplit(", ", 1)[-1]
)

In [None]:
# Full state / territory name -> two-letter abbreviation.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
# Reverse lookup: two-letter abbreviation -> full name.
abbrev_us_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}
# Values accepted in the "state" column: every abbreviation plus the "USA"
# placeholder that convert_states() resolves.
state_abbrevs = ["USA"] + list(abbrev_us_state)


def convert_states(x):
    """Resolve the ``state`` value of one row to a state abbreviation.

    Parameters
    ----------
    x
        A row (mapping or pandas Series) with the keys ``"state"`` and
        ``"place.full_name"``.

    Returns
    -------
    str or None
        ``x["state"]`` unchanged unless it equals ``"USA"``. In that case the
        text before the first ``", "`` of ``x["place.full_name"]`` is looked
        up as a full state name (``"Texas, USA"`` -> ``"TX"``); ``None`` is
        returned when that name is unknown or the place name is missing or
        not a string.
    """
    if x["state"] != "USA":
        return x["state"]
    try:
        return us_state_abbrev[x["place.full_name"].split(", ")[0]]
    except (KeyError, AttributeError):
        # KeyError: unknown state name or missing field;
        # AttributeError: place name is not a string (e.g. NaN / None).
        # Anything else (such as KeyboardInterrupt) is allowed to propagate.
        return None

In [None]:
# Resolve "USA" placeholders row by row, then keep only rows whose state is
# one of the accepted values.
df["state"] = df.apply(convert_states, axis=1)
df = df[df["state"].isin(state_abbrevs)]

In [None]:
df.head()

In [None]:
# Number of tweets per state, largest first, as a one-column frame.
tweets_by_states = (
    df.groupby(["state"])
    .size()
    .sort_values(ascending=False)
    .to_frame("tweet_count")
)
tweets_by_states.head()

In [None]:
# Load population of states CSV
pop = pd.read_csv("data/nst-est2019-alldata.csv")
# Keep only rows whose NAME is a known state/territory, select and rename the
# two columns in a single .loc step, and take an explicit copy so the column
# assignment below never writes into a slice of the raw frame.
pop = (
    pop.loc[pop["NAME"].isin(list(us_state_abbrev)), ["NAME", "POPESTIMATE2016"]]
    .rename(columns={"NAME": "state_name", "POPESTIMATE2016": "pop_estimate_2016"})
    .copy()
)
# Every remaining name is a key of us_state_abbrev, so map() finds them all.
pop["state"] = pop["state_name"].map(us_state_abbrev)
pop.head()

In [None]:
# Attach the population estimate to each state's tweet count.
tweets_by_states = pd.merge(tweets_by_states, pop, on="state")

In [None]:
# Tweets per inhabitant; plain column division instead of a row-wise apply.
tweets_by_states["tweet_per_capita"] = (
    tweets_by_states["tweet_count"] / tweets_by_states["pop_estimate_2016"]
)
tweets_by_states.head()

In [None]:
# Load election turnouts for the states
turn = pd.read_csv("data/2016_november_general_election__turnout_rates.csv", sep=";")
# Skip the first data row (presumably a nationwide total - TODO confirm) and
# copy, so the column assignment below does not write into a slice.
turn = turn[["State Abv", "Highest Office", "VEP Highest Office"]].iloc[1:].copy()
turn.columns = ["state", "highest_office", "turnout_percent"]
# Counts are stored with thousands separators ("1,234,567"); strip and parse.
turn["highest_office"] = (
    turn["highest_office"].str.replace(",", "", regex=False).astype("int64")
)
turn.head()

In [None]:
# Attach the turnout figures to the per-state table.
tweets_by_states = pd.merge(tweets_by_states, turn, on="state")
tweets_by_states.head()

In [None]:
# Turnout as an integer percentage: drop the "%" sign and parse.
tweets_by_states["turnout"] = (
    tweets_by_states["turnout_percent"].str.replace("%", "", regex=False).astype("int64")
)
# Tweets per vote cast for the highest office; column-wise instead of row-wise.
tweets_by_states["tweet_per_vote"] = (
    tweets_by_states["tweet_count"] / tweets_by_states["highest_office"].astype("int64")
)
tweets_by_states#.to_csv("tweets_by_states.csv")

In [None]:
# Per-capita summary indexed by state abbreviation, shown highest ratio first.
tweets_by_state = tweets_by_states[
    ["state", "state_name", "tweet_count", "pop_estimate_2016", "tweet_per_capita"]
].set_index("state")
tweets_by_state.sort_values(by=["tweet_per_capita"], ascending=False)

In [None]:
# Choropleth map of tweets per capita by state.
state_geo = os.path.join('data', 'us-states.json')
state_data = tweets_by_states[["state", "tweet_per_capita"]].copy()
# Scale the ratio by 100 so the values match the "%" in the legend.
state_data["tweet_per_capita"] = state_data["tweet_per_capita"] * 100

# Initialize the map:
m = folium.Map(location=[37, -102], zoom_start=5)

# Add the color for the choropleth. folium.Choropleth replaces the deprecated
# Map.choropleth() method and takes the same arguments.
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['state', 'tweet_per_capita'],
    key_on='feature.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Tweets per capita (%)'
).add_to(m)
folium.LayerControl().add_to(m)

# Save to html
m.save('map.html')
m


In [None]:
# Choropleth map of the 2016 general election turnout by state.
state_geo = os.path.join('data', 'us-states.json')
state_data = tweets_by_states[["state", "turnout"]].copy()

# Initialize the map:
m = folium.Map(location=[37, -102], zoom_start=5)

# Add the color for the choropleth. folium.Choropleth replaces the deprecated
# Map.choropleth() method and takes the same arguments.
folium.Choropleth(
    geo_data=state_geo,
    name='choropleth',
    data=state_data,
    columns=['state', 'turnout'],
    key_on='feature.id',
    fill_color='OrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='2016 general election turnout (%)'
).add_to(m)
folium.LayerControl().add_to(m)

# Save to html
#m.save('map_turnout.html')
m