# Drone Crashmas

In [1]:
import networkx as nx
from pymongo import MongoClient
import pandas as pd
from pandas.io.json import json_normalize
import seaborn as sns
import json
from urllib.parse import urlsplit
from urllib.parse import urlparse
import requests
import http.client
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
# Connect to the MongoDB server on localhost, then pick the twitterdb
# database and its droneCrashmas2019 collection.
client = MongoClient('mongodb://localhost:27017')
db = client["twitterdb"]
collection = db["droneCrashmas2019"]

In [3]:
# $project stage: keep only the listed fields and replace the string-valued
# created_at with the date parsed from it.
created_at_as_date = {'$dateFromString': {'dateString': '$created_at'}}
projection = {
    '$project': {
        '_id': 0,
        'id': 1,
        'created_at': created_at_as_date,
        'user': 1,
        'entities': 1,
        'lang': 1,
        'text': 1,
        'retweeted_status': 1,
    },
}

# $match stage: keep documents created at or before 2020-01-01 00:00:00 UTC.
cutoff_utc = dt.datetime(2020, 1, 1, tzinfo=dt.timezone.utc)
match = {'$match': {'created_at': {'$lte': cutoff_utc}}}

In [4]:
# Run the pipeline (project first, then match on the converted date) and load
# every returned document into a DataFrame.
pipeline = [projection, match]
cursor = collection.aggregate(pipeline)
df = pd.DataFrame(list(cursor))

In [None]:
# Label the naive timestamps as UTC, then express them in US Eastern time so
# the hour / weekday buckets below are in local time.
df['created_at'] = df['created_at'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

# Keep tweets from 21 through 31 December 2019 (both bounds inclusive).
# NOTE(review): the $match stage cuts off at 2020-01-01 00:00 UTC, which is
# 19:00 Eastern on 31 Dec, so the last five hours of this window are empty -
# confirm that is intended.
in_window = df['created_at'].between('2019-12-21 00:00:00', '2019-12-31 23:59:59')
# .copy() gives an independent frame so the column assignments below do not
# operate on a slice of the original (avoids SettingWithCopyWarning).
df = df[in_window].copy()

df['hour'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.dayofweek  # Monday=0 ... Sunday=6
df.shape

In [None]:
# Lower-case the tweet text so the keyword patterns only need lower-case terms.
df['text'] = df['text'].str.lower()

# Discard every tweet whose text mentions one of the excluded terms; rows with
# missing text count as "no mention" (na=False) and are therefore kept.
excluded_terms = 'iraq|afghanistan|libya|syria|airstrike|soleimani|suleimani|qaeda|terrorist'
has_excluded_term = df['text'].str.contains(excluded_terms, na=False)
df = df[~has_excluded_term]
df.head()

In [None]:
# Fold both spellings of the British-English code into plain 'en', then take
# the English-only subset.
df['lang'] = df['lang'].replace({'en-gb': 'en', 'en-GB': 'en'})
is_english = df['lang'] == 'en'
df_en = df.loc[is_english]

In [None]:
# Select tweets whose (lower-cased) text contains any crash-story keyword.
# NOTE(review): this is a plain substring match, so 'crash' also matches inside
# 'dronecrashmas' and 'cat' inside 'location' - confirm this breadth is intended.
# NOTE(review): na=True also keeps rows whose text is missing - confirm.
crash_terms = 'dad|brother|hair|crash|dog|cat|roof|mom|mother|grandmother|grandma|sister|grandfather|grandpa|tree|lake|pond|ocean|lost|crashed|broken|broke|vanished|uncle'
crashmas = df[df['text'].str.contains(crash_terms, na=True)].reset_index()

# English tweets only; .copy() so the column assignment below works on an
# independent frame rather than on a slice.
crashmas = crashmas.loc[crashmas['lang'] == 'en'].copy()

# Read the screen name straight from each tweet's embedded user document, so
# this cell does not depend on a df['screen_name'] column built by another cell.
crashmas['screen_name'] = crashmas['user'].map(
    lambda user: user.get('screen_name') if isinstance(user, dict) else None
)

# Drop tweets from the two excluded accounts. Twitter handles are
# case-insensitive, hence case=False; rows without a screen name are kept.
from_excluded_account = crashmas['screen_name'].str.contains(
    'mountainherder|faineg', case=False, na=False
)
crashmas = crashmas[~from_excluded_account]
crashmas.shape

In [None]:
import matplotlib.dates as mdates

# Count tweets per exact timestamp, then sum those counts into hourly bins
# (hours without tweets become 0).
crashmas_2 = crashmas.groupby(crashmas['created_at'].rename('Date')).size().reset_index(name='Number of Tweets')
crashmas_2 = crashmas_2.set_index('Date')
crashmas_resampled = crashmas_2.resample('H').sum().reset_index()

# Figure-wide styling.
sns.set_context("talk")
sns.set_style("darkgrid", {'axes.facecolor':'.95'})
plt.figure(figsize=(24, 12))
plt.rcParams['font.family'] = "serif"
plt.xticks(rotation=45)

# Hourly tweet volume as a line; one major tick per day, one minor tick per hour.
lp = sns.lineplot(data=crashmas_resampled, x="Date", y="Number of Tweets", markers=True)
lp.xaxis.set_major_locator(mdates.DayLocator())
lp.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
lp.xaxis.set_minor_locator(mdates.HourLocator())
lp.tick_params(axis="x", which="major", pad=12)
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in Matplotlib, so the positional form works on old and new versions.
lp.grid(True, which='minor', color='w', linewidth=0.5)

# Tick-label sizes: larger for major labels, small and rotated for minor ones.
for label in (lp.get_xticklabels() + lp.get_yticklabels()):
    label.set_fontsize(14)
for label in lp.get_xminorticklabels():
    label.set_fontsize(8)
    label.set_rotation(45)

# Title the axes and save the enclosing figure to disk.
g1 = lp.set_title('Volume of #dronecrashmas tweets by hour | 2019').get_figure()
g1.savefig("Tweets_Hour_Dads.png")

In [None]:
# Labels for weekday codes 0..6 (Monday first).
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']

# Number of tweets for every (hour, weekday) combination present in the data.
tweets_timeline = crashmas.groupby(by=['hour', 'day_of_week'])['created_at'].count()

# Centre the diverging colormap on the middle of the observed count range,
# i.e. halfway between the smallest and the largest count.
midpoint = (tweets_timeline.values.max() + tweets_timeline.values.min()) / 2

# Hours become rows and weekdays columns. Force all seven weekday columns so
# the fixed day labels line up even when a weekday has no tweets at all.
heat_grid = tweets_timeline.unstack().reindex(columns=range(7))

plt.figure(figsize=(20, 15))
plt.rcParams['font.family'] = "serif"
plt.title('Tweet Heatmap for #dronecrashmas Discussions', fontsize=24)

ax = sns.heatmap(
    heat_grid,
    cmap='coolwarm',
    robust=True,
    center=midpoint,
    xticklabels=days_of_week,
)

for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(20)

plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.xlabel('Day of the Week', fontsize=24)
plt.ylabel('Time GMT -5:00', fontsize=24)
ax.get_figure().savefig('heatmap_drones.png')

## URLs, Users, and Hashtags
This code no longer works. Whether the cause is user error or a change in how Twitter nests arrays, I need more time to fix it. It is supposed to take MongoDB documents containing nested data and flatten them using `json_normalize`.

### URLs

In [None]:
# Flatten the nested URL entities: one output row per URL found in any tweet.
entity_docs = df['entities'].tolist()
entities_data = json_normalize(entity_docs, record_path='urls')

# Rows come out in document order, so repeating each tweet's df index label
# once per URL tags every flattened row with the tweet it belongs to. Without
# this tag the flattened table (fresh 0..n-1 index) cannot be aligned with df.
urls_per_tweet = [len(doc['urls'] or []) for doc in entity_docs]
entities_data['tweet_index'] = df.index.repeat(urls_per_tweet)

# Split every expanded URL into its five components.
url_parts = pd.DataFrame(
    [urlsplit(link) for link in entities_data['expanded_url']],
    columns=['protocol', 'domain', 'path', 'query', 'fragment'],
    index=entities_data.index,
)
for column in url_parts.columns:
    entities_data[column] = url_parts[column]

# Write the first URL of each tweet (and its domain) back onto df, aligned on
# the tweet's index label; tweets without URLs get NaN. Any further URLs in
# the same tweet are not carried over.
first_url_per_tweet = entities_data.drop_duplicates(subset='tweet_index').set_index('tweet_index')
df['domain'] = first_url_per_tweet['domain']
df['url'] = first_url_per_tweet['expanded_url']

# Empty the intermediate table in place; only the two df columns are kept.
entities_data.drop(entities_data.index, inplace=True)
df.head()

In [None]:
# For every domain, count the distinct df rows (index labels) that carry it and
# show the 25 most frequent. Note the domain column is named 'url' here.
parsed_urls = df['domain'].rename('url').reset_index()
unique = parsed_urls.groupby('url')['index'].nunique()
unique.nlargest(25)

In [None]:
# Rank full URLs by the number of distinct df rows that carry them and write
# the top 50 to a tab-separated file.
url = df['url'].reset_index().groupby('url')['index'].nunique()
url_list = url.nlargest(50).reset_index()
url_list.to_csv('top50URLs.csv', sep='\t')

### User Names

In [35]:
# Flatten the embedded user documents: one row per tweet, in df's row order.
user_flat = json_normalize(df['user'].tolist())

# Give the flattened rows df's index labels before assigning. df has been
# filtered, so its labels are no longer 0..n-1 and a label-aligned assignment
# from the fresh index would attach screen names to the wrong tweets.
user_flat.index = df.index
df['screen_name'] = user_flat['screen_name']

# Empty the intermediate table in place; only the screen name is kept.
user_flat.drop(user_flat.index, inplace=True)

In [None]:
# Count the distinct df rows (index labels) per screen name and show the 25
# accounts with the most rows.
screen_names = df['screen_name'].rename('Screen Name').reset_index()
user_count = screen_names.groupby('Screen Name')['index'].nunique()
user_count.nlargest(25)

### Hashtags

In [37]:
# Flatten the nested hashtag entities: one output row per hashtag occurrence.
entity_docs = df['entities'].tolist()
hashtag_data = json_normalize(entity_docs, record_path='hashtags')

# Rows come out in document order, so repeating each tweet's df index label
# once per hashtag tags every flattened row with the tweet it belongs to.
tags_per_tweet = [len(doc['hashtags'] or []) for doc in entity_docs]
hashtag_data['tweet_index'] = df.index.repeat(tags_per_tweet)

# Collect each tweet's hashtag texts into a list and align it back onto df by
# index label; tweets without hashtags get NaN. A direct column assignment
# from the flattened table would pair hashtags with the wrong tweets.
df['hashtags'] = hashtag_data.groupby('tweet_index')['text'].agg(list)

# Empty the intermediate table in place; only df['hashtags'] is kept.
hashtag_data.drop(hashtag_data.index, inplace=True)

In [None]:
# Flatten the hashtag entities again (one row per hashtag occurrence), count
# the distinct flattened rows per hashtag text, and show the 25 most frequent.
hashtag_data = json_normalize(df['entities'], record_path='hashtags', meta=['user_mentions', 'urls'], errors='ignore')
hashtags = hashtag_data['text'].rename('hashtags').reset_index()
utags = hashtags.groupby('hashtags')['index'].nunique()
utags.nlargest(25)