In [None]:
"""
This notebook analyzes and plots the topics & sentiments inferred by OneModel from 2 months of vaccine-related tweets.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter

# Display figures inline in Jupyter notebook
import seaborn as sns
# Use seaborn style defaults and set the default figure size for every plot
# in this notebook to 11 wide by 4 high (matplotlib figure sizes are in inches).
sns.set(rc={'figure.figsize':(11, 4)})

# 0. Import Dated Tweets with Max Topic Column

In [None]:
# Load the dated tweet table. Later cells read its 'Date', 'max_topic' and
# 'id_str' columns and drop its 'index' column.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted pipeline.
concat_dfs = pd.read_pickle("../corpora/distinct_tweets/dated_distinct_tweets_dtm.pickle")

In [None]:
# Remove the stray "index" column (presumably left behind by an earlier
# reset_index() before pickling -- confirm against the export step).
# errors="ignore" makes the cell idempotent: because it rebinds concat_dfs,
# re-running it without this flag raises KeyError once the column is gone.
concat_dfs = concat_dfs.drop(columns=["index"], errors="ignore")
concat_dfs

In [None]:
# concat_dfs.reset_index()
# concat_dfs = concat_dfs.set_index('Date')
# concat_dfs.index


In [None]:
# daily_counts = concat_dfs.groupby("Date").size().loc["2020-12-16":"2021-02-13"]
# daily_counts.columns = ['count']
# daily_counts = daily_counts.reset_index()
# daily_counts = daily_counts.set_index('Date')

## a. Count topic ratio in total (across 8 weeks)

In [None]:
# Share of all tweets whose dominant topic is each topic id, printed in
# topic-id order (one percentage per line).
NUM_TOPICS = 50  # presumably the number of topics in the model -- confirm

topic_counts = concat_dfs['max_topic'].value_counts()
total_tweets = concat_dfs.shape[0]
for topic_idx in range(NUM_TOPICS):
    # value_counts() only lists topics that occur at least once, so a topic
    # that is never any tweet's max topic must fall back to 0 instead of
    # raising KeyError.
    print("{:.2%}".format(topic_counts.get(topic_idx, 0) / total_tweets))

In [None]:
def count_maxtopics_per_day(df_max):
    """Compute each topic's daily share of tweets, in wide format for plotting.

    For every (Date, max_topic) pair the number of non-null ``id_str`` values
    is divided by the number of non-null ``id_str`` values on that Date.

    Parameters
    ----------
    df_max : pandas.DataFrame
        Must contain the columns ``Date``, ``max_topic`` and ``id_str``.

    Returns
    -------
    pandas.DataFrame
        One row per Date (index converted with ``pd.to_datetime``), one column
        per topic id (columns axis named ``topic_id``). (Date, topic) pairs
        that never occur hold 0.
    """
    # Tweets per (day, topic).
    per_topic = (
        df_max.groupby(['Date', 'max_topic'])['id_str']
        .count()
        .reset_index()
    )
    per_topic.columns = ['Date', 'topic_id', 'max_count']

    # Tweets per day, used as the normalising denominator.
    per_day = df_max.groupby('Date')['id_str'].count().reset_index()
    per_day.columns = ['Date', 'total_docs']

    # Attach the daily total to every (day, topic) row and normalise.
    joined = per_topic.merge(per_day, on='Date', how='left')
    joined['prevalence'] = joined['max_count'] / joined['total_docs']

    # Long -> wide: dates down the rows, topics across the columns.
    wide = joined.pivot(index='Date', columns='topic_id', values='prevalence')
    wide = wide.fillna(0)
    wide.index = pd.to_datetime(wide.index)
    return wide

In [None]:
%%time
# Build the Date x topic_id daily-prevalence table that the plotting cells
# index as max_counts.loc[START:END, topic_id].
max_counts = count_maxtopics_per_day(concat_dfs)

In [None]:
# Hand-picked topics to plot, in presentation order: (model topic id, label).
# Both lookup dicts below are derived from this single list so the labels and
# the display ranks cannot drift apart.
_ORDERED_TOPICS = [
    (0, "Vaccination of Frontline Workers"),
    (13, "Access to Vaccines - Signing Up Online"),
    (9, "South African Variant"),
    (21, "Biden Stimulus Plan"),
    (45, "mRNA vaccines"),
    (27, "Complaints about pharm company profits"),
    (26, "Vaccine Conspiracy Theories online"),
    (4, "Trials in non mRNA vaccines"),
    (31, "Vaccine distribution in Canada"),
    # Label typo fixed: "heard immunity" -> "herd immunity" (shown in legends).
    (2, "Concerns about supply to reach herd immunity by summer"),
    (36, "Genetic concerns about vaccines and kids"),
    (7, "Low distribution of AstraZeneca vaccine"),
]

# model topic id -> human-readable label used in plot legends
topic_label_dict = dict(_ORDERED_TOPICS)

# model topic id -> 1-based display rank used in legends and output file names
topic_reorder_dict = {
    topic_id: rank
    for rank, (topic_id, _label) in enumerate(_ORDERED_TOPICS, start=1)
}

# 1 Draw Topic Trend

### a. Plot Multiple Topics

In [None]:
# Topic ids for the two plot groups, stored as strings; the plotting cells
# convert them with int() before indexing topic_label_dict, topic_reorder_dict
# and the max_counts columns.
# phrase1 holds the topics ranked 1-6 in topic_reorder_dict, phrase2 those ranked 7-12.
# NOTE(review): "phrase" presumably means "phase" -- confirm before renaming.
phrase1 = ['0', '13', '9', '21', '45', '27']
phrase2 = ['26', '4', '31', '2', '36', '7']

In [None]:
# Plot the daily prevalence of every topic in phrase1 on one shared axes and
# save the figure to ../images/Topic-Trends-Phrase1.png.
START = '2020-12-16'
END = '2021-02-14'
fig, ax = plt.subplots()

# One line per topic; the legend entry shows the display rank and the label.
for topic_id in phrase1:
    topic_key = int(topic_id)  # ids are stored as strings, columns/dict keys are ints
    ax.plot(max_counts.loc[START:END, topic_key],
            label="Topic {}, {}".format(topic_reorder_dict[topic_key], topic_label_dict[topic_key]))

# Axis-level settings do not depend on the topic, so they are applied once
# after all lines are drawn instead of being rebuilt on every iteration.
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))  # one tick every 7 days
# loc=9 is matplotlib's code for 'upper center'; the anchor places the legend
# above the axes so it does not cover the lines.
ax.legend(loc=9, bbox_to_anchor=(0.25, 1.6))
ax.set_ylabel('Prevalence of the Selected Topics per Day')
ax.set_title('Trends in Covid19 Discussion Topics')

# bbox_inches='tight' keeps the legend, which sits outside the axes, in the image.
fig.savefig('../images/Topic-Trends-Phrase1.png', bbox_inches = 'tight', dpi=300)

### b. Plot a Single Topic

In [None]:
START = '2020-12-16'
END = '2021-02-14'

# One figure per selected topic: the daily prevalence series with its weekly
# mean drawn on top, saved under the topic's display rank.
for topic_id in phrase1+phrase2:
    fig, ax = plt.subplots()

    # Resolve everything that depends on the topic once per iteration.
    topic_key = int(topic_id)
    rank = topic_reorder_dict[topic_key]
    topic_name = topic_label_dict[topic_key]
    daily = max_counts.loc[START:END, topic_key]

    ax.plot(
        daily,
        marker='.',
        linestyle='-',
        linewidth=0.5,
        label='Daily Prevalence of Topic {}, {}'.format(rank, topic_name),
    )

    # Month-day tick labels, one tick every 7 days. A new formatter is built
    # for each figure's axis rather than shared between figures.
    date_form = DateFormatter("%m-%d")
    ax.xaxis.set_major_formatter(date_form)
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))

    # Weekly mean of the same daily series.
    weekly = daily.resample('W').mean()
    ax.plot(
        weekly,
        marker='o',
        markersize=8,
        linestyle='-',
        label='Weekly Prevalence of Topic {}, {}'.format(rank, topic_name),
    )

    ax.set_ylabel('Prevalence of the Selected Topic per Day')
    ax.legend()
    plt.savefig('../images/Trends-of-Topic-{}.png'.format(rank), dpi=300)