In [1]:
"""
This notebook analyzes and plots the topics and sentiments inferred by OneModel from two months of vaccine-related tweets.
"""
import sys
sys.path.append("../") # go to parent dir
from codebase.topic_utilities import dtm_csv_to_pd_df, calculate_topic_ratios

import numpy as np
import pandas as pd

# 1 Create Dated Document-Topic Matrix

In [None]:
# Fixed locations: input corpora and fitted-model outputs.
corpora_path = "../corpora/"
model_path = "../models/"

# One file tag per tweet window. The three lists below are consumed together
# position-by-position, so they must stay the same length and order.
fileTag_list = [
    "First-and-SecondWeek-Tweets-Rolling",
    "Third-and-FourthWeek-Tweets-Rolling",
    "Fifth-and-SixthWeek-Tweets-Rolling",
    "Seventh-and-EighthWeek-Tweets-Rolling",
]

start_list = [
    "2020-12-16",
    "2020-12-31",
    "2021-01-15",
    "2021-01-30",
]
end_list = [
    "2020-12-31",
    "2021-01-15",
    "2021-01-30",
    "2021-02-14",
]

# Number of topics in the model; the suffix is part of the "-dtm.csv" file name.
num_topics = 50
model_suffix = "-{}topics".format(num_topics)

In [None]:
def return_dated_dtm(dtm, metadata, start, end):
    """Join a document-topic matrix with tweet metadata and add a ``Date`` column.

    Parameters
    ----------
    dtm : pandas.DataFrame
        Document-topic matrix; must contain a ``position_index`` column,
        which is the join key.
    metadata : pandas.DataFrame
        Tweet metadata; must contain ``position_index``, ``id_str`` and
        ``created_time`` columns. The caller's frame is NOT modified.
    start, end
        Currently unused: no date filtering is applied here. They are kept
        so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The right-joined frame (every de-duplicated metadata row is kept;
        rows without a matching ``dtm`` row get NaN in the dtm columns),
        without ``position_index`` / ``created_time``, with a ``Date``
        column formatted as ``YYYY-MM-DD`` and an ``index`` column produced
        by ``reset_index()``.
    """
    # Drop duplicated tweets on a new frame rather than in place, so the
    # DataFrame passed in by the caller is left untouched.
    unique_metadata = metadata.drop_duplicates(subset="id_str")
    # Merge with metadata for fetching the creation time of each document.
    df = dtm.merge(unique_metadata, on="position_index", how="right")
    # Format the date as a plain string so rows can be grouped/filtered by day.
    df["Date"] = pd.to_datetime(df["created_time"]).dt.strftime("%Y-%m-%d")
    # The join key and the raw timestamp are no longer needed.
    df = df.drop(["position_index", "created_time"], axis=1)
    # reset_index() without drop=True keeps the old row labels as an
    # "index" column; preserved so the output columns stay as before.
    return df.reset_index()

In [None]:
%%time
# Collect one dated document-topic frame per (file tag, start, end) triple.
list_of_dfs = []
for fileTag, start, end in zip(fileTag_list, start_list, end_list):

    print("Start {} from {} to {}.".format(fileTag, start, end))

    # Resolve both input paths first: metadata sits under the corpora
    # directory, the document-topic matrix under the models directory.
    metadata_filename = "{}{}-Meta.csv".format(corpora_path, fileTag)
    doc_topic_matrix_filename = "{}{}{}-dtm.csv".format(model_path, fileTag, model_suffix)

    # Read both CSV files back into DataFrames.
    metadata = pd.read_csv(metadata_filename)
    dtm = dtm_csv_to_pd_df(doc_topic_matrix_filename)

    # Join them and keep the dated result for the later concatenation.
    list_of_dfs.append(return_dated_dtm(dtm, metadata, start, end))

In [None]:
# Stack the per-window frames row-wise. ignore_index is left at its default
# (False), so each frame keeps its own row labels and labels can repeat
# across windows.
dated_dtm = pd.concat(list_of_dfs, axis=0, ignore_index=False)
dated_dtm

In [2]:
# Location of the pickled dated document-topic matrix.
dated_dtm_pickle_path = "../models/dated-dtm/2-months-50topics-dated-dtm.pkl"
# One-off write, left commented out so re-running this cell only reads:
# dated_dtm.to_pickle(dated_dtm_pickle_path)
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that was produced by this project.
dated_dtm = pd.read_pickle(dated_dtm_pickle_path)

# 2 Select a Date Sub-Matrix and the Representative Tweets of a Topic

In [9]:
# Query settings: the day to inspect, the topic column, and how many rows to keep.
target_date = "2020-12-19"
target_topic = 9
top_n_tweets = 100

# Keep only the rows whose Date string equals the chosen day.
is_target_date = dated_dtm["Date"] == target_date
sub_dated_dtm = dated_dtm.loc[is_target_date]

# Take the rows with the largest values in the target_topic column, then
# narrow the result to the tweet id, the date and that topic's value.
top_rows = sub_dated_dtm.nlargest(n=top_n_tweets, columns=target_topic)
nL = top_rows.loc[:, ["id_str", "Date", target_topic]]
nL


Unnamed: 0,id_str,Date,9
571682,1340369231100194820,2020-12-19,1.000000
690205,1340355367017000961,2020-12-19,0.986842
623005,1340379244837408768,2020-12-19,0.985944
687182,1340325199527301120,2020-12-19,0.980553
641168,1340353303075561472,2020-12-19,0.977436
...,...,...,...
695727,1340268856808534017,2020-12-19,0.928789
384096,1340105908907601920,2020-12-19,0.928210
699074,1340388298527793153,2020-12-19,0.927419
631839,1340323632946548737,2020-12-19,0.926927
