# Covid19 Tweets Data Exploration

## Imports

In [None]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import unquote
from scipy.stats import pearsonr
from tqdm import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "talk" plotting context for all figures.
sns.set_context("talk")
# Default figure size handed to the plotting calls in this notebook.
figsize = (15,5)

## Load Data
Set the path to the input data file below (download the file first):
* `TweetsCOV19_file` from https://data.gesis.org/tweetscov19

In [None]:
# Path to the TweetsCOV19 TSV file (download it from https://data.gesis.org/tweetscov19).
TweetsCOV19_file = "../data/tweetsCov19/TweetsCOV19.tsv"

In [None]:
%%time
# load Tweets Covid-19 data (~45 min. to 1 hour, due to slow timestamp conversion. Comment out converter for MUCH faster read)
# The TSV is read without a header row, so the column names are supplied explicitly.
colnames = ["TweetID", "Username", "date", "Followers", "Friends", "Retweets", "Favorites", "Entities", "Sentiment", "Mentions", "Hashtags", "URLs"]
# "null;" cells are read as NaN, every "date" value is passed through pd.Timestamp,
# and nrows=None reads the whole file (set an integer to load only a sample).
# NOTE(review): a vectorized pd.to_datetime after loading is presumably much faster
# than the per-value converter -- confirm the timestamp format before switching.
df = pd.read_csv(TweetsCOV19_file, sep='\t', header=None, names=colnames,
                 na_values=["null;"], converters={"date": pd.Timestamp},
                 nrows=None)
# Use the tweet timestamp as the index; the "date" column is dropped from the columns.
df.set_index("date", drop=True, inplace=True)
# Sanity check: table size and covered time span.
print(df.shape)
print(df.index.min(), df.index.max())

In [None]:
%%time
# separate positive from negative sentiment score
sentiment = df["Sentiment"].apply(lambda s: s.split())
df["Sentiment_positive"] = sentiment.apply(lambda l: int(l[0]))
df["Sentiment_negative"] = sentiment.apply(lambda l: int(l[1]))
df.drop(columns=["Sentiment"], inplace=True)

In [None]:
# Save with pickle so it can be read faster next time
# (the pickle keeps the timestamp index and the split sentiment columns built above).
%time df.to_pickle("../data/tweetsCov19/TweetsCOV19_df.pkl")

In [None]:
# Read pickled data
# (alternative entry point: reloads the frame written by the pickle cell above
# instead of re-parsing the TSV).
%time df = pd.read_pickle("../data/tweetsCov19/TweetsCOV19_df.pkl")
# Sanity check: table size and covered time span.
print(df.shape)
print(df.index.min(), df.index.max())

## Entity Analysis

In [None]:
# Initialize entity lists to empty lists (~15 sec.)
# An array of shape (len(df), 0) converted with tolist() yields one separate
# empty list per row, so rows do not share a single list object.
%time df["Entities_list"] = np.empty((len(df), 0)).tolist()

In [None]:
%%time 
# get a list of all entities (~30 sec.)
nonna = ~df["Entities"].isna()
df.loc[nonna,"Entities_list"] = df["Entities"][nonna].apply(lambda s: [i.split(':')[1] for i in s.split(';')[:-1]])
entities = pd.Series(itertools.chain.from_iterable(df["Entities_list"]))
entities.name = "Entities"

In [None]:
# Absolute number of mentions per entity, most frequent first.
entities_counts = entities.value_counts()
# Mentions relative to the total number of tweets (not to the number of mentions).
entities_counts_rel = entities_counts / len(df)

In [None]:
# Keep only entities whose relative frequency lies inside [lower, upper].
freq_cutoff_upper = 0.009 # chosen as to remove some uninteresting entities
freq_cutoff_lower = 0.001
frequent_entities = entities_counts_rel[(entities_counts_rel <= freq_cutoff_upper) & (entities_counts_rel >= freq_cutoff_lower)]
# "Spotify" is excluded due to strange behaviour (single large peak in December 2019).
# errors="ignore" keeps the cell runnable when "Spotify" is not inside the
# frequency band at all (e.g. when only a sample of the data was loaded).
frequent_entities = frequent_entities.drop("Spotify", errors="ignore")
print(f"There are {entities_counts.shape[0]} distinct entities in the dataset.")
print(f"Only {frequent_entities.shape[0]} entities appear between {freq_cutoff_lower*100} % and {freq_cutoff_upper*100:.1f} % of all tweets.")

In [None]:
# Plot the relative frequency of the n most frequent entities (starting at rank s)
# together with the two cutoff levels used to select the frequent entities.
print("The frequency of entities decays sharply, i.e. there are only few entities that appear frequently")
s = 0  # start index of entity to display
n = 50 # number of entities to display
# Slice once so that values and tick labels always have the same length,
# even when fewer than n entities are available.
shown = entities_counts_rel.iloc[s:n+s]
plt.figure(figsize=figsize)
plt.plot(shown.values, label="Entity Frequency")
plt.plot([0,n-1], 2*[freq_cutoff_upper], "--", label=f"Upper Cutoff ({freq_cutoff_upper})")
plt.plot([0,n-1], 2*[freq_cutoff_lower], "--", label=f"Lower Cutoff ({freq_cutoff_lower})")
# Entity names are percent-encoded; decode them for readable tick labels.
plt.xticks(range(len(shown)), [unquote(l) for l in shown.index], rotation=90)
plt.ylabel("Frequency of Appearance")
plt.legend()
plt.savefig("../figures/entity_frequency.png", bbox_inches="tight")

## Entity Usage Over Time

In [None]:
%time daily_sum = df.resample('D').sum()

In [None]:
%time weekly_sum = df.resample('W').sum()

In [None]:
%time monthly_sum = df.resample('M').sum()

In [None]:
%time daily_mean = df.resample('D').mean()

In [None]:
%time weekly_mean = df.resample('W').mean()

In [None]:
%time monthly_mean = df.resample('M').mean()

In [None]:
# Daily tweet counts of one frequent entity (the one at position 28).
# The data are daily sums, so the axis is labelled per day, not per week.
daily_sum[frequent_entities.index[28:29]].plot(figsize=figsize, ylabel="Tweets per Day")

In [None]:
# Weekly total over all frequent-entity columns, saved as a figure.
plt.figure(figsize=figsize)
weekly_entity_columns = weekly_sum[frequent_entities.index]
plt.plot(weekly_entity_columns.sum(axis=1))
plt.ylabel("Tweets per week")
plt.xlabel("Time")
plt.savefig("../figures/tweets_per_week.png", bbox_inches="tight")

In [None]:
def normalize_on_period(data, period=None, stdperiod=None, std_only_if_peak=True):
    """
    Normalize a time series: subtract a mean and divide by a standard deviation.

    The input is not modified; a new normalized series is returned.

    data: pandas Series with a datetime index
    period: pandas datetime indexing string for the period whose mean is
        subtracted; the mean of the whole series is used if None
    stdperiod: pandas datetime indexing string for the period that is used for
        the std computation; the std of the whole series is used if None
    std_only_if_peak: if True, stdperiod is only used when the global maximum
        of the series lies inside it; otherwise the std of `period` (or of the
        whole series when period is None) is used

    Returns the normalized series. No guard is applied for a zero standard
    deviation, so a constant reference window yields inf/NaN values.
    """
    # Window that provides the mean (and the fallback std).
    reference = data if period is None else data.loc[period]
    data_mean = reference.mean()

    if stdperiod is None:
        data_std = data.std()
    else:
        std_window = data.loc[stdperiod]
        # Use the std window unconditionally, or only if it contains the global peak.
        if not std_only_if_peak or std_window.max() == data.max():
            data_std = std_window.std()
        else:
            data_std = reference.std()

    # Build a new series instead of modifying the caller's data in place.
    return (data - data_mean) / data_std


def correlate(series1, series2):
    """
    Cross-correlate two series and locate the correlation peak.

    Returns a tuple (peak value, peak offset). The offset is the distance of
    the peak from the centre of the "same"-mode correlation output, expressed
    as a multiple of the index frequency of series1 (which must be set).
    """
    cross_corr = np.correlate(series1, series2, mode="same")
    centre_index = series1.shape[0] // 2
    lag_steps = centre_index - cross_corr.argmax()
    return cross_corr.max(), lag_steps * series1.index.freq


# Pairwise cross-correlation of the normalized daily series of all frequent
# entities: `correlations` holds the peak values, `corrtimes` the peak offsets.
correlations = np.zeros(2*[frequent_entities.shape[0]], dtype=float)
corrtimes    = np.zeros_like(correlations, dtype=pd.Timedelta)
correlations = pd.DataFrame(correlations, index=frequent_entities.index, columns=frequent_entities.index)
corrtimes    = pd.DataFrame(corrtimes,    index=frequent_entities.index, columns=frequent_entities.index)

# Normalize every series exactly once (mean taken from 2019, std from 2020 when
# the global peak lies in 2020) instead of once per pair inside the double loop.
normalized_series = {
    entity: normalize_on_period(daily_sum[entity].copy(), period="2019", stdperiod="2020")
    for entity in frequent_entities.index
}

for entity1 in tqdm(frequent_entities.index):
    series1 = normalized_series[entity1]

    for entity2 in frequent_entities.index:
        series2 = normalized_series[entity2]

        corrmax, corrtime = correlate(series1, series2)
        correlations.loc[entity1, entity2] = corrmax
        # NOTE(review): `.delta` on frequency offsets is reported as deprecated by
        # recent pandas releases -- confirm the replacement for the pandas
        # version in use.
        corrtimes.loc[entity1, entity2]    = corrtime.delta

## Which entities appear together

In [None]:
# Entity names treated as "countries" (the list also contains some regions and
# cities). Used only for membership tests, so each name is listed once.
countries = ["Germany", "France", "Italy", "Spain", "Japan", "Taiwan", "Russia",
             "Canada", "South_Korea", "Iran", "Australia", "United_States", "Turkey",
             "Ukraine", "Nigeria", "Lagos", "Hong_Kong", "Europe", "India",
             "Kashmir", "Pakistan"]

In [None]:
%%time
# add integer column to count entity usage
for entity in tqdm(frequent_entities.index, desc="Extracting entities in int cols"):
    df[entity] = df["Entities_list"].apply(lambda l: int(entity in l))

In [None]:
# Save with pickle so it can be read faster next time
# (this snapshot includes the per-entity integer columns added above).
%time df.to_pickle("../data/tweetsCov19/TweetsCOV19_df_intent.pkl")

In [None]:
# Reload the frame including the per-entity integer columns from its pickle.
%time df = pd.read_pickle("../data/tweetsCov19/TweetsCOV19_df_intent.pkl")

In [None]:
%%time
df_entities = df[frequent_entities.index].copy()
df_entities = df_entities.astype(bool)

In [None]:
# compute the "togetherness" as the frequency that entity1 appears together with entity2 times the opposite frequency (entity2 together with entity1)
# T(e1, e2) = sqrt( (n_both / n_e1) * (n_both / n_e2) ) = n_both / sqrt(n_e1 * n_e2)
togetherness = np.zeros(2*[frequent_entities.shape[0]], dtype=float)
togetherness = pd.DataFrame(togetherness, index=frequent_entities.index, columns=frequent_entities.index)

entity_names = list(frequent_entities.index)
# Per-entity tweet counts are loop invariant: compute them once up front.
entity_sums = {name: df[name].sum() for name in entity_names}

for position, entity1 in enumerate(tqdm(entity_names)):
    ent1_bool = df[entity1] == 1
    ent1_sum  = entity_sums[entity1]
    # The measure is symmetric, so each unordered pair is computed only once
    # and written to both matrix positions.
    for entity2 in entity_names[position:]:
        both = (ent1_bool & (df[entity2] == 1)).sum()
        value = both / (ent1_sum * entity_sums[entity2])**0.5
        togetherness.loc[entity1, entity2] = value
        togetherness.loc[entity2, entity1] = value

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the daily-sum time
# series of the frequent entities.
%time daily_sum_corr = daily_sum[frequent_entities.index].corr()

In [None]:
# ~8.5 minutes
# Pairwise correlation (DataFrame.corr defaults) between the per-tweet 0/1
# indicator columns of the frequent entities.
%time tweet_corr = df[frequent_entities.index].corr()

In [None]:
# Drop the rows and columns of all country entities from the three similarity
# matrices, because we don't care about them.
not_country    = [i not in countries for i in togetherness.index]
togetherness   = togetherness.loc[not_country, not_country]
not_country    = [i not in countries for i in tweet_corr.index]
tweet_corr     = tweet_corr.loc[not_country, not_country]
not_country    = [i not in countries for i in daily_sum_corr.index]
daily_sum_corr = daily_sum_corr.loc[not_country, not_country]

In [None]:
# Compare the size of the daily-sum table with the size of its correlation
# matrix, then show the first rows of the matrix.
print(daily_sum.shape)
print(daily_sum_corr.shape)
daily_sum_corr.head()

In [None]:
# Compare the size of the tweet table with the size of the per-tweet
# correlation matrix, then show the first rows of the matrix.
print(df.shape)
print(tweet_corr.shape)
tweet_corr.head()

In [None]:
# Histogram (log-scaled counts) of all entries of the daily-sum correlation matrix.
plt.figure(figsize=figsize)
plt.hist(daily_sum_corr.to_numpy().ravel(), bins=50, range=(0,1.0), log=True)
plt.ylabel("Count (Log Scale)")
plt.xlabel("Entity correlations based on daily sum timeseries")

In [None]:
# Histogram (log-scaled counts) of all entries of the per-tweet correlation matrix.
plt.figure(figsize=figsize)
plt.hist(tweet_corr.to_numpy().ravel(), bins=50, range=(-0.01,0.3), log=True)
plt.ylabel("Count (Log Scale)")
plt.xlabel("Entity correlations based on individual tweets")

In [None]:
# Histogram (log-scaled counts) of all entries of the togetherness matrix, saved as a figure.
plt.figure(figsize=figsize)
plt.hist(togetherness.to_numpy().ravel(), bins=50, range=(0,0.3), log=True)
plt.ylabel("Count (Log Scale)")
plt.xlabel("Togetherness (T) of Entities")
plt.savefig("../figures/togetherness_histogram.png", bbox_inches="tight")

In [None]:
def print_corr(corr, entity1, entity2):
    """Print the entry of `corr` for the pair (entity1, entity2) with two decimals."""
    value = corr.loc[entity1, entity2]
    print(f"T({entity1},{entity2}) = {value:.2f}")

# Print the togetherness of a few hand-picked entity pairs.
for first_entity, second_entity in (
    ("Bill_Gates", "5G"),
    ("Bill_Gates", "Vaccine"),
    ("Toilet_paper", "Panic_buying"),
    ("President_of_the_United_States", "Men_who_have_sex_with_men"),
    ("President_of_the_United_States", "Fake_news"),
    ("Donald_Trump", "Men_who_have_sex_with_men"),
    ("Donald_Trump", "Fake_news"),
):
    print_corr(togetherness, first_entity, second_entity)

In [None]:
# List all entity pairs whose togetherness lies strictly between the two limits,
# sorted from highest to lowest togetherness.
upper_limit = 0.9
lower_limit = 0.01
in_band = ((togetherness > lower_limit) & (togetherness < upper_limit)).to_numpy()
# Keep every unordered pair once: a pair qualifies if either orientation is in
# the band, and it is reported with the smaller matrix position first.
first_pos, second_pos = np.where(np.triu(in_band | in_band.T))

# Built column-wise so that an empty selection simply yields an empty table.
pairs = pd.DataFrame({
    "entity1": togetherness.index[first_pos],
    "entity2": togetherness.index[second_pos],
    "togetherness": togetherness.to_numpy()[first_pos, second_pos],
})

# remove pairs that are both countries and sort by togetherness
both_countries = pairs["entity1"].isin(countries) & pairs["entity2"].isin(countries)
pairs = pairs[~both_countries].sort_values("togetherness", ascending=False)
pairs = pairs.reset_index(drop=True)

# Show the complete table instead of pandas' truncated default view.
with pd.option_context('display.max_rows', pairs.shape[0]):
    display(pairs)

In [None]:
def plot_daily_sum(entitylists=None, plotname=None):
    """
    Plot the daily number of tweets for several groups of entities.

    entitylists: iterable of lists of entity names; the daily sums of all
        entities in one list are added up and drawn as one line labelled with
        that list. None is treated as "no groups" and gives an empty plot.
    plotname: file name below ../figures/ to save the figure to; the figure
        is not saved if None
    """
    if entitylists is None:
        # The default used to fail on iteration; treat it as an empty selection.
        entitylists = ()

    plt.figure(figsize=figsize)
    for entity_group in entitylists:
        plt.plot(daily_sum[entity_group].sum(axis=1), label=f"{entity_group}")
    plt.xlabel("Date")
    plt.ylabel("Daily Number of Tweets")
    plt.legend()

    if plotname is not None:
        plt.savefig(f"../figures/{plotname}", bbox_inches="tight")

In [None]:
# Daily tweet counts for "Bill_Gates", "Vaccine" and "5G", one line each.
entities1, entities2, entities3 = ["Bill_Gates"], ["Vaccine"], ["5G"]
plot_daily_sum(
    (entities1, entities2, entities3),
    "time_evolution_bill_gates_vaccine_5g.png",
)

In [None]:
# Daily tweet counts for "Toilet_paper" and "Panic_buying", one line each.
entities1, entities2 = ["Toilet_paper"], ["Panic_buying"]
plot_daily_sum(
    (entities1, entities2),
    "time_evolution_toilet_paper_panic_buying.png",
)

In [None]:
# Two summed groups: the president-related entities versus the media-related entities.
entities1, entities2 = (
    ["President_of_the_United_States", "Donald_Trump"],
    ["Men_who_have_sex_with_men", "Fake_news"],
)
plot_daily_sum(
    (entities1, entities2),
    "time_evolution_president_msm_fake_news.png",
)

### Average sentiment by entity

In [None]:
# Weekly mean of both sentiment scores; absolute values are taken so that both
# curves are drawn on the same positive scale.
sentiment_columns = ["Sentiment_positive", "Sentiment_negative"]
weekly_mean_sentiment = weekly_mean[sentiment_columns].abs()
plt.figure(figsize=figsize)
plt.plot(weekly_mean_sentiment[sentiment_columns[0]], "g", label="Positive Sentiment Score")
plt.plot(weekly_mean_sentiment[sentiment_columns[1]], "r", label="Negative Sentiment Score")
plt.xlabel("Date")
plt.ylabel("Mean Sentiment")
plt.legend()
plt.savefig("../figures/weekly_mean_sentiment.png", bbox_inches="tight")

In [None]:
# Mean positive and negative sentiment score over all tweets that mention an
# entity: one row per frequent entity, columns "positive" and "negative".
sentiment_columns = ["Sentiment_positive", "Sentiment_negative"]
sentiment_rows = {}
for entity in tqdm(frequent_entities.index, desc="Computing mean sentiment per entity"):
    # Select the tweets mentioning the entity once and average both scores together.
    entity_means = df.loc[df[entity] == 1, sentiment_columns].mean()
    sentiment_rows[entity] = [entity_means["Sentiment_positive"], entity_means["Sentiment_negative"]]
sentiments_by_entity = pd.DataFrame.from_dict(sentiment_rows, orient="index", columns=["positive", "negative"])

In [None]:
# The 20 entities with the highest mean positive sentiment score.
sentiments_by_entity.sort_values(by="positive", ascending=False).head(n=20)

In [None]:
# The 20 entities with the lowest values in the "negative" column.
sentiments_by_entity.sort_values(by="negative", ascending=True).head(n=20)

In [None]:
# One point per entity: mean positive score against mean negative score.
mean_positive = sentiments_by_entity["positive"]
mean_negative = sentiments_by_entity["negative"]
plt.scatter(mean_positive, mean_negative)
plt.ylabel("Mean Negative Sentiment Score")
plt.xlabel("Mean Positive Sentiment Score");