In [None]:
!pip install transformers==4.12.0
!pip install tensorflow==2.4.0
!pip install datasets
!pip install wandb
!pip install langdetect

In [None]:
# Set to True to redo the preprocessing that is guarded by this flag further
# down (flagging, link stripping, language detection) and rewrite
# processed/all_tweets.csv; leave False to reuse the CSV already on disk.
first_run = False

In [None]:
import pandas as pd
import json
import glob
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from google.colab import drive
from langdetect import detect
import matplotlib.dates as md

%load_ext autoreload
%autoreload 2

drive.mount('/content/drive/')
%cd '/content/drive/My Drive/eu_commission'

### Read in the data

In [None]:
# Flatten every raw JSON file under data/<folder>/ into one list of records.
# Each file is expected to hold a list of tweet objects under its 'data' key.
fs = sorted(glob.glob('data/*/*'))  # sorted: glob's own order is OS-dependent
fields = ['text', 'lang']
metrics = [f'{m}_count' 
           for m in ['like','quote','reply','retweet']]
processed_tws = []
for f in fs:
    # The context manager closes each file right after parsing instead of
    # leaving one open handle per input file for the garbage collector.
    with open(f, encoding='utf-8') as fh:
        tws = json.load(fh)['data']
    for tw in tws:
        item = {k: tw[k] for k in fields}
        item.update({k: tw['public_metrics'][k] for k in metrics})
        # Keep only the first 10 characters of the timestamp
        # (presumably the YYYY-MM-DD date part -- verify against the raw data).
        item['created_at'] = tw['created_at'][:10]
        processed_tws.append(item)

In [None]:
# One row per tweet: text, lang, the four *_count metrics and created_at.
df = pd.DataFrame(processed_tws)
# created_at holds 10-character date strings, so parse them with an explicit
# format. `infer_datetime_format` is deprecated in pandas 2.x and an explicit
# format is both strict and fast.
df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d')

### Tweet volume

In [None]:
# Plot the number of tweets per period together with a 7-period rolling mean,
# and remember the five busiest periods in `top_dict` for later inspection.
freq_dict = {'D': 'day', 'W': 'week', 'M': 'month'}
figsizes = {'D': (50,5), 'W': (50,5), 'M': (30, 5)}
formats = {'D': '%Y-%m-%d', 'W': '%Y-%m-%d', 'M': '%Y-%m'}
top_dict = {}
for freq in ['D']:
    fig, ax = plt.subplots(figsize=figsizes[freq])
    # count() tallies the non-null values of every column per period, so the
    # 'text' column ends up holding the number of tweets in that period.
    # (The redundant axis=0 was dropped: it is the default and the keyword is
    # deprecated in recent pandas.)
    grouped = df.groupby(pd.Grouper(key='created_at', freq=freq)).count().reset_index()
    # Five periods with the most tweets, as (created_at, count) records
    top_dict[freq_dict[freq]] = (
        grouped.sort_values(by='text', ascending=False)
               .head(n=5)[['created_at', 'text']]
               .to_records(index=False)
    )
    grouped['smoothed'] = grouped['text'].rolling(7).mean()
    # Plot 
    sns.lineplot(data=grouped, x='created_at', y='text', 
                 alpha=.2, 
                 label='per day')
    sns.lineplot(data=grouped, x='created_at', y='smoothed', 
                 label='smoothed avg - 7d', 
                 color=sns.color_palette()[0])
    plt.ylabel(f'Tweets per {freq_dict[freq]}')
    plt.xlabel('')
    plt.title('Tweet volume')
    plt.xticks(rotation=60)
    # Make year boundaries (the first year is skipped: no boundary before it)
    for d in grouped.created_at.dt.year.unique()[1:]:
        plt.axvline(x=np.datetime64(f'{d}-01-01'), color='darkgrey', linestyle='--')
        # The label is passed positionally: the keyword was `s` before
        # Matplotlib 3.3 and is `text` afterwards, and `s=` now raises
        # TypeError. Positional works on every version.
        plt.annotate(str(d), xy=(np.datetime64(f'{d}-06-01'),120), color='black')
    ax.xaxis.set_major_locator(md.MonthLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter('%b \'%y'))
    plt.xlim(np.datetime64('2010-05-01'),np.datetime64('2022-08-01'))
    plt.savefig('figures/tweet_volume.pdf')
    plt.show()

In [None]:
# Look at the day with the second-highest tweet count: top_dict['day'] is
# sorted by descending count, and each record is (created_at, count).
second_busiest_day = top_dict['day'][1][0]
on_that_day = df['created_at'] == np.datetime64(second_busiest_day)
print(df.loc[on_that_day, 'text'].tolist())
print(second_busiest_day)

Some of the highest-volume peaks are related to live-tweeting of the #EUMFF negotiations, automated posts about activity on other websites (Storify), and other events. 
It is worth keeping in mind that tweet volume can be a major confounder in this data.

### Additional preprocessing
- flag retweets and tweets starting with mentions;
- strip links;
- not removing emojis, hashtags and mentions, for now - but could remove depending on which model we end up using.

In [None]:
def language_detection(s):
    """Return whatever `detect` reports for `s`, or 'unk' if detection fails.

    Detection is best-effort: any ordinary error raised by `detect` is mapped
    to the placeholder 'unk' so that one problematic tweet does not abort a
    whole `.apply` pass.

    Args:
        s: the text to classify.

    Returns:
        The value returned by `detect(s)`, or the string 'unk' on failure.
    """
    try:
        return detect(s)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate and a long run can be interrupted.
        return 'unk'

In [None]:
# One-off preprocessing; the result is cached in processed/all_tweets.csv.
if first_run:
  # Flag retweets and tweets that open with a mention (1 = yes, 0 = no)
  df['is_retweet'] = np.where(df['text'].str.startswith('RT'), 1, 0)
  df['is_mention'] = np.where(df['text'].str.startswith('@'), 1, 0)
  # Strip links only. `\S+` stops at the first whitespace, whereas the former
  # pattern `http.*` also deleted all text that followed a link on the line.
  df['text'] = df['text'].str.replace(r'http\S+', '', regex=True)
  # Drop tweets that are empty once links are removed. .copy() gives an
  # independent frame so the column assignment below does not trigger
  # pandas' SettingWithCopyWarning.
  df = df[df['text'].str.len() > 0].copy()
  df['lang_detected'] = df['text'].apply(language_detection)
  df.to_csv('processed/all_tweets.csv', sep=',')

Breakdown of number of tweets per language

In [None]:
# Reload the cached, preprocessed tweets (the first CSV column is the index)
df = pd.read_csv('processed/all_tweets.csv', sep=',', index_col=0)
# Number of tweets per Twitter language tag, most frequent first
(
    df.groupby('lang')['text']
      .count()
      .reset_index()
      .sort_values(by='text', ascending=False)
      .rename(columns={'text': 'count'})
)

Removing tweets that are not in English, as they are mostly translations of English tweets. We know that Twitter's automatic language detection is not great, so we double-check with langdetect and only keep tweets tagged as English by both Twitter's default tagger and langdetect.

In [None]:
# Keep only tweets tagged as English in both the 'lang' and 'lang_detected'
# columns. .copy() makes the result an independent frame, so columns can be
# added to it later without pandas' SettingWithCopyWarning.
df = df[(df['lang'] == 'en') & (df['lang_detected'] == 'en')].copy()

### Train-test splits
Let's set aside a small dataset for pretraining our language models. We pick a random set of tweets (if that does not give good results, we could consider balanced sampling over time).

In [None]:
import random
random.seed(42)  # fixed seed so the split assignment is reproducible

train_size = 3000
val_size = 500

# Every tweet gets exactly one label: `train_size` rows are 'train',
# `val_size` rows are 'val', and all remaining rows are 'test'.
n_tweets = df.shape[0]
if n_tweets < train_size + val_size:
    # Without this check the label list would be longer than the frame and
    # the column assignment below would fail with an opaque length mismatch.
    raise ValueError(
        f'Need at least {train_size + val_size} tweets for the train/val '
        f'split, but only {n_tweets} are available.'
    )

train_test = ['train'] * train_size + ['val'] * val_size + ['test'] * (n_tweets - train_size - val_size)
# Shuffling the labels (rather than the rows) assigns splits at random while
# leaving the row order of df untouched.
random.shuffle(train_test)
df['pretraining_splits'] = train_test

Great, now let's fine-tune some language models on these tweets for better performance. 

In [None]:
def _save_results(rlist):
  fname = 'logs/pretraining/performances.jsonl'
  try:
    rdf = pd.read_json(fname, 
                       orient="records",
                       lines=True)
    rdf = pd.concat([rdf, pd.DataFrame(rlist)])
    
  except:
    rdf = pd.DataFrame(rlist)
  rdf.to_json(fname, orient="records", lines=True)

### Training

In [None]:
from pretrain import Pretrainer
models = ['distilbert-base-uncased',
          'distilbert-base-uncased-finetuned-sst-2-english',
          'cardiffnlp/tweet-topic-21-multi']
results = []  # in-memory record of every run performed in this session

# Run one that was not run before
trainer = Pretrainer('cardiffnlp/tweet-topic-21-multi', df, 
                     batch_size=4, 
                     lr=2e-6, 
                     warmup=12)
trainer.compile()
r = trainer.fit()
results.append(r)
trainer.save(f'models/pretrained/{trainer.name}')
# Log only the new result: _save_results appends to the log file, so passing
# the cumulative `results` list would write every earlier run again.
_save_results([r])

# Run missing
for lr in [2e-5, 2e-2]: # 2e-6 already run
  for batch_size in [4]: # 32, 16 consistently lower
    for wu_epochs in [3]: # add 1 and try early stopping with 10
      for m in models:
        trainer = Pretrainer(m, df, 
                             batch_size=batch_size, 
                             lr=lr, 
                             warmup=batch_size*wu_epochs)
        trainer.compile()
        r = trainer.fit()
        results.append(r)
        trainer.save(f'models/pretrained/{trainer.name}')
        # As above: append just this run to avoid duplicate log rows.
        _save_results([r])

### Topic modeling
Let's move on to modeling the topic of the tweets. We'll try to compare different modeling strategies, and both pretrained and fine-tuned models.
Then, we'll try to get an idea of what the evolution of topics has been over time. 

In [None]:
from octis.models.CTM import CTM
from octis.dataset.dataset import Dataset
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer
from octis.evaluation_metrics.coherence_metrics import Coherence

Now, let's compare this approach with a simpler topic modeling approach.

### Next steps
- Try more warmup
- Topic modeling
  - Compare different approaches, with respect to sanity and fit
- Engagement as a function of topic
- Collect comments, annotate emotions in comments, plot emotion of reactions as a function of topics
    - Also polarization?
- Topics & engagement as a function of emotions of EU Commission tweet

Other:
- Streamline preprocessing