## Explore and label topics

In [37]:
import pandas as pd
from pathlib import Path
import json
import numpy as np
from collections import defaultdict

Define the entity subset used for evaluation and the path to the logged model performances.

In [61]:
# Entity subset whose test metrics are used to rank the candidate models.
eval_on = 'all'
# JSON-lines file with the logged performance records of the topic models.
MODEL_PATH = Path('logs', 'topic', 'performances.jsonl')

### Load models, and select best model

In [62]:
# Read every logged performance record (one JSON object per line).
models = pd.read_json(MODEL_PATH, orient='records', lines=True)

# Keep the test-split rows for the chosen entity subset and rank them by
# npmi_10 in descending order, so the first row is the selected model.
is_test_split = models['split'] == 'test'
is_eval_entity = models['entity'] == eval_on
best = models[is_test_split & is_eval_entity].sort_values(by='npmi_10', ascending=False)

# Name and run number of the top-ranked row.
best_model = best['name'].tolist()[0]
best_run = best['run'].tolist()[0]

# Files belonging to the selected run: topic descriptors and predictions.
run_dir = f'run-{best_run}'
model_path = Path('models') / 'topic' / best_model / run_dir / 'model.json'
pred_path = Path('logs') / 'topic' / best_model / run_dir / 'preds.jsonl'

Visualize performance, sorted from best to worst on the test set, with data from test sets for all entities used for evaluation. The evaluation metric used for model selection is `npmi_10`.

In [63]:
# Show the ranked candidates (sorted by npmi_10, descending); the first row is the selected run.
best

Unnamed: 0,name,run,split,entity,npmi_10,cwe_10,rbo_10,npmi_20,cwe_20,rbo_20
32,distilbert-base-uncased_vocab-500_bow-491_comp...,0,test,all,0.0635,,,0.0267,,
8,distilbert-base-uncased-finetuned_lr-0.000020_...,1,test,all,0.0629,,,0.0196,,
26,distilbert-base-uncased-finetuned_lr-0.000020_...,4,test,all,0.0626,,,0.0278,,
20,distilbert-base-uncased-finetuned_lr-0.000020_...,3,test,all,0.0547,,,0.005,,
50,distilbert-base-uncased_vocab-500_bow-491_comp...,3,test,all,0.0478,,,0.0016,,
2,distilbert-base-uncased-finetuned_lr-0.000020_...,0,test,all,0.0406,,,0.0092,,
56,distilbert-base-uncased_vocab-500_bow-491_comp...,4,test,all,0.0401,,,0.0104,,
38,distilbert-base-uncased_vocab-500_bow-491_comp...,1,test,all,0.0184,,,-0.0137,,
44,distilbert-base-uncased_vocab-500_bow-491_comp...,2,test,all,0.0033,,,-0.0274,,
14,distilbert-base-uncased-finetuned_lr-0.000020_...,2,test,all,-0.0046,,,-0.0227,,


In [64]:
# Show the full name string of the selected model configuration.
best_model

'distilbert-base-uncased_vocab-500_bow-491_comp-20_esize-768_batch-64_lr-0.02_epochs-100_act-softplus_train-pretrained'

The best model is the pretrained DistilBERT.

### Labeling topics

In [65]:
# Load the topic descriptors of the selected run. They are indexed below by the
# string form of the topic number. The context manager guarantees the file
# handle is closed (the previous bare `open(...)` left it to the garbage collector).
with open(model_path) as model_file:
    topics = json.load(model_file)

# Load the per-text predictions of the same run and drop the serialized
# 'index' column, which is not needed downstream.
preds = pd.read_json(pred_path, orient='records', lines=True).drop('index', axis=1)

In [66]:
# For every topic, print its descriptor words followed by the ten
# EU_Commission texts with the highest score on that topic.
eu_preds = preds[preds['entity']=='EU_Commission']
for topic_idx in range(20):
    score_col = f'topic_{topic_idx}'
    print(f'Topic {topic_idx}')
    print(topics[str(topic_idx)])
    ranked = eu_preds.sort_values(by=score_col, ascending=False)
    print(ranked.head(n=10).text.tolist())
    print('\n\n')

Topic 0
['eu', 'support', 'million', 'humanitarian', 'aid', 'ukraine', 'assistance', 'countries', 'help', 'billion', 'refugees', 'people', 'solidarity', 'crisis', 'emergency', 'fund', 'greece', 'provide', 'projects', 'funding']
['As humanitarian needs continue to rise, the EU is providing direct operational support to aid organisations working inside Ukraine.\n\nThe EU has set up two warehouses in Ukraine, and we are contracting trucking services to get humanitarian aid to those in need.\n\n#StandWithUkraine', 'The EU continues to support people in Haiti who are suffering the consequences of the devastating earthquake that hit the country.\n\nAn EU Humanitarian Air Bridge operation is delivering more than 125 tonnes of life-saving materials: ', 'The EU continues to stand in solidarity with the Palestinian people.\n\nOver 2 million Palestinians in the Occupied Territories and Gaza Strip are in need of humanitarian assistance.\n\nThe EU will support those vulnerable people with €25 milli

### Define topic names
Saved manually under `src/colnames`

In [67]:
# Human-readable labels for the 20 topics. Order matters: the label at
# position i is used to rename the `topic_<i>` prediction column.
topic_names = ['Solidarity and Emergency Response',  # topic_0
               'Health',  # topic_1
               'Citizen Initiatives',  # topic_2
               'Press conferences and Statements',  # topic_3
               'Citizen Rights and Integration',  # topic_4
               'Charts, Links, Infographics',  # topic_5
               'Strategic Investments (e.g., recovery, research, innovation)',  # topic_6
               'Internal Governance',  # topic_7
               'Energy, Sustainability and Climate',  # topic_8
               'Identity, Culture and Citizen Engagement',  # topic_9
               'Visions for the Future',  # topic_10
               'Financial Aid',  # topic_11
               'Human Rights',  # topic_12
               'Growth and Global Development',  # topic_13
               'Economy and markets',  # topic_14
               'Digital Policy',  # topic_15
               'Trade, Partnerships, and Law',  # topic_16
               'Live Events',  # topic_17
               'Digital and Green Transition',  # topic_18
               'Finance and trade']  # topic_19

Define macro-categories to which each topic belongs.

In [68]:
# Topic labels in presentation order. Consecutive entries belong to the same
# macro-category, so the run lengths below must follow this ordering.
sorted_topics = [
    'Economy and markets',
    'Finance and trade',
    'Growth and Global Development',
    'Strategic Investments (e.g., recovery, research, innovation)',
    'Health',
    'Citizen Rights and Integration',
    'Human Rights',
    'Digital Policy',
    'Digital and Green Transition',
    'Energy, Sustainability and Climate',
    'Identity, Culture and Citizen Engagement',
    'Citizen Initiatives',
    'Visions for the Future',
    'Internal Governance',
    'Trade, Partnerships, and Law',
    'Solidarity and Emergency Response',
    'Financial Aid',
    'Press conferences and Statements',
    'Charts, Links, Infographics',
    'Live Events',
]

# (macro-category, number of consecutive entries of sorted_topics it covers)
category_spans = [
    ('Economic and Financial Policy', 4),
    ('Social Policy', 3),
    ('Environmental and Digital Policy', 3),
    ('Identity and Citizen Participation', 3),
    ('Governance', 2),
    ('Solidarity and Humanitarian Aid', 2),
    ('Communication and Media', 3),
]

# One category label per topic, aligned position-by-position with sorted_topics.
categories = [label for label, span in category_spans for _ in range(span)]

# macro-category -> list of its topic labels (in sorted_topics order)
cat_dict = defaultdict(list)
for category, topic in zip(categories, sorted_topics):
    cat_dict[category].append(topic)

### Save data
Saving data including topic predictions with renamed column names to `data` folder.

In [69]:
# Score matrix of the 20 topic columns (one row per text), extracted once.
topic_cols = [f'topic_{i}' for i in range(20)]
topic_scores = preds[topic_cols].values

# Column name of the highest-scoring topic for each row ...
preds['top_topic'] = [f'topic_{str(k)}' for k in np.argmax(topic_scores, axis=1)]
# ... and the score that topic obtained.
preds['confidence'] = np.max(topic_scores, axis=1)

In [70]:
# Replace column names: `topic_<i>` becomes the label at position i of topic_names
mapping = {f'topic_{i}': t for i, t in enumerate(topic_names)}
preds = preds.rename(mapping, axis=1)

# Aggregate per macro-category: each category score is the SUM (not the mean)
# of the scores of the topics assigned to it in cat_dict
for k,v in cat_dict.items():
    preds[k] = preds[v].sum(axis=1)

# Replace top topics: translate the `topic_<i>` identifiers stored in
# `top_topic` into labels. Only that column is rewritten; a frame-wide
# replace would scan every column and alter any other cell (e.g. a text)
# that happens to equal a `topic_<i>` string.
preds['top_topic'] = preds['top_topic'].replace(mapping)

In [71]:
# Inspect the enriched frame: renamed topic columns, top_topic, confidence and the category sums.
preds

Unnamed: 0,text,lang,id,like_count,quote_count,reply_count,retweet_count,created_at,is_retweet,is_mention,...,Finance and trade,top_topic,confidence,Economic and Financial Policy,Social Policy,Environmental and Digital Policy,Identity and Citizen Participation,Governance,Solidarity and Humanitarian Aid,Communication and Media
0,First Minister @NicolaSturgeon will deliver a ...,en,1250818275199651840,270,9,31,109,2020-04-16,0,0,...,0.0046,Live Events,0.5159,0.0272,0.0848,0.0157,0.0379,0.0137,0.0109,0.8098
1,Update on #coronavirus testing\n\nAs of 2pm to...,en,1250770976532508672,119,8,26,109,2020-04-16,0,0,...,0.0127,Health,0.4222,0.0685,0.5296,0.0830,0.0618,0.0433,0.0831,0.1305
2,Scotland’s Makar @JackieKayPoet has written a ...,en,1250725095926124544,583,90,21,326,2020-04-16,0,0,...,0.0268,Health,0.1891,0.1178,0.2792,0.0902,0.1106,0.0359,0.1653,0.2012
3,While our primary concern is for people’s heal...,en,1250487804963979264,187,7,24,100,2020-04-15,0,0,...,0.0093,Financial Aid,0.2977,0.0869,0.2887,0.0557,0.0272,0.0180,0.4900,0.0336
4,"In a move agreed by @ScotGov and @COSLA, socia...",en,1250427438279979008,377,11,18,146,2020-04-15,0,0,...,0.0279,Growth and Global Development,0.1978,0.3476,0.1645,0.1543,0.0461,0.0434,0.1473,0.0969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167969,David Cameron will deliver a speech this morni...,en,103022959018524672,2,0,42,58,2011-08-15,0,0,...,0.0306,"Identity, Culture and Citizen Engagement",0.2262,0.1062,0.1102,0.0996,0.4282,0.0587,0.0554,0.1419
167970,"The PM and President Obama discuss Syria, Liby...",en,102460308576149504,5,0,47,33,2011-08-13,0,0,...,0.0178,Internal Governance,0.3564,0.0466,0.0275,0.0185,0.0409,0.3994,0.0301,0.4372
167971,David Cameron meets organisers and participant...,en,101970042769518592,3,0,35,43,2011-08-12,0,0,...,0.0173,"Identity, Culture and Citizen Engagement",0.2057,0.0864,0.0424,0.0356,0.3918,0.0692,0.0386,0.3358
167972,Read David Cameron's statement to Parliament o...,en,101608394682671105,9,0,70,112,2011-08-11,0,0,...,0.0242,"Charts, Links, Infographics",0.3482,0.1179,0.0655,0.0308,0.0770,0.0425,0.0349,0.6315


Add date components (year, month, day) extracted from `created_at`.

In [72]:
# Split the creation timestamp into separate year / month / day columns by
# reading the attribute of the same name from each `created_at` value.
for date_part in ('year', 'month', 'day'):
    preds[date_part] = preds['created_at'].apply(
        lambda stamp, attr=date_part: getattr(stamp, attr)
    )

In [74]:
# Write the enriched predictions to disk as JSON lines (one record per row).
preds_out_path = Path('data') / 'topic' / 'preds.jsonl'
preds.to_json(preds_out_path, orient='records', lines=True)

### Save top tweets per topic

In [55]:
# For each topic label, collect the ten EU_Commission texts with the highest
# score on that topic. The entity filter does not depend on the topic, so it
# is computed once instead of once per iteration.
eu_preds = preds[preds['entity']=='EU_Commission']
top_frames = []
for t in sorted_topics:
    top_10 = eu_preds.sort_values(by=t, ascending=False).head(n=10).text.tolist()
    # zip truncates to the number of texts actually available (at most 10)
    top_frames.append(pd.DataFrame(zip([t]*10, top_10), columns=['topic', 'text']))

# Concatenate once rather than growing the frame inside the loop (which copies
# all accumulated rows on every iteration). Row indices are kept as-is, so the
# exported table is identical to the incrementally built one.
all_df = pd.concat(top_frames, axis=0)

# Export as a LaTeX table; non-ASCII characters are stripped from every cell.
with pd.option_context("display.max_colwidth", 1000):
    with open('summaries/top_tweets.txt', 'w') as tfile:
        ascii_df = all_df.astype(str).apply(
            lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
        tfile.write(ascii_df.to_latex())

  tfile.write(all_df.astype(str).apply(lambda x:
