In [1]:
import pandas as pd
import json
import glob
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from langdetect import detect
from logs.best_topic_names import topic_20comp, exclude_20
import matplotlib.dates as md
import plotly.express as px
import spacy
import textdescriptives as td

%load_ext autoreload
%autoreload 2

### Structure
Content and narratives
- Topic modeling of EU Commission tweets
- Which topics generate most engagement (likes, comments, retweets, quotes)? 
- Which topics generate most positive comments? 
- Which narratives resonate the most?

Styles
- How is the style of the European commission tweeting? Is it related to engagement? [TODO]

Emotional content
- What is the prevalent emotional content? [TODO]

Comparison
- Which are the best predictors of engagement?
    - Bag of words models [TODO]
    - Topic models [TODO]
    - Transformer-based models (static) [TODO]
    - Stylistic predictors [TODO]
    - Emotion predictors [TODO]
    
### Meta:
- Extend to other languages?
- Extend to other accounts?

### Load data and extract additional features

In [5]:
df = pd.read_json(f'logs/topic/distilbert-base-uncased-finetuned-sst-2-english_vocab-500_bow-499_comp-20_esize-768_batch-64_lr-0.002_epochs-100_act-softplus/topic_preds_1.jsonl',
                  orient='records', 
                  lines=True)
df.rename(dict(zip([f'topic_{i}' for i in range(20)], topic_20comp)), inplace=True, axis=1)
df['top_topic'] = df[topic_20comp].apply(lambda x: topic_20comp[x.argmax()], axis=1)
response_df = pd.read_json('processed/pre_topic_responses_sentiment.jsonl', 
                           orient='records', 
                           lines=True)   

Extract style descriptors

In [15]:
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("textdescriptives")

dfs = [] 
for t in topic_20d['text']:
    try: 
        extracted = td.extract_df(nlp(t))
        dfs.append(extracted) 
    except:
        dfs.append(pd.DataFrame([[np.nan]*len(extracted.columns)],
                                columns=extracted.columns))



In [27]:
df = pd.concat([df, pd.concat(dfs).reset_index(drop=True).drop('text',
                                                               axis=1)], axis=1)

  df = pd.concat([df, pd.concat(dfs).reset_index(drop=True).drop('text',


In [29]:
df.to_json('processed/post_topic_tweets_style.jsonl', orient='records', lines=True)
# df = pd.read_json('processed/post_topic_tweets_style.jsonl', orient='records', lines=True)

Extract emotions

In [None]:
from transformers import pipeline
classifier = pipeline("text-classification", 
                      model="j-hartmann/emotion-english-distilroberta-base", 
                      return_all_scores=True)

In [47]:
def get_emo(lst):
    columns = [l['label'] for l in lst]
    vals = [l['score'] for l in lst]
    return pd.DataFrame([vals], columns=columns)

In [None]:
emos = pd.concat(df['text'].apply(lambda x: get_emo(classifier(x)[0])).tolist()) # softmaxed, or not?

  return torch._C._cuda_getDeviceCount() > 0


ValueError: Shape of passed values is (46, 1), indices imply (46, 46)

In [21]:
# Concat and save

Unnamed: 0,text,dependency_distance_mean,dependency_distance_std,prop_adjacent_dependency_relation_mean,prop_adjacent_dependency_relation_std,flesch_reading_ease,flesch_kincaid_grade,smog,gunning_fog,automated_readability_index,...,sentence_length_median,sentence_length_std,syllables_per_token_mean,syllables_per_token_median,syllables_per_token_std,n_tokens,n_unique_tokens,proportion_unique_tokens,n_characters,n_sentences
0,40% quicker emergency response times thanks to...,1.633333,0.633333,0.483333,0.016667,97.001429,1.468571,,2.8,3.265,...,7.0,4.0,1.214286,1.0,0.410326,14,14,1.0,68,2
0,“@ECspokesKoen: Statement @BarrosoEU after the...,3.75,0.0,0.4375,0.0,26.47,12.3,,15.309091,17.04,...,11.0,0.0,2.0,2.0,0.953463,11,11,1.0,82,1
0,What do you think is the main source of pollut...,2.254274,0.023504,0.452991,0.008547,87.09681,4.713276,,8.558621,4.66,...,14.5,1.5,1.241379,1.0,0.56661,29,28,0.965517,118,2
0,CARE will ensure immediate support to those fl...,2.627841,0.809659,0.507102,0.038352,65.102632,8.898947,,11.810526,8.397368,...,19.0,9.0,1.447368,1.0,0.879514,38,30,0.789474,169,2
0,CZ foreign minister: by simplifying we can be ...,1.846154,0.0,0.461538,0.0,41.851818,10.154545,,15.309091,8.476364,...,11.0,0.0,1.818182,1.0,1.028519,11,11,1.0,59,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,New waste rules will turn #Europe into a clean...,1.738889,1.230502,0.259524,0.186293,72.455,6.134444,10.504224,10.355556,6.55,...,17.0,7.788881,1.444444,1.0,0.864242,36,30,0.833333,174,3
0,@mariannethyssen @EU_Social @ChristianWigand H...,1.32803,1.034302,0.357576,0.237294,60.50625,6.124167,8.076483,7.814286,10.800714,...,5.5,3.344772,1.666667,1.0,0.992032,21,21,1.0,136,4
0,"Now on EBS, #FutureofEurope Lecture by @Juncke...",3.571429,0.0,0.5,0.0,72.615455,5.863636,,8.036364,11.473636,...,11.0,0.0,1.454545,1.0,0.655555,11,11,1.0,66,1
0,Ta' vera! Let’s #MakeItReal for Malta! 🇲🇹\n\nW...,2.020238,0.763833,0.434524,0.111677,71.003,5.542,9.3871,8.974545,5.984773,...,5.0,6.88186,1.5,1.0,1.05529,44,36,0.818182,223,5


In [22]:
topic_20d.shape

(28751, 34)

In [49]:
# Plot style distribution per topic
# Plot style over time
# Plot engagement by style

### Descriptives

In [53]:
# TODO

### Predictive models

In [19]:
# TODO