In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

import gensim

from functools import reduce
from operator import or_

In [None]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

# Use the "plotly_white" template as the default for every Plotly figure.
pio.templates.default = "plotly_white"
# Route the pandas `.plot` accessor through Plotly as well.
pd.options.plotting.backend = 'plotly'

In [None]:
import logging
# Show log records of level INFO and above, formatted as "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from numpy.linalg import norm

def cos(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms. No special handling is applied when either
    vector has zero norm.
    """
    norm_product = norm(a) * norm(b)
    return (a @ b) / norm_product

## 1. Corpus

In [None]:
# Load the Le Figaro corpus. Later cells rely on its `sent` column (text) and,
# after selection, on `year`, `date` and `author`.
# NOTE(review): presumably one sentence per row — confirm against the parquet schema.
df_lf = pd.read_parquet('../data/newspaper/figaro_sents.parquet')

In [None]:
# Drop rows whose `sent` text repeats an earlier row (first occurrence kept)
# and renumber the remaining rows 0..n-1.
df_lf = df_lf.drop_duplicates(subset='sent', ignore_index=True)

In [None]:
# Load the Le Monde corpus. Later cells rely on its `sent` column (text) and,
# after selection, on `year`, `date` and `author`.
# NOTE(review): presumably one sentence per row — confirm against the parquet schema.
df_lm = pd.read_parquet('../data/newspaper/monde_sents.parquet')

In [None]:
# Drop rows whose `sent` text repeats an earlier row (first occurrence kept)
# and renumber the remaining rows 0..n-1.
df_lm = df_lm.drop_duplicates(subset='sent', ignore_index=True)

In [None]:
def contains_any(x, kwds):
    """Return True if at least one keyword of ``kwds`` is contained in ``x``.

    Containment is tested with the ``in`` operator, so for a string ``x`` this
    is a case-sensitive substring match. Keywords are tested in order and the
    search stops at the first hit.

    Parameters
    ----------
    x : container (typically ``str``)
        The object searched.
    kwds : iterable
        Candidate keywords.

    Returns
    -------
    bool
        ``False`` when ``kwds`` is empty or nothing matches.
    """
    # A generator (not a list) lets any() short-circuit on the first match.
    return any(kw in x for kw in kwds)

In [None]:
# Keywords used to select topic-related sentences. They are matched as
# case-sensitive substrings (see contains_any), hence both "LGBT" and "lgbt".
kwds = ["homosex","LGBT","lgbt","mariage gay"]

In [None]:
# Keep only the Le Figaro sentences mentioning at least one keyword.
# The explicit .copy() makes the selection an independent DataFrame, so the
# columns added to it in later cells do not raise SettingWithCopyWarning.
lf_select = df_lf[df_lf.sent.progress_apply(lambda x: contains_any(x, kwds))].copy()

In [None]:
# Tokenize each selected sentence into a list of words with gensim's simple_preprocess.
lf_select['words'] = lf_select['sent'].progress_apply(gensim.utils.simple_preprocess)

In [None]:
# Keep only the Le Monde sentences mentioning at least one keyword.
# The explicit .copy() makes the selection an independent DataFrame, so the
# columns added to it in later cells do not raise SettingWithCopyWarning.
lm_select = df_lm[df_lm.sent.progress_apply(lambda x: contains_any(x, kwds))].copy()

In [None]:
# Tokenize each selected sentence into a list of words with gensim's simple_preprocess.
lm_select['words'] = lm_select['sent'].progress_apply(gensim.utils.simple_preprocess)

In [None]:
# Tag each selection with its source so the two can be told apart once merged.
lm_select['newspaper'] = 'Le Monde'
lf_select['newspaper'] = 'Le Figaro'

In [None]:
# Stack both selections into one frame; ignore_index=True gives it a fresh 0..n-1 index.
all_select = pd.concat([lm_select, lf_select],ignore_index=True)

## 2. Word embedding

In [None]:
# Train a 50-dimensional Word2Vec embedding on the tokenized, keyword-selected
# sentences of both newspapers (20 epochs, 8 worker threads).
# min_count=50 keeps only frequent words in the vocabulary; words left out have
# no vector and are therefore skipped when cosines are looked up in section 3.
# NOTE(review): with workers=8, training is presumably not bit-for-bit
# reproducible across runs — confirm against the gensim documentation if exact
# reproducibility matters.
model = gensim.models.Word2Vec(sentences=all_select['words'],vector_size=50, min_count=50, epochs=20, workers=8)

In [None]:
#model.save('./models/lgbt.model')

In [None]:
#model = gensim.models.Word2Vec.load('./models/lgbt.model')

## 3. Build frame axis

In [None]:
# Pole words of the frame axis. The axis is computed below as
# vector(w_d) - vector(w_g), so positive cosines lean towards w_d and
# negative cosines towards w_g.
w_g = 'agression'
w_d= 'lobby'

In [None]:
# Sanity check: words closest to the w_g pole word in the embedding.
model.wv.similar_by_word(w_g)

In [None]:
# Sanity check: words closest to the w_d pole word in the embedding.
model.wv.similar_by_word(w_d)

In [None]:
# Embeddings of the two pole words.
v_g = model.wv.get_vector(w_g)
v_d = model.wv.get_vector(w_d)

# Frame axis: points from the w_g pole towards the w_d pole.
frame = v_d - v_g

# Cosine of every vocabulary vector with the frame axis, first as a list in
# vocabulary order, then as a word -> cosine lookup table.
list_cos = [cos(vec, frame) for vec in model.wv.vectors]
dict_cos = {word: score for word, score in zip(model.wv.index_to_key, list_cos)}

def match_cos(list_words, cos_by_word=None):
    """Return the frame cosines of the words of ``list_words`` that have one.

    Words absent from the lookup table (e.g. out-of-vocabulary words) are
    skipped, so the result may be shorter than the input. Order and
    repetitions of the known words are preserved.

    Parameters
    ----------
    list_words : iterable of str
        Words to look up.
    cos_by_word : mapping, optional
        Word -> cosine table. Defaults to the module-level ``dict_cos``.

    Returns
    -------
    list
        Cosine values of the known words, in input order.
    """
    lookup = dict_cos if cos_by_word is None else cos_by_word
    # Explicit membership test instead of a bare `except: pass`, so that only
    # missing words are skipped and unrelated errors are not silently hidden.
    return [lookup[w] for w in list_words if w in lookup]

In [None]:
# Vocabulary ranked along the frame axis, from the lowest cosine (w_g side)
# to the highest cosine (w_d side).
pd.Series(dict_cos).sort_values()

## 4. Compute frame bias and intensity on sub-corpora

Build the sub-corpora (the unit of analysis). Here, one sub-corpus per newspaper and year.

In [None]:
def flatten(x):
    """Concatenate an iterable of word sequences into a single flat list.

    Parameters
    ----------
    x : iterable
        Each element is a sequence of words: a ``list``, an object exposing
        ``.tolist()`` (e.g. a NumPy array or pandas Series), or any other
        iterable such as a tuple.

    Returns
    -------
    list
        All words, in order of appearance. The input sequences are not modified.
    """
    list_words = []
    for wds in x:
        if not isinstance(wds, list):
            # Arrays/Series are converted with .tolist(); other iterables
            # (e.g. tuples) are accepted through list().
            wds = wds.tolist() if hasattr(wds, 'tolist') else list(wds)
        list_words.extend(wds)
    return list_words

In [None]:
# One sub-corpus per (newspaper, year): the tokenized sentences of each group
# are concatenated into a single word list. The 'words' column is selected
# before applying, so only that column is handed to `flatten`; this avoids the
# deprecated pandas behaviour of applying a function to the grouping columns.
journ_year = all_select.groupby(['newspaper','year'])['words'].progress_apply(flatten)
# Turn the group keys back into columns. The rename is kept for the case
# where the applied result comes back unnamed (column 0).
journ_year = journ_year.reset_index().rename({0:'words'}, axis=1)

Compute frame bias and intensity on each sub-corpus

In [None]:
# Cosines (with the frame axis) of every in-vocabulary word of each sub-corpus.
journ_year['cos'] = journ_year['words'].progress_apply(match_cos)
# Frame bias: mean of those cosines.
journ_year['bias'] = journ_year['cos'].progress_apply(np.mean)
# Frame intensity: variance of those cosines.
journ_year['intensity'] = journ_year['cos'].progress_apply(np.var)

Compute frame bias relative to the background corpus

In [None]:
# Background corpus: every word of every selected sentence, both newspapers
# and all years together.
background_doc = flatten(all_select['words'])

# Frame cosines of the in-vocabulary words of the background corpus.
background_cos = match_cos(background_doc)

In [None]:
# Bias of the background corpus (mean cosine), used as the reference level.
background_bias = np.mean(background_cos)
background_bias

In [None]:
# Relative bias: positive when a sub-corpus leans more towards w_d than the
# background corpus does, negative when it leans more towards w_g.
journ_year['rel_bias'] = journ_year['bias'] - background_bias

Display bias-intensity map

In [None]:
# Human-readable identifier of each sub-corpus, e.g. "Le Monde_2002".
journ_year['label'] = journ_year['newspaper'] + '_' + journ_year['year'].apply(str)

In [None]:
# Bias-intensity map: one point per (newspaper, year) sub-corpus, coloured by
# newspaper and annotated with the year.
fig = px.scatter(journ_year, x='rel_bias',y='intensity', color='newspaper',text='year',
                 labels={'rel_bias':'agression <- Microframe bias -> lobby',
                         'intensity': 'Microframe intensity',
                         'newspaper':'Newspaper'})
# NOTE(review): assumes the ./plot/png directory already exists — confirm.
fig.write_image('./plot/png/lgbt_bias_intensity.png')
fig.show()

In [None]:
## Polarization trends

# Relative bias over the years, one LOWESS trend line per newspaper.
fig = px.scatter(journ_year, x='year', y='rel_bias', color='newspaper',trendline='lowess',
                 labels = {'newspaper':'Newspaper',
                           'rel_bias':'Microframe bias'})
# NOTE(review): assumes the ./plot/png directory already exists — confirm.
fig.write_image('./plot/png/lgbt_bias_evolution.png')
fig.show()

In [None]:
# Re-index by year so that the two newspapers' bias series align on the year
# when they are subtracted.
diff_bias_per_year = journ_year.set_index('year')
# Absolute gap between the two newspapers' bias for each year. Raw 'bias' is
# used rather than 'rel_bias': 'rel_bias' is 'bias' minus the same background
# constant for both newspapers, so the constant cancels in the difference.
# A year present for only one newspaper yields NaN after index alignment.
fig = px.scatter((diff_bias_per_year.loc[diff_bias_per_year['newspaper']=='Le Monde', 'bias'] - diff_bias_per_year.loc[diff_bias_per_year['newspaper']=='Le Figaro', 'bias']).apply(np.abs),
           trendline='lowess', labels={'value':'|bias(Le Monde) - bias(Le Figaro)|'})
# Only one series is plotted, so the legend carries no information.
fig.update_layout(showlegend=False)
# NOTE(review): assumes the ./plot/png directory already exists — confirm.
fig.write_image('./plot/png/lgbt_polarization_evolution.png')
fig.show()

In [None]:
# Tag the results with the topic name and export the per-sub-corpus relative bias.
# NOTE(review): assumes the ./csv directory already exists — confirm.
journ_year['topic'] = 'lgbt'
journ_year[['label','newspaper','year','rel_bias','topic']].to_csv('./csv/bias_lgbt.csv', index=False)

## 5. Bias and intensity per sentence

In [None]:
# Per-sentence cosines of the in-vocabulary words, then their mean (bias) and
# variance (intensity). A sentence without any in-vocabulary word has an empty
# cosine list; it is mapped to NaN explicitly instead of letting np.mean /
# np.var emit a RuntimeWarning on empty input (the value is NaN either way).
all_select['cos'] = all_select['words'].progress_apply(match_cos)
all_select['bias'] = all_select['cos'].progress_apply(lambda c: np.mean(c) if len(c) else np.nan)
all_select['intensity'] = all_select['cos'].progress_apply(lambda c: np.var(c) if len(c) else np.nan)

In [None]:
def _wrap_for_hover(sentence, words_per_line=20):
    """Insert a '<br>' token after every ``words_per_line`` space-separated words.

    The sentence is split on single spaces once; each chunk of words is
    followed by '<br>' (including the last one) and everything is re-joined
    with single spaces, so long sentences wrap in Plotly hover boxes.
    """
    tokens = sentence.split(' ')
    pieces = []
    for start in range(0, len(tokens), words_per_line):
        pieces.extend(tokens[start:start + words_per_line])
        pieces.append('<br>')
    return ' '.join(pieces)

# Hover-friendly version of each sentence (line break every 20 words).
all_select['sent_display'] = all_select['sent'].progress_apply(_wrap_for_hover)

In [None]:
# The two sub-corpora to highlight in the per-sentence plot, written in the
# "<newspaper>_<year>" format used for the labels below.
doc1_label = 'Le Monde_2002'
doc2_label = 'Le Figaro_2012'

In [None]:
# Colour label of each sentence: its "<newspaper>_<year>" label when it belongs
# to one of the two highlighted sub-corpora, otherwise the catch-all 'autre'.
all_select['label'] = (all_select['newspaper'] + '_' + all_select['year'].apply(str)).apply(lambda x: x if x in [doc1_label, doc2_label] else 'autre')

In [None]:
# Bias-intensity map at sentence level: one point per selected sentence,
# coloured by the highlight label; the hover box shows the wrapped sentence
# and its newspaper, date and author.
fig = px.scatter(all_select,
                 x='bias',y='intensity',
                 color='label',
                 hover_data=['sent_display','newspaper','date','author'],
                 labels={'bias':'agression <- Microframe bias -> lobby',
                         'intensity': 'Microframe intensity'})
# Small, fully opaque markers.
fig.update_traces(marker=dict(size=3,opacity=1))

# The figure is only written to disk here (interactive HTML and static PNG);
# it is not displayed in the notebook.
# NOTE(review): assumes ./plot/html and ./plot/png already exist — confirm.
fig.write_html('./plot/html/lgbt_bias_per_sentence.html')
fig.write_image('./plot/png/lgbt_bias_per_sentence.png')