## Notebook for Collocation Analysis

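Collocation analysis using `nltk`: finds the top collocates of seed terms (e.g. "expert", "science") per corpus and period, and draws collocation networks per month.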

In [1]:
from gensim.models.phrases import Phrases
import matplotlib.pyplot as plt
from operator import itemgetter
from collections import Counter
from nltk.collocations import *
import os,re,string,json,math
from functions import *
from tqdm import tqdm
import networkx as nx
import seaborn as sns
import pandas as pd
import nltk

def _translations_by_language(table):
    """Map each language in *table* to its {word: translation} dictionary."""
    lookup = {}
    for lang in list(set(table['language'])):
        rows = table[table['language'] == lang]
        lookup[lang] = dict(zip(rows['word'], rows['translation']))
    return lookup


# Association measures used by the collocation scoring in this notebook.
bigram_measures = nltk.collocations.BigramAssocMeasures()

plotting.style_()

# Corona-related keywords, per language.
tf = pd.read_csv(base_path + '/resources/keywords-corona-translation.csv')
tf_corona = _translations_by_language(tf)

# Expertise-related keywords, per language.
tf = pd.read_csv(base_path + '/resources/keywords-expertise-translation.csv')
tf_science = _translations_by_language(tf)

# Drop the entries keyed "policy", "program" and "measures" from every language.
tf_science = {
    lang: {
        word: translation
        for word, translation in words.items()
        if word not in ("policy", "program", "measures")
    }
    for lang, words in tf_science.items()
}

In [2]:
# Ad hoc function for importing data and preprocessing.

def clean_subset(iso,language,start,end):

    """ Function for importing and preprocessing a subset of the data
    iso (str): ISO code for language ("gb", "nl" etc.)
    language (str): name of language for calling nltk stopwords ("english", "dutch" etc.)
    start (str): start month ("2020-01")
    end (str): end month ("2020-02")

    Returns a DataFrame whose 'text' column is rebuilt from the tagged
    'posner' column (tokens tagged NOUN, VERB or ADJ only) and passed through
    utils.preprocess; the 'lemmatized' and 'title' columns are dropped.
    nltk raises LookupError if the stopwords corpus cannot be found.
    """

    df = data_loader.load_month(iso,start,end)
    # Copy the filtered frame so the column assignment below writes to an
    # independent DataFrame instead of a slice of the loaded data.
    df = df[df['text'].notna()].copy()

    # Ask nltk which stopword lists are installed rather than listing a
    # user-specific nltk_data directory; unknown languages get no stopwords.
    if language in nltk.corpus.stopwords.fileids():
        stopwords_ = nltk.corpus.stopwords.words(language)
    else:
        stopwords_ = []

    keep_tags = {'NOUN','VERB','ADJ'}
    texts = []
    for tagged in tqdm(df['posner']):
        # NOTE(review): tokens are treated as "<word>_<TAG>"; tokens without an
        # underscore are skipped — confirm against the tagger's output format.
        parts = [token.split('_') for token in str(tagged).split(' ')]
        words = " ".join(p[0] for p in parts if len(p) > 1 and p[1] in keep_tags)
        texts.append(utils.preprocess(words,stopwords_))
    df['text'] = texts

    return df.drop(['lemmatized','title'],axis=1)

In [3]:
def find_top_collocates(bgfinder, vocabulary, seed_term, topn):

    """ Function for finding top collocates of seed term
    bgfinder (<BigramCollocationFinder> object): finder object
    vocabulary (list): list of terms to consider
    seed_term (str): seed term
    topn (int): number of collocates to return

    Returns a list of at most topn terms from vocabulary, ordered by
    descending likelihood-ratio score of the bigram (seed_term, term).
    """

    # Keyed by term, so duplicate vocabulary entries are scored only once.
    scores = {
        w: bgfinder.score_ngram(bigram_measures.likelihood_ratio, seed_term, w)
        for w in vocabulary
    }
    # Terms for which score_ngram returned None are left out.
    ranked = sorted(
        ((w, score) for w, score in scores.items() if score is not None),
        key=itemgetter(1),
        reverse=True,
    )
    return [w for w, _ in ranked[:topn]]

def get_network(bgfinder, vocabulary, seed_term, topn):

    """ Function for building a three-level collocation edge list around a seed term
    bgfinder (<BigramCollocationFinder> object): finder object
    vocabulary (list): list of terms to consider
    seed_term (str): seed term
    topn (int): number of collocates to follow per term

    Returns a DataFrame with columns 'source' and 'target': edges from the
    seed term to its top collocates, from those to their top collocates, and
    one level further.
    """

    # The same term is reached along many paths; score each term only once.
    cache = {}

    def collocates(term):
        if term not in cache:
            cache[term] = find_top_collocates(bgfinder, vocabulary, term, topn)
        return cache[term]

    edges = []
    for w1 in collocates(seed_term):
        edges.append([seed_term, w1])
        for w2 in collocates(w1):
            edges.append([w1, w2])
            edges.extend([w2, w3] for w3 in collocates(w2))
    return pd.DataFrame(edges, columns=['source','target'])

def collocation_month(iso,df,month,seed_term,window_size=15,topn=6,plot=True,degree_limit=0):
    
    """ Function for drawing collocation network for one month
    iso (str): ISO code
    df (<DataFrame> object): dataframe with text and metadata
    month (str): month
    seed_term (str): seed term
    window_size (int): size of window to use for finding collocates 
    topn (int): top collocates to consider
    plot (boolean): whether to plot network in notebook
    degree_limit (int): limit network to nodes with a min. degree
    """

    # Keep the rows whose 'id' contains the month string.
    df = df[(df['id'].str.contains(month))]
    text = " ".join(df['text']).split(' ')
    finder = BigramCollocationFinder.from_words(text,window_size=window_size)
    # get_network expects (bgfinder, vocabulary, seed_term, topn) in that order.
    d = get_network(finder,set(text),seed_term,topn)
    g = nx.from_pandas_edgelist(d, source='source', target='target',create_using=nx.DiGraph())
    dgrs = dict(g.degree)

    if degree_limit != 0:
        # Keep only edges whose target reaches the minimum degree, then rebuild the graph.
        frequent = [k for k,v in dgrs.items() if v >= degree_limit]
        d = d[d['target'].isin(frequent)]
        g = nx.from_pandas_edgelist(d, source='source', target='target',create_using=nx.DiGraph())
        dgrs = dict(g.degree)
    
    if plot:
        plt.figure(figsize=(25,15))
        layout = nx.spring_layout(g,k=1.15)

        # Nodes are drawn fully transparent; the text labels below stand in for them.
        nx.draw_networkx_nodes(g,layout,node_size=2,alpha=0)
        nx.draw_networkx_edges(g, layout, width=1.5, alpha=.75, edge_color="#cccccc",arrows=True,arrowstyle="-|>",arrowsize=50)
        
        for node, (x, y) in layout.items():
            # Label size grows with the logarithm of the node's degree.
            plt.text(x, y, node, fontsize=math.log(dgrs[node] * 5) * 6, ha='center', va='center',color = "red" if node == seed_term else "black",bbox=dict(facecolor='red', alpha=0.1))
        plt.title(f"Collocation Network for seed term {seed_term.upper()} in {month} (Corpus: {iso.upper()})",fontsize=24)
        # The filename uses the iso parameter; the previous {language} was not
        # defined in this function and only resolved through a global.
        # NOTE(review): the path is absolute ('/results/...') while other cells
        # write to relative 'results/...' — confirm which is intended.
        plt.savefig(f'/results/plots/collocation-networks/collocation-network-{iso}-{month}-{seed_term}-topn{topn}-ws{window_size}.png',dpi=250)

In [4]:
## Plot network for different languages and write to file

months = ["2020-03","2020-04","2020-05","2020-06"]

for iso,language in [('nl','dutch'),('it','italian'),('gb','english')]:
    # clean_subset requires a start and end month; load the span plotted below.
    # NOTE(review): assumes the end month is inclusive in data_loader.load_month — confirm.
    df = clean_subset(iso,language,months[0],months[-1])
    for month in months:
        
        for term in ['expert','science']:
            collocation_month(iso,df,month=month,seed_term=tf_science[iso][term],window_size=5,topn=5,plot=True,degree_limit=2)

In [5]:
## Get top collocates for 'expert' per month per coalition-opposition

months = ["2020-03","2020-04","2020-05","2020-06"]

for iso,language in [('nl','dutch'),('it','italian'),('gb','english')]:
    # clean_subset requires a start and end month; only rows from `months` are used below.
    # NOTE(review): assumes the end month is inclusive in data_loader.load_month — confirm.
    df = clean_subset(iso,language,months[0],months[-1])
    keyword = tf_science[iso]['expert']

    # Collected as Series so columns of unequal length are padded with NaN
    # instead of raising on assignment.
    columns = {}

    for month in months:
        tdf = df[(df['id'].str.contains(month))]
        for group in ['coalition','opposition']:
            print(month,group)
            tdfg = tdf[tdf['party_status'] == group]
            text = " ".join(tdfg['text']).split(' ')
            len_ = text.count(keyword)
            # Skip month/group combinations with fewer than 5 keyword occurrences.
            if len_ < 5:
                continue
            finder = BigramCollocationFinder.from_words(text,window_size=5)
            top = find_top_collocates(finder, set(text), keyword, 15)
            columns[month + "_" + group + f"(n={len_})"] = pd.Series(top, dtype=object)

    result = pd.DataFrame(columns)
    result.to_csv(f'results/tables/coalopp-topcollocates-expert-{iso}-ws5-alldata.csv',index=False)

In [28]:
## Get top collocates for 'expert' per period (covid/reference)

def _keyword_collocates(tokens, keyword, window_size=10, candidates=50, topn=25):
    """Return up to topn distinct collocates of keyword, best-scoring first."""
    finder = BigramCollocationFinder.from_words(tokens,window_size=window_size)
    # Keep only bigrams that contain the keyword.
    finder.apply_ngram_filter(lambda *w: keyword not in w)
    ranked = finder.nbest(bigram_measures.likelihood_ratio, candidates)
    partners = [
        next(x for x in bigram if x != keyword)
        for bigram in ranked
        if bigram != (keyword,keyword)
    ]
    # dict.fromkeys removes duplicates while keeping the ranking order; a set
    # would return an arbitrary subset of the candidates.
    return list(dict.fromkeys(partners))[:topn]


columns = {}

for iso,language in [('nl','dutch'),('it','italian'),('gb','english'),('pl','polish')]:
    print(iso)
    keyword = tf_science[iso]['expert']
    df_covid = clean_subset(iso,language,"2020-03","2020-08")
    df_reference = clean_subset(iso,language,"2019-08","2020-02")

    text_covid = " ".join(df_covid['text']).split(' ')
    text_ref = " ".join(df_reference['text']).split(' ')

    # Column names carry the keyword frequency; count() gives 0 when absent.
    cov_colname = 'covid' + "_" + iso + f"({text_covid.count(keyword)})"
    ref_colname = 'reference' + "_" + iso + f"({text_ref.count(keyword)})"

    columns[cov_colname] = _keyword_collocates(text_covid, keyword)
    columns[ref_colname] = _keyword_collocates(text_ref, keyword)
    # unique_covid = set(columns[cov_colname]) - set(columns[ref_colname])
    # columns[cov_colname] = [f"{x} (U)" if x in unique_covid else x for x in columns[cov_colname]]

# Built from Series so columns shorter than 25 entries are padded with NaN.
results = pd.DataFrame({name: pd.Series(words, dtype=object) for name,words in columns.items()})
results.to_csv('topcollocates-expert-covidref.csv',index=False)

nl
100%|██████████| 27378/27378 [00:03<00:00, 7793.49it/s]
size after subsetting: 27378
100%|██████████| 50001/50001 [00:05<00:00, 9132.03it/s]
size after subsetting: 50001
it
100%|██████████| 2781/2781 [00:04<00:00, 621.67it/s]
size after subsetting: 2781
100%|██████████| 2564/2564 [00:03<00:00, 768.80it/s]
size after subsetting: 2564
gb
100%|██████████| 37280/37280 [00:11<00:00, 3111.27it/s]
size after subsetting: 37280
100%|██████████| 27476/27476 [00:07<00:00, 3554.77it/s]
size after subsetting: 27476
pl
100%|██████████| 18120/18120 [00:01<00:00, 10664.92it/s]
size after subsetting: 18120
100%|██████████| 17575/17575 [00:01<00:00, 9129.05it/s]
size after subsetting: 17575
