In [124]:
import sys, os, json
import re, nltk
from nltk import pos_tag
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn, gensim
from sklearn.decomposition import PCA
import itertools
from gensim.corpora import Dictionary
from collections import defaultdict
import statsmodels.api as sm
# Locate the project checkout once: `src/` has to be importable for the
# project-local imports that follow, and the notebook opens its JSON inputs
# relative to `data/`.
PROJECT_ROOT = os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/'
SRC_DIR = PROJECT_ROOT+'src/'
# Guarded so that the directory is registered a single time, even when this
# cell is re-run in the same kernel.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
os.chdir(PROJECT_ROOT+'data/')
from utils import preprocess_for_bow, experiment_result
from models.lda import LDAwrappers
from models.hdp import HDPwrapper
from models.gsdmm import MovieGroupProcessWrapper
from models.lftm import LFTMwrapper
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [125]:
# Load the pretrained checkpoint named by `model_name` together with its
# tokenizer. `gen_topic_titles` reads `model` and `tokenizer` as globals.
model_name = "google/flan-t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [126]:
def gen_topic_titles(experiment_result, exp_number):
    """Print a generated title for every topic of one experiment run.

    For each topic, its keywords are inserted into a fixed question prompt;
    the module-level `tokenizer` and `model` then generate an answer, which is
    printed next to the 1-based topic number. Nothing is returned.

    Parameters
    ----------
    experiment_result : mapping
        Results keyed by ``"exp_<number>"``. The selected entry must hold a
        ``'word_topic_pvalues'`` sequence whose items each carry a ``'words'``
        list of strings. Inside this function the parameter shadows the
        ``experiment_result`` name imported from ``utils``.
    exp_number
        Value formatted into the ``"exp_<number>"`` key.
    """
    exp_id = "exp_{}".format(exp_number)
    topics = experiment_result[exp_id]['word_topic_pvalues']
    for n, keywords in enumerate(topics, start=1):
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal: the continuation pulled the next
        # source line's indentation into the prompt text.
        prompt = (
            "I have a topic described by the following keywords: [{}]. "
            "Based on the previous keywords, what is this topic about? "
        ).format(", ".join(keywords['words']))
        input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
        output = model.generate(input_ids, max_length=100, num_return_sequences=1, num_beams=4)
        # Only one sequence is requested, so the first entry is the answer.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        print('topic',n,'generated topic name: ', generated_text)


In [92]:
# Read the stored experiment results from experiments_dataset1.json.
with open('experiments_dataset1.json', 'r') as results_file:
    res = json.load(results_file)

In [93]:
# Show the top-level keys of the loaded results.
res.keys()

dict_keys(['lda_experiment', 'gsdmm_experiment', 'hdp_experiment', 'lftm_experiment'])

In [94]:
# Read the stored experiment results from experiments_dataset2.json.
with open('experiments_dataset2.json', 'r') as results_file:
    res2 = json.load(results_file)

In [95]:
import pandas as pd
def to_df(res, metric='c_we', metric_group='c_we'):
    """Tabulate one coherence score and the hyperparameters of every run.

    Parameters
    ----------
    res : dict
        Maps an experiment id to a record providing
        ``record['coherence_metrics'][metric_group][metric]``, the matching
        ``metric + '_std'`` entry, and a ``'hyperparameters'`` dict.
    metric : str, default 'c_we'
        Name of the score to extract; its ``<metric>_std`` companion is
        extracted alongside it.
    metric_group : str, default 'c_we'
        Key under ``'coherence_metrics'`` that holds the score.

    Returns
    -------
    pandas.DataFrame
        One row per experiment, in the iteration order of ``res``, with the
        columns ``metric``, ``metric + '_std'`` and one column per
        hyperparameter.

    Raises
    ------
    KeyError
        If a record lacks one of the keys described above.
    """
    std_name = metric + '_std'
    columns = defaultdict(list)
    for run in res.values():
        scores = run['coherence_metrics'][metric_group]
        columns[metric].append(scores[metric])
        columns[std_name].append(scores[std_name])
        for name, value in run['hyperparameters'].items():
            columns[name].append(value)
    return pd.DataFrame(columns)


In [96]:
def to_df2(res):
    """Tabulate the `u_mass` score and the hyperparameters of every run.

    `res` maps an experiment id to a record; from each record the
    ``['coherence_metrics']['u_mass']`` entries ``'u_mass'`` and
    ``'u_mass_std'`` are taken, followed by every item of
    ``['hyperparameters']``. Returns a DataFrame with one row per experiment.
    """
    columns = defaultdict(list)
    for run in res.values():
        u_mass_scores = run['coherence_metrics']['u_mass']
        columns['u_mass'].append(u_mass_scores['u_mass'])
        columns['u_mass_std'].append(u_mass_scores['u_mass_std'])
        for name, value in run['hyperparameters'].items():
            columns[name].append(value)
    return pd.DataFrame(columns)

In [97]:
def to_df3(res):
    """Tabulate the `excl_we` score and the hyperparameters of every run.

    `res` maps an experiment id to a record; from each record the
    ``['coherence_metrics']['c_we']`` entries ``'excl_we'`` and
    ``'excl_we_std'`` are taken, followed by every item of
    ``['hyperparameters']``. Returns a DataFrame with one row per experiment.
    """
    columns = defaultdict(list)
    for run in res.values():
        # The exclusivity values are stored under the 'c_we' metric group.
        we_scores = run['coherence_metrics']['c_we']
        columns['excl_we'].append(we_scores['excl_we'])
        columns['excl_we_std'].append(we_scores['excl_we_std'])
        for name, value in run['hyperparameters'].items():
            columns[name].append(value)
    return pd.DataFrame(columns)

# Select the experiment of interest

In [127]:
# Pick the result set analysed by the cells that follow; `res` also holds
# 'lda_experiment', 'gsdmm_experiment' and 'hdp_experiment'.
experiment=res['lftm_experiment']

## Word-embedding coherence

#### DATASET 1

In [128]:
# Rank the runs by their c_we score and show the 20 highest.
# `nlargest` already returns the selected rows, so they are displayed
# directly instead of feeding their index labels into the positional `.iloc`.
df=to_df(experiment)
df.nlargest(20, 'c_we')

Unnamed: 0,c_we,c_we_std,num_topics,alpha,beta,_lambda
8,0.273267,0.07695,11,0.132584,0.05,0.750871
11,0.271825,0.093425,10,0.145718,0.231992,0.690472
0,0.266815,0.089807,10,0.05,0.233173,0.844202
18,0.26314,0.088836,9,0.204683,0.05,0.73692
7,0.262716,0.094656,11,0.176057,0.17302,0.789664
6,0.262028,0.083986,10,0.057984,0.119486,0.801961
3,0.254952,0.089734,8,0.22803,0.25,0.718774
4,0.250968,0.113905,8,0.116173,0.096624,0.690027
1,0.249829,0.07283,11,0.071085,0.249383,0.630468
19,0.249544,0.086321,11,0.055084,0.213985,0.765648


Extract the word distribution of the chosen model.

In [123]:
# Write the first 10 words of every topic of the chosen run to res.txt.
# Each word rank starts on a new line (so the file begins with an empty
# line) and every word is followed by ' & '.
exp_number = 10
with open('res.txt', 'w') as out_file:
    for rank in range(10):
        out_file.write('\n')
        topics = experiment['exp_{}'.format(exp_number)]['word_topic_pvalues']
        out_file.writelines(topic['words'][rank]+' & ' for topic in topics)

In [101]:
import statsmodels.api as sm
# Regress the c_we score on the remaining columns plus an intercept term.
y = df['c_we']
x=df.drop(columns=['c_we', 'c_we_std'])
x = sm.add_constant(x)
# The fit gets its own name: the global `model` is the T5 network that
# `gen_topic_titles` calls `.generate` on, so rebinding it here would break
# every title-generation cell executed afterwards.
ols_model = sm.OLS(y, x).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                   c_we   R-squared:                       0.230
Model:                            OLS   Adj. R-squared:                  0.130
Method:                 Least Squares   F-statistic:                     2.296
Date:                Sat, 11 Nov 2023   Prob (F-statistic):              0.105
Time:                        11:58:26   Log-Likelihood:                 69.101
No. Observations:                  27   AIC:                            -130.2
Df Residuals:                      23   BIC:                            -125.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2505      0.024     10.277      0.0

In [129]:
# Print a generated title for each topic of the run selected by `exp_number`.
gen_topic_titles(experiment, exp_number)

topic 1 generated topic name:  People's Republic of China want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to make a bill that makes it clear that white people want to
topic 2 generated topic name:  A black man was killed by a police officer after he was arrested on suspicion of armed robbery and armed robbery.
topic 3 generated topic name:  children are taken to a church for the first time in their lives after being vaccinated.
topic 4 generated topic name:  image for tumblr: tumblr.com/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumblr/tumbl
topic 5 generated topic name:  Science/Tech
topic 6 generated topic name:  World
topi

## UMASS coherence

In [106]:
# Rank the runs by their u_mass score and show the 5 highest.
# `nlargest` already returns the selected rows, so they are displayed
# directly instead of feeding their index labels into the positional `.iloc`.
df=to_df2(experiment)
df.nlargest(5, 'u_mass')

Unnamed: 0,u_mass,u_mass_std,num_topics,decay,passes
15,-0.699165,0.354241,10,0.9,1
24,-0.703127,0.659281,13,0.9,1
1,-0.706969,0.410704,7,0.5,2
3,-0.730332,0.540555,7,0.75,1
12,-0.761578,0.498565,10,0.75,1


In [107]:
# Write the first 10 words of every topic of the chosen run to res.txt.
# Each word rank starts on a new line (so the file begins with an empty
# line) and every word is followed by ' & '.
exp_number = 15

with open('res.txt', 'w') as out_file:
    for rank in range(10):
        out_file.write('\n')
        topics = experiment['exp_{}'.format(exp_number)]['word_topic_pvalues']
        out_file.writelines(topic['words'][rank]+' & ' for topic in topics)

In [112]:
# Print a generated title for each topic of the run selected by `exp_number`.
gen_topic_titles(experiment, exp_number)

topic 1 generated topic name:  Science/Tech
topic 2 generated topic name:  Science/Tech
topic 3 generated topic name:  Science/Tech
topic 4 generated topic name:  Science/Tech
topic 5 generated topic name:  Facebook
topic 6 generated topic name:  Science/Tech
topic 7 generated topic name:  Facebook - Facebook - Tagged with: u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e, u093e
topic 8 generated topic name:  Science/Tech
topic 9 generated topic name:  Science/Tech
topic 10 generated topic name:  Science/Tech


In [108]:
# Regress the u_mass score on the remaining columns plus an intercept term.
y = df['u_mass']
x=df.drop(columns=['u_mass', 'u_mass_std'])
x = sm.add_constant(x)
# The fit gets its own name: the global `model` is the T5 network that
# `gen_topic_titles` calls `.generate` on, so rebinding it here would break
# every title-generation cell executed afterwards.
ols_model = sm.OLS(y, x).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                 u_mass   R-squared:                       0.476
Model:                            OLS   Adj. R-squared:                  0.408
Method:                 Least Squares   F-statistic:                     6.965
Date:                Sat, 11 Nov 2023   Prob (F-statistic):            0.00168
Time:                        12:04:20   Log-Likelihood:                -6.4845
No. Observations:                  27   AIC:                             20.97
Df Residuals:                      23   BIC:                             26.15
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9334      0.401     -2.330      0.0

### LDA exclusivity metric

In [113]:
# Rank the runs by their excl_we score and show the 5 lowest.
# `nsmallest` already returns the selected rows, so they are displayed
# directly instead of feeding their index labels into the positional `.iloc`.
df=to_df3(experiment)
df.nsmallest(5, 'excl_we')

Unnamed: 0,excl_we,excl_we_std,num_topics,decay,passes
11,0.189541,0.117069,10,0.5,5
20,0.207283,0.074082,13,0.5,5
5,0.230796,0.094637,7,0.75,5
26,0.233819,0.131302,13,0.9,5
2,0.237208,0.10138,7,0.5,5


In [114]:
# Write the first 10 words of every topic of the chosen run to res.txt.
# Each word rank starts on a new line (so the file begins with an empty
# line) and every word is followed by ' & '.
exp_number = 11

with open('res.txt', 'w') as out_file:
    for rank in range(10):
        out_file.write('\n')
        topics = experiment['exp_{}'.format(exp_number)]['word_topic_pvalues']
        out_file.writelines(topic['words'][rank]+' & ' for topic in topics)

In [119]:
# Print a generated title for each topic of the run selected by `exp_number`.
gen_topic_titles(experiment, exp_number)

topic 1 generated topic name:  presidential candidate trump claims he has been sworn in as president of the united states on fb
topic 2 generated topic name:  Facebook
topic 3 generated topic name:  Science/Tech
topic 4 generated topic name:  Science/Tech
topic 5 generated topic name:  Science/Tech
topic 6 generated topic name:  Science/Tech
topic 7 generated topic name:  Science/Tech
topic 8 generated topic name:  Science/Tech
topic 9 generated topic name:  Science/Tech
topic 10 generated topic name:  Science/Tech


In [115]:
# Regress the excl_we score on the remaining columns plus an intercept term.
y = df['excl_we']
x=df.drop(columns=['excl_we', 'excl_we_std'])
x = sm.add_constant(x)
# The fit gets its own name: the global `model` is the T5 network that
# `gen_topic_titles` calls `.generate` on, so rebinding it here would break
# every title-generation cell executed afterwards.
ols_model = sm.OLS(y, x).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                excl_we   R-squared:                       0.624
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                     12.72
Date:                Sat, 11 Nov 2023   Prob (F-statistic):           4.16e-05
Time:                        12:13:56   Log-Likelihood:                 63.413
No. Observations:                  27   AIC:                            -118.8
Df Residuals:                      23   BIC:                            -113.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3155      0.030     10.484      0.0