In [1]:
import sys, os, itertools
import re, nltk
from nltk import pos_tag
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import sklearn, gensim
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
sys.path.append(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/src/')
sys.path.append(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/src/')
os.chdir(os.path.expanduser("~")+'/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/data/')
from utils import preprocess_for_bow
from models.lda import LDAwrappers
from models.hdp import HDPwrapper
from models.gsdmm import MovieGroupProcessWrapper
from models.lftm import LFTMwrapper

In [2]:
data = preprocess_for_bow('data.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/romainbourgeois/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
data.keys()

dict_keys(['text', 'ids', 'tokenized_data', 'dictionary', 'corpus'])

In [4]:
lftm=LFTMwrapper(data['tokenized_data'], data['dictionary'])

TypeError: __init__() should return None, not 'str'

In [3]:
def combine(params):
    param_names = list(params.keys())
    param_values = list(params.values())
    param_combinations = list(itertools.product(*param_values))
    return param_combinations

experiment_result = {
    'number_topics': None, #int, None if infered ex: HDP
    'hyperparameters': {},
    'doc_topic_pvalues': { # get for all above or filter in range ?
        '0.35': {}, #list of ids
        '0.50': {}, #list of ids
        '0.60': {}, 
        '0.75': {}, 
        '0.90': {}, 
        '0.95': {}, 
        '0.95': {}, 
    },
    'word_topic_pvalues': dict(),
    'coherence_metrics': defaultdict()
}

def lda_experiment():
    experiment=dict()
    i=0
    print('running lda experiment')
    print(f"{len(combine(lda_param))} experimentations to make")
    for params in combine(lda_param):
        results = experiment_result.copy()
        param_set = dict(zip(list(lda_param.keys()), params))
        print(param_set)
        results['number_topics']=param_set['num_topics']
        results['hyperparameters']=param_set
        model=LDAwrappers(data['corpus'], data['dictionary'], 'LdaModelGensim', num_topics=param_set['num_topics'],
                        decay=param_set['decay'], passes=param_set['passes'])
        for pvalue in results['doc_topic_pvalues'].keys():
            results['doc_topic_pvalues'][pvalue]=model.get_indexes_per_topics(data['corpus'], float(pvalue), data['ids'])
        results['word_topic_pvalues']=model.topics(topn=10)
        results['coherence_metrics']=model.coherence(data['tokenized_data'], ['u_mass']) 
        experiment['exp_'+str(i)]=results
        i+=1
        if i%10==0:
            print(f"runnin the {i}th experiment")
    return experiment

lda_param={'num_topics': [3], 'decay': [0.5], 'passes': [1]}

lda_exp = lda_experiment()



running lda experiment
1 experimentations to make
{'num_topics': 3, 'decay': 0.5, 'passes': 1}


In [24]:
import json
with open('experiments.json', 'r') as json_file:
    res=json.load(json_file)

In [26]:
res['lda_experiment']['exp_0']['coherence_metrics']


{'u_mass': {'u_mass_per_topic': [-3.3153729920505346,
   -2.997888824341584,
   -2.8887212859804547,
   -3.4702990901712445,
   -2.949908083346583],
  'u_mass': -3.1244380551780804,
  'u_mass_std': 0.22720643468828497}}

In [29]:
from models.abstract_model import AbstractModel
import numpy as np
import os, nltk
from gensim.models import LdaModel, LdaMulticore, HdpModel
import logging
from collections import defaultdict
from utils import preprocess_for_bow, preprocess



class LFTMwrapper(AbstractModel):
    """Latent Feature Topic Model

    Source: https://github.com/datquocnguyen/LFTM
    """
    def __init__(self, model, text, id2word): 
        super().__init__() 
        if model not in ['LFLDA', 'LFDMM']:
            raise ValueError('Model should be LFLDA (default) or LFDMM.')
        self.text = text
        self.id2word=id2word
        with open('datalftm.txt', 'w') as fout:
            for i in self.text:
                fout.write(i+'\n')
        
        

ImportError: attempted relative import with no known parent package

In [42]:
import pickle
with open('/Users/romainbourgeois/Desktop/topic_modeling/fine_grained_topic_modeling_for_misinformation/data/glove/glove.6B.300d.pickle', "rb") as input_file:
    d=pickle.load(input_file)

In [43]:
id2word = list(data['dictionary'].values())
tok2remove = {}
for t in id2word:
    if t not in d:
        tok2remove[t] = True

In [44]:
tok2remove

{'accidentalrelease': True,
 'biologicalweapons': True,
 'leveldepopulation': True,
 'wuhanyes': True,
 'makepeople': True,
 'capitolinsurrection': True,
 'covid19': True,
 'newworld': True,
 'satanicpedophilia': True,
 'thebiglie': True,
 'whiteprivilege': True,
 'antifa': True,
 'covid': True,
 'entireyears': True,
 'migrantcaravans': True,
 'newguy': True,
 'thruabortion': True,
 'alexjones': True,
 'qanon': True,
 'blackdeath': True,
 'colidialsilver': True,
 'deadlyviruses': True,
 'retiredcdc': True,
 'bigtalkshow': True,
 'firsthit': True,
 'majorcelebs': True,
 'implantingpeople': True,
 'ludicrousconspiracy': True,
 'manypeople': True,
 'rfidchips': True,
 'faultypresumption': True,
 'minimalresearch': True,
 'qanonresearchers': True,
 'evang': True,
 'denyclimate': True,
 'politicalvaccine': True,
 'covidparticles': True,
 'deepstate': True,
 'filtervirus': True,
 'fomitetransmission': True,
 'spreadvirus': True,
 'uncontrolledenvironment': True,
 'untrainedppl': True,
 'deli

In [47]:
data.keys()

dict_keys(['text', 'ids', 'tokenized_data', 'dictionary', 'corpus'])

In [50]:
def remove_tokens(x, tok2remove):
    return ' '.join(['' if t in tok2remove else t for t in x])
remove_tokens(data['tokenized_data'][0], tok2remove)

'happen level lab develop weapon yes depopulation lean towards release something work weapon test time     '