In [2]:
# Mount Google Drive at /content/drive so the files referenced by the later
# cells (all under "/content/drive/My Drive/...") are readable from this runtime.
# Only works inside Google Colab, which provides the `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Put the project folder at the front of the module search path so that the
# notebook-local modules imported later (network_creation, preprocessing,
# community_utils, diversity_metrics) can be found — presumably they live in
# this Drive folder; confirm.
import sys
sys.path.insert(0,'/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code')

In [None]:
# Install the notebook's dependencies into the current runtime.
# NOTE(review): tomotopy, corextopic and igraph are not version-pinned, so a
# fresh runtime may resolve different releases than the ones that produced the
# saved outputs — TODO pin them once the working versions are known.
!pip install setuptools~=67.6.0
!pip install spacy~=3.5.0
!pip install numpy~=1.21.5
!pip install gensim~=4.1.2
!pip install networkx~=2.8.4
!pip install tomotopy
!pip install corextopic
!pip install igraph

In [None]:
import network_creation
from gensim.models.phrases import Phraser, ENGLISH_CONNECTOR_WORDS
import preprocessing
import community_utils
import tomotopy as tp
import networkx as nx
import igraph as ig
from gensim.models.coherencemodel import CoherenceModel
import numpy as np
from diversity_metrics import *

In [27]:
# Load the Europarl (English) train/test splits, one document per line.
# The encoding is given explicitly so decoding does not depend on the runtime's
# locale default.
# NOTE: the `bbc_*` variable names are kept because later cells reference
# `bbc_train`; the data itself is Europarl, not BBC.
# NOTE(review): split("\n") yields a trailing empty document when the file ends
# with a newline — left as-is so document counts match the saved outputs.
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/text_datasets/europarl_en_train.txt", "r", encoding="utf-8") as f:
    bbc_train = f.read().split("\n")
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/text_datasets/europarl_en_test.txt", "r", encoding="utf-8") as f:
    bbc_test = f.read().split("\n")

In [28]:
import pickle

# Restore the pickled experiment bundle; the metrics cell reads its "ep_test"
# and "ep_dict" entries.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open("/content/drive/My Drive/CMPUT 701 - Shraddha/Experiments - Code/ep1_master_object.obj", "rb") as f:
    master_object = pickle.load(f)

In [29]:
import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt

import corextopic.corextopic as ct
import corextopic.vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [30]:
# Transform the training documents (`bbc_train`, loaded from the Europarl train
# file) into a sparse document-word matrix: English stop words removed,
# vocabulary capped at 20,000 features, binary presence indicators (binary=True).
vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
doc_word = vectorizer.fit_transform(bbc_train)
# Convert to CSR storage — NOTE(review): fit_transform presumably already returns CSR, which would make this a no-op; confirm.
doc_word = ss.csr_matrix(doc_word)

doc_word.shape # n_docs x m_words

(19001, 13906)

In [31]:
# Vocabulary as a plain list, taken from the fitted vectorizer's feature names.
# These label the matrix columns and are needed to print readable topics and
# to make anchoring easier.
words = list(np.asarray(vectorizer.get_feature_names_out()))

In [32]:
# Remove purely numeric tokens: find the columns whose word is not all digits,
# keep only those columns of the matrix, and keep the vocabulary aligned.
not_digit_inds = [col for col, token in enumerate(words) if not token.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [words[col] for col in not_digit_inds]

doc_word.shape  # n_docs x m_words

(19001, 13410)

In [33]:
# Train the CorEx topic model with 5 latent topics (n_hidden=5), at most 200
# iterations, and the random seed set to 1.
topic_model = ct.Corex(n_hidden=5, words=words, max_iter=200, verbose=False, seed=1)
# Trailing ';' suppresses the cell's output repr of the fitted model.
topic_model.fit(doc_word, words=words);

In [34]:
# Print every CorEx topic as "<index>: word, word, ...".
# Each topic entry is unpacked as a 3-tuple whose first field is the word.
# The per-topic tuple has its own name (`topic_terms`) so that re-running this
# cell cannot overwrite the notebook-level `topic_words` list that the metrics
# cell reads.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_terms, _, _ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_terms))

0: development, social, european, economic, union, community, regions, policy, employment, funds
1: mr, president, mrs, commissioner, madam, report, thank, like, behalf, gentlemen
2: party, government, political, austrian, kosovo, van, austria, haider, peace, people
3: member, states, commission, rights, council, treaty, legal, affairs, committee, article
4: water, health, car, oil, safety, environment, cars, disaster, dangerous, cost


In [35]:
# Collect the top words of every CorEx topic as a list of lists, the form
# passed to the diversity and coherence metrics.
# The comprehension keeps its loop names local, so the notebook-level
# vocabulary list `words` (used by the digit-filter and CorEx cells) is not
# overwritten; the earlier `words,_,_ = zip(*topic)` unpacking replaced it with
# the last topic's words.
topics = topic_model.get_topics()
topic_words = [[word for word, _, _ in topic] for topic in topics]

print(topic_words)

[['development', 'social', 'european', 'economic', 'union', 'community', 'regions', 'policy', 'employment', 'funds'], ['mr', 'president', 'mrs', 'commissioner', 'madam', 'report', 'thank', 'like', 'behalf', 'gentlemen'], ['party', 'government', 'political', 'austrian', 'kosovo', 'van', 'austria', 'haider', 'peace', 'people'], ['member', 'states', 'commission', 'rights', 'council', 'treaty', 'legal', 'affairs', 'committee', 'article'], ['water', 'health', 'car', 'oil', 'safety', 'environment', 'cars', 'disaster', 'dangerous', 'cost']]


In [37]:
# --- Topic diversity over the top-10 words of each topic ---
# (metric helpers presumably come from `from diversity_metrics import *`)
print("puw:", proportion_unique_words(topic_words, topk=10))
print("jd:", pairwise_jaccard_diversity(topic_words, topk=10))
for irbo_label, irbo_weight in (("irbo p=0.5:", 0.5), ("irbo p=0.9:", 0.9)):
    print(irbo_label, irbo(topic_words, weight=irbo_weight, topk=10))


# --- Topic coherence, evaluated against the texts stored under "ep_test" ---
# Arguments shared by every CoherenceModel built below.
coherence_inputs = dict(
    topics=topic_words,
    texts=master_object["ep_test"],
    dictionary=master_object["ep_dict"],
)
for coherence in ("c_v", "c_npmi", "u_mass"):
    for topn in (5,):
        cm = CoherenceModel(topn=topn, coherence=coherence, **coherence_inputs)
        score = cm.get_coherence()
        print(coherence, ":", score, "(topn=", topn, ")")

puw: 1.0
jd: 1.0
irbo p=0.5: 1.0
irbo p=0.9: 1.0
c_v : 0.31414772494792736 (topn= 5 )
c_npmi : -0.17226005737477018 (topn= 5 )
u_mass : -8.726391718505672 (topn= 5 )
