Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Work from the project folder so the relative file names used later resolve against it.
os.chdir('/content/gdrive/My Drive/mads_thesis')
# Show the working directory to confirm the chdir took effect.
!pwd

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/mads_thesis


Install required libraries

In [None]:
!pip install bertopic
!pip install octis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import Libraries

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import pandas as pd
import numpy as np

Create BERTopic model

In [None]:
def bertopic_model(docs):
    """Fit a BERTopic model on ``docs`` and return it with its outputs.

    The model is built with automatic topic reduction (``nr_topics="auto"``),
    a class-based TF-IDF transformer that down-weights frequent words, and
    without computing per-topic probabilities.

    Parameters
    ----------
    docs
        The documents passed straight to ``BERTopic.fit_transform``.

    Returns
    -------
    tuple
        ``(fitted_model, topics, probs)`` where ``topics`` and ``probs`` are
        exactly what ``fit_transform`` returned.
    """
    weighting = ClassTfidfTransformer(reduce_frequent_words=True)
    fitted = BERTopic(
        nr_topics="auto",
        ctfidf_model=weighting,
        calculate_probabilities=False,
    )
    assignments, probabilities = fitted.fit_transform(docs)
    return fitted, assignments, probabilities

Create visualization function

In [None]:
def visualize_model(topic_model, n_topics=10):
    """Print the topic overview and the representation of the first topics.

    Parameters
    ----------
    topic_model
        Object exposing ``get_topic_info()`` and ``get_topic(topic_id)``
        (a fitted BERTopic model in this notebook).
    n_topics : int, optional
        How many topics to print, starting at topic id 0. Defaults to 10,
        which matches the previous hard-coded behaviour.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(topic_model.get_topic_info())
    # Blank lines visually separate the overview table from the per-topic dumps.
    print('\n\n\n')
    for topic_id in range(n_topics):
        print(topic_model.get_topic(topic_id))

Evaluate Model

In [None]:
def evaluate_model(topic_model, docs, topics, probs):
    """Print the c_v topic coherence and the topic diversity of a fitted model.

    Parameters
    ----------
    topic_model
        Fitted BERTopic model. It is used through ``_preprocess_text``,
        ``vectorizer_model``, ``get_topic`` and ``get_topic_info``.
    docs
        The documents the model was fitted on, one entry per document.
    topics
        The topic id assigned to each entry of ``docs`` (same length).
    probs
        Unused; kept so existing calls keep working.

    Returns
    -------
    None
        Both scores are written to standard output.
    """
    # --- Topic Coherence ---
    # Concatenate all documents of a topic into one long document per topic.
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise with the model's own vectorizer so the tokens match the
    # vocabulary the topic words were drawn from.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Score every real topic. The -1 label (BERTopic's outlier bucket) is
    # skipped explicitly instead of assuming it is always present, which
    # previously dropped the last topic whenever there were no outliers.
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    print(f'Coherence Score: {coherence}')

    # --- Topic Diversity ---
    # OCTIS metrics read the topic word lists from the "topics" key of a
    # model-output mapping. Every row of get_topic_info() is passed, exactly
    # as before, so the outlier row is included when the model has one.
    diversity = TopicDiversity(topk=10)
    representations = topic_model.get_topic_info()['Representation'].tolist()
    model_diversity = diversity.score({"topics": representations})
    print(f'\nDiversity is: {model_diversity}\n')



Load data

In [None]:
# Read the three pickled splits (file names are relative to the working directory).
# NOTE: unpickling can execute code stored in the file; only load trusted files.
train_data, dev_data, test_data = (
    pd.read_pickle(path)
    for path in ('Video_Games_final_train.pkl.gz',
                 'Video_Games_final_dev.pkl.gz',
                 'Video_Games_final_test.pkl.gz')
)
# Stack the review texts of all splits (train, dev, test order) into a single
# NumPy array; despite its name, `df` is an array, not a DataFrame.
df = np.array(pd.concat([frame['review_text'] for frame in (train_data, dev_data, test_data)]))

Create model

In [None]:
# Fit the topic model on the combined review texts; `topics` and `probs` are the fit_transform outputs.
model, topics, probs = bertopic_model(df)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:01<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Run visualization

In [None]:
# Print the topic overview table followed by the representations of topics 0-9.
visualize_model(model)

     Topic  Count                                  Name  \
0       -1  12409                   -1_buy_me_fun_after   
1        0  13272               0_xbox_wii_story_system   
2        1   1089              1_sims_sim_zoo_expansion   
3        2    435             2_madden_nba_football_nfl   
4        3    409    3_flight_simulator_aircraft_planes   
..     ...    ...                                   ...   
142    141     11              141_ac1_ac2_turbine_beta   
143    142     11        142_buffy_willow_slayer_xander   
144    143     10   143_airport_airline_tycoon_terminal   
145    144     10  144_sidewinder_joystick_xp_joysticks   
146    145     10   145_cooking_mama_ingredients_recipe   

                                        Representation  \
0    [buy, me, fun, after, itdont, then, playing, l...   
1    [xbox, wii, story, system, final, characters, ...   
2    [sims, sim, zoo, expansion, maxis, dog, animal...   
3    [madden, nba, football, nfl, basketball, ncaa,...   
4

Evaluate model

In [None]:
# Print the c_v coherence and the topic diversity of the fitted model.
evaluate_model(model, df, topics, probs)

Coherence Score: 0.6837356366223892

Diversity is: 0.8965986394557823

