Mount Drive



In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/mads_thesis')
!pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/mads_thesis


Install libraries

In [2]:
!pip install octis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting octis
  Downloading octis-1.12.1-py2.py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.9/130.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.2.0 (from octis)
  Downloading gensim-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn==0.24.2 (from octis)
  Downloading scikit-learn-0.24.2.tar.gz (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-optimize>=0.8.1 (from octis)
  Down

Import libraries

In [3]:
from gensim import corpora
import numpy as np
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
import pandas as pd

Build corpus

In [4]:
def build_corpus(train_data):
    print(train_data['pre_processed_text'])
    train_data[['pre_processed_text']].to_csv('corpus.tsv', sep='\t', index=False)

Create model

In [5]:
def lda_model(train_data):
    model = LDA(num_topics=30)
    trained_model = model.train_model(train_data)
    return trained_model

Create Evaluation

In [6]:
def evaluate_model(trained_model, dataset):
    cv = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_v')
    model_cv = cv.score(trained_model)

    print(f'\nCoherence is: {model_cv}\n')

    diversity = TopicDiversity(topk=10)
    model_diversity = diversity.score(trained_model)

    print(f'\nDiversity is: {model_diversity}\n')

    print('Topics:\n')
    for topic in trained_model['topics'][:10]:
        print(' '.join(topic))

Read data

In [9]:
train_data = pd.read_pickle('Video_Games_final_train.pkl.gz')
dev_data = pd.read_pickle('Video_Games_final_dev.pkl.gz')
test_data = pd.read_pickle('Video_Games_final_test.pkl.gz')
df = pd.concat([train_data['pre_processed_text'], dev_data['pre_processed_text'], test_data['pre_processed_text']])

Save data as tsv (only way to read it using OCTIS)

In [10]:
df.to_csv('corpus.tsv', sep='\t', index=False)

Load Data

In [11]:
dataset = Dataset()
dataset.load_custom_dataset_from_folder('.')

Train Model

In [12]:
trained_model = lda_model(dataset)

Evaluate Model

In [13]:
evaluate_model(trained_model, dataset)


Coherence is: 0.4108791283067196


Diversity is: 0.6566666666666666

Topics:

'unit', 'game', 'strategy', 'battle', 'new', 'war', 'one', 'empire', 'build', 'map',
'card', 'game', 'memory', 'sonic', 'zelda', 'link', 'gamecube', 'evil', 'resident', 'sega',
'mode', 'player', 'game', 'play', 'new', 'team', 'year', 'madden', 'like', 'ball',
'game', 'sims', 'new', 'get', 'sim', 'expansion', 'play', 'like', 'would', 'pack',
'game', 'level', 'get', 'one', 'play', 'well', 'new', 'different', 'player', 'time',
'city', 'wheel', 'gta', 'keyboard', 'vice', 'car', 'mouse', 'grand', 'auto', 'theft',
'buy', 'screen', 'one', 'case', 'get', 'pad', 'battery', 'use', 'work', 'psp',
'pinball', 'game', 'bug', 'table', 'installed', 'forum', 'soe', 'cu', 'megaman', 'sing',
'money'] 'waste', 'socom', 'cheat', 'good'] 'gameshark', 'money', 'youre', 'dont', 'shark',
'game', 'like', 'one', 'get', 'movie', 'time', 'even', 'enemy', 'much', 'first',
