# Topic Modeling with Gensim and Optimal Topic Coherence

### Importing Libraries and Loading Data

In [1]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly_express as px
import re
import nltk
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Loading dataset
df_games = pd.read_csv('all_games.csv')
df_games.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9


In [3]:
# displaying count of rows and columns
print(f'Count Rows: {df_games.shape[0]}, Count Columns: {df_games.shape[1]}')

Count Rows: 18800, Count Columns: 6


In [4]:
# top ten games with most reviews
df_top_platforms = df_games['platform'].value_counts().head(20).reset_index()
print(df_top_platforms)

             platform  count
0                  PC   4864
1       PlayStation 4   2056
2            Xbox 360   1644
3       PlayStation 2   1414
4              Switch   1399
5       PlayStation 3   1256
6            Xbox One   1179
7                Xbox    789
8                  DS    720
9                 Wii    655
10                PSP    512
11           GameCube    448
12   Game Boy Advance    438
13                3DS    396
14   PlayStation Vita    257
15        PlayStation    187
16              Wii U    184
17          Dreamcast    125
18      PlayStation 5    124
19      Xbox Series X     77


In [5]:
# displaying platform string names
df_games['platform'].unique()

array([' Nintendo 64', ' PlayStation', ' PlayStation 3', ' Dreamcast',
       ' Xbox 360', ' Wii', ' Xbox One', ' PC', ' Switch',
       ' PlayStation 2', ' PlayStation 4', ' GameCube', ' Xbox', ' Wii U',
       ' Game Boy Advance', ' 3DS', ' Xbox Series X', ' DS',
       ' PlayStation Vita', ' PlayStation 5', ' PSP', ' Stadia'],
      dtype=object)

In [6]:
# filtering on ds games
df_ds = df_games.loc[df_games['platform'] == ' DS']
print(df_ds['platform'].value_counts())

platform
 DS    720
Name: count, dtype: int64


In [7]:
# counting games with missing comments
print(df_ds['summary'].isna().sum())

2


In [8]:
# removing games with missing comments
df_ds = df_ds.dropna(subset=['summary'])
print(df_ds['summary'].isna().sum())

0


### Data Preprocessing

In [9]:
# initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# defining stop words
stop_words = set(stopwords.words('english'))
print(stopwords)

<WordListCorpusReader in '/home/studio-lab-user/nltk_data/corpora/stopwords'>


In [10]:
# creating preprocess function
def preprocess(document):
    ''' Performs lowercase, tokenization, 
    stop word removal and lemmatization'''
    document = document.lower() # converts to lower case
    document = word_tokenize(document) # tokenize into words
    document = [word for word in document if word.isalpha()] # remove numbers and punctuation
    document = [word for word in document if word not in stop_words] # remove stop words
    document = [lemmatizer.lemmatize(word) for word in document] # lemmatize words
    document = [word for word in document if len(word) > 1] # removing one character words
    
    return document

In [11]:
# preprocessing texts
preprocessed_texts = [preprocess(document) for document in df_ds['summary']]

# displaying first three summaries
for text in preprocessed_texts[:1]:
    print(text, '\n')

['grand', 'theft', 'auto', 'chinatown', 'war', 'entirely', 'original', 'entry', 'critically', 'acclaimed', 'grand', 'theft', 'auto', 'series', 'brings', 'new', 'level', 'interactivity', 'sprawling', 'open', 'environment', 'use', 'touch', 'screen', 'player', 'navigate', 'way', 'street', 'uncover', 'truth', 'behind', 'epic', 'tale', 'crime', 'corruption', 'within', 'triad', 'crime', 'syndicate', 'delivering', 'unprecedented', 'amount', 'depth', 'become', 'true', 'trademark', 'franchise'] 



### Building LDA Model

In [12]:
# building dictionary
dictionary = Dictionary(preprocessed_texts)

# removing extreme words
dictionary.filter_extremes(no_below=5, no_above=0.25)

# creating corpus
corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]
print(dictionary)

Dictionary<1690 unique tokens: ['acclaimed', 'amount', 'become', 'behind', 'brings']...>


### Optimal Number of Topics Based on Coherence

In [13]:
# list to store coherence scores
coherence_scores = []

# topics range
topics_range = range(2, 21)

# looping through topics range
for num_topics in topics_range:
    # creating lda model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha=0.01,
        eta=0.01
    )
    # creating coherence model with u_mass
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=preprocessed_texts,
        dictionary=dictionary,
        coherence='u_mass'
    )
    # retrieving coherence score
    coherence_score = coherence_model.get_coherence()
    # appending coherence score to list
    coherence_scores.append(coherence_score)

In [14]:
# printing coherence scores
print(coherence_scores)

[-1.566598097660909, -1.5940777129785426, -1.6263689698390407, -1.8050623822419918, -1.9733077063049373, -2.0575507874340184, -1.9176949049449274, -2.2458451576716363, -2.301158497520358, -2.1329265691314316, -2.0807694089700477, -2.172061701889378, -2.3882307240188787, -2.2701791757753345, -2.4060236419237166, -2.5486988550228262, -2.4705895503044295, -2.3888153678937663, -2.5609017705760038]


### Visualizing Coherence Scores

In [15]:
# creating dataframe
df_topics = pd.DataFrame(
    {'Number of Topics': list(topics_range),
     'Coherence Score': coherence_scores}
)

# line plot
fig = px.line(
    df_topics,
    x='Number of Topics',
    y='Coherence Score',
    labels={
        'x': 'Number of Topics',
        'y': 'Coherence Score'},
    title='Coherence Scores for Num of Topics'
)

fig.show(renderer='iframe')

### Optimal Model and Top Words in Topics

In [16]:
# choosing optimal model
optimal_num_topics = topics_range[2]

# training LDA model with optimal number of topics
optimal_lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=optimal_num_topics
)

# output number of topics and top words for each topic
print(f'Optimal Num of Topics: {optimal_num_topics}\n')

Optimal Num of Topics: 4



In [17]:
# creating dictionary that stores topics and top words
optimal_lda_topics = optimal_lda_model.show_topics(
    num_topics=optimal_num_topics, 
    num_words=10,
    formatted=False
)
print(len(optimal_lda_topics))

4


In [18]:
# displayin topics and top words
for topic_id, topic in optimal_lda_topics:
    words = ', '.join([word[0] for word in topic])
    print(f'Topic {topic_id + 1}: {words}\n')

Topic 1: friend, time, also, hero, one, level, puzzle, series, make, including

Topic 2: use, control, way, friend, pokemon, one, adventure, touch, different, character

Topic 3: character, screen, challenge, action, enemy, experience, unique, time, control, adventure

Topic 4: puzzle, screen, friend, unique, action, way, island, character, help, using

