In [1]:
import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, Phrases
from gensim.models.phrases import Phraser
from gensim.models import LdaModel



# Plotting tools
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
from steam_reviews import ReviewLoader

appid_list = [
211420,
570940,
236430,
335300,
374320,
814380,
1245620,
485510,
1325200,
1448440,
1172380,
1774580,
265300,
678960,
606280,
378540,
644830
]

# language_list = ['english', 'schinese', 'russian']
language_list = ['english']
text_data = []

for appid in appid_list:
    for language in language_list:
        file_path_n = 'reviews_' + str(appid) + '_' + language + '_n.json'
        reviews_n = ReviewLoader().load_from_local(file_path_n)
        review_list_n = reviews_n.review_list()
        text_data += review_list_n

len(text_data)

87810

In [3]:
text_data

['controls are pretty shonky and does not translate well',
 '[h1]1/10',
 "I have to admit, I just can't get into Dark Souls.  The 3d Combat just feels clunky to me, I'm a mouse and keyboard guy for most games, and controller for platformers. But 3D combat on a controller just feels weird.  I beat the entire game, but it was like pulling teeth for me.  \n\nBut I appreciate and respect the hell out of the game.  It popularized difficulty in video games, and showed the world that people want challenges in their games.  Also, the level design was pretty neat, and not enough games design their levels good enough.  Ah well, I really wish I could have enjoyed this.",
 'Returning players will enjoy it but new comers won\'t.  When Dark Souls came out, it was kind of fun.  The RPG aspects, story telling, combat, large world, diverse enemy\'s, different play styles.  In it\'s time, the game set a new standard and it was awesome.  But compared to today\'s standards, the game isn\'t holding up.  \n

In [4]:
import re

# ÂéªÈô§ÁâπÊÆäÂ≠óÁ¨¶ÂíåÊï∞Â≠ó
cleaned_text = [re.sub(r'[^a-zA-Z\s]', '', sentence) for sentence in text_data]
# ËΩ¨Âåñ‰∏∫Â∞èÂÜô
lowercase_text = [sentence.lower() for sentence in cleaned_text]

In [5]:
cleaned_text

['controls are pretty shonky and does not translate well',
 'h',
 'I have to admit I just cant get into Dark Souls  The d Combat just feels clunky to me Im a mouse and keyboard guy for most games and controller for platformers But D combat on a controller just feels weird  I beat the entire game but it was like pulling teeth for me  \n\nBut I appreciate and respect the hell out of the game  It popularized difficulty in video games and showed the world that people want challenges in their games  Also the level design was pretty neat and not enough games design their levels good enough  Ah well I really wish I could have enjoyed this',
 'Returning players will enjoy it but new comers wont  When Dark Souls came out it was kind of fun  The RPG aspects story telling combat large world diverse enemys different play styles  In its time the game set a new standard and it was awesome  But compared to todays standards the game isnt holding up  \n\nCons  Bad clipping BS hit boxes broken environme

In [6]:
lowercase_text

['controls are pretty shonky and does not translate well',
 'h',
 'i have to admit i just cant get into dark souls  the d combat just feels clunky to me im a mouse and keyboard guy for most games and controller for platformers but d combat on a controller just feels weird  i beat the entire game but it was like pulling teeth for me  \n\nbut i appreciate and respect the hell out of the game  it popularized difficulty in video games and showed the world that people want challenges in their games  also the level design was pretty neat and not enough games design their levels good enough  ah well i really wish i could have enjoyed this',
 'returning players will enjoy it but new comers wont  when dark souls came out it was kind of fun  the rpg aspects story telling combat large world diverse enemys different play styles  in its time the game set a new standard and it was awesome  but compared to todays standards the game isnt holding up  \n\ncons  bad clipping bs hit boxes broken environme

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# ÂàÜËØç
tokenized_text = [word_tokenize(sentence) for sentence in lowercase_text]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YouChain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
tokenized_text

[['controls',
  'are',
  'pretty',
  'shonky',
  'and',
  'does',
  'not',
  'translate',
  'well'],
 ['h'],
 ['i',
  'have',
  'to',
  'admit',
  'i',
  'just',
  'cant',
  'get',
  'into',
  'dark',
  'souls',
  'the',
  'd',
  'combat',
  'just',
  'feels',
  'clunky',
  'to',
  'me',
  'im',
  'a',
  'mouse',
  'and',
  'keyboard',
  'guy',
  'for',
  'most',
  'games',
  'and',
  'controller',
  'for',
  'platformers',
  'but',
  'd',
  'combat',
  'on',
  'a',
  'controller',
  'just',
  'feels',
  'weird',
  'i',
  'beat',
  'the',
  'entire',
  'game',
  'but',
  'it',
  'was',
  'like',
  'pulling',
  'teeth',
  'for',
  'me',
  'but',
  'i',
  'appreciate',
  'and',
  'respect',
  'the',
  'hell',
  'out',
  'of',
  'the',
  'game',
  'it',
  'popularized',
  'difficulty',
  'in',
  'video',
  'games',
  'and',
  'showed',
  'the',
  'world',
  'that',
  'people',
  'want',
  'challenges',
  'in',
  'their',
  'games',
  'also',
  'the',
  'level',
  'design',
  'was',
  'pre

In [9]:
from nltk.corpus import stopwords
import nltk

# ÂéªÈô§ÂÅúÁî®ËØç
custom_stopwords = ['dark', 'souls', 'game', 'games', 'sekiro', 'elden', 'ring', 'nioh', 'wo', 'long', 'fallen dynasty',
                   'star', 'wars', 'jedi', 'lords', 'fallen', 'code', 'vein', 'darksiders', 'surge']
stop_words = set(stopwords.words('english'))
all_stopwords = stop_words.union(custom_stopwords)
filtered_text = [[word for word in sentence if word.lower() not in all_stopwords] for sentence in tokenized_text]
filtered_text

[['controls', 'pretty', 'shonky', 'translate', 'well'],
 ['h'],
 ['admit',
  'cant',
  'get',
  'combat',
  'feels',
  'clunky',
  'im',
  'mouse',
  'keyboard',
  'guy',
  'controller',
  'platformers',
  'combat',
  'controller',
  'feels',
  'weird',
  'beat',
  'entire',
  'like',
  'pulling',
  'teeth',
  'appreciate',
  'respect',
  'hell',
  'popularized',
  'difficulty',
  'video',
  'showed',
  'world',
  'people',
  'want',
  'challenges',
  'also',
  'level',
  'design',
  'pretty',
  'neat',
  'enough',
  'design',
  'levels',
  'good',
  'enough',
  'ah',
  'well',
  'really',
  'wish',
  'could',
  'enjoyed'],
 ['returning',
  'players',
  'enjoy',
  'new',
  'comers',
  'wont',
  'came',
  'kind',
  'fun',
  'rpg',
  'aspects',
  'story',
  'telling',
  'combat',
  'large',
  'world',
  'diverse',
  'enemys',
  'different',
  'play',
  'styles',
  'time',
  'set',
  'new',
  'standard',
  'awesome',
  'compared',
  'todays',
  'standards',
  'isnt',
  'holding',
  'cons'

In [9]:
# import nltk
# from nltk.stem import SnowballStemmer

# stemmer = SnowballStemmer("english")

# stemmed_text = [[stemmer.stem(word) for word in sentence] for sentence in filtered_text]

In [10]:
# ÂáÜÂ§áÊñáÊú¨Êï∞ÊçÆÂíåÂàÜËØç
# ËøôÈáåÂÅáËÆæ‰Ω†Â∑≤ÁªèÊúâ‰∫Ü‰∏Ä‰∏™ÊñáÊú¨Êï∞ÊçÆÈõÜÂπ∂ËøõË°å‰∫ÜÂàÜËØçÂ§ÑÁêÜ

# ÂàõÂª∫ËØçÂÖ∏ÂíåÊñáÊ°£-ËØçÈ¢ëÁü©Èòµ
dictionary = corpora.Dictionary(filtered_text)  # your_corpusÊòØÂ∑≤ÂàÜËØçÁöÑÊñáÊú¨Êï∞ÊçÆ
corpus = [dictionary.doc2bow(text) for text in filtered_text]

# ÈÄâÊã©‰∏çÂêå‰∏ªÈ¢òÊï∞ËøõË°åËÆ≠ÁªÉÂíåËØÑ‰º∞
start = 3  # ‰∏ªÈ¢òÊï∞ÁöÑËµ∑ÂßãÂÄº
limit = 10  # ‰∏ªÈ¢òÊï∞ÁöÑÁªìÊùüÂÄº
step = 1  # ‰∏ªÈ¢òÊï∞ÁöÑÊ≠•Èïø

coherence_scores = []  # Áî®‰∫éÂ≠òÂÇ®‰∏ªÈ¢ò‰∏ÄËá¥ÊÄßÂàÜÊï∞
perplexity_scores = []  # Áî®‰∫éÂ≠òÂÇ®Âõ∞ÊÉëÂ∫¶ÂàÜÊï∞

for num_topics in range(start, limit, step):
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
    
    # ËÆ°ÁÆó‰∏ªÈ¢ò‰∏ÄËá¥ÊÄßÂàÜÊï∞
    coherence_model = CoherenceModel(model=lda_model, texts=filtered_text, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)
    
    # ËÆ°ÁÆóÂõ∞ÊÉëÂ∫¶ÂàÜÊï∞
    perplexity_score = lda_model.log_perplexity(corpus)
    perplexity_scores.append(perplexity_score)

# ÊâìÂç∞ÁªìÊûú
print("‰∏ªÈ¢òÊï∞ vs. ‰∏ªÈ¢ò‰∏ÄËá¥ÊÄßÂàÜÊï∞:")
for num_topics, coherence_score in zip(range(start, limit, step), coherence_scores):
    print(f"{num_topics} ‰∏ªÈ¢ò: {coherence_score}")

print("\n‰∏ªÈ¢òÊï∞ vs. Âõ∞ÊÉëÂ∫¶ÂàÜÊï∞:")
for num_topics, perplexity_score in zip(range(start, limit, step), perplexity_scores):
    print(f"{num_topics} ‰∏ªÈ¢ò: {perplexity_score}")

‰∏ªÈ¢òÊï∞ vs. ‰∏ªÈ¢ò‰∏ÄËá¥ÊÄßÂàÜÊï∞:
3 ‰∏ªÈ¢ò: 0.42717070109493477
4 ‰∏ªÈ¢ò: 0.4694556085796414
5 ‰∏ªÈ¢ò: 0.4388745274424649
6 ‰∏ªÈ¢ò: 0.45611270202933757
7 ‰∏ªÈ¢ò: 0.39655371231044073
8 ‰∏ªÈ¢ò: 0.41564137216553865
9 ‰∏ªÈ¢ò: 0.4211719085587638

‰∏ªÈ¢òÊï∞ vs. Âõ∞ÊÉëÂ∫¶ÂàÜÊï∞:
3 ‰∏ªÈ¢ò: -7.980958102962883
4 ‰∏ªÈ¢ò: -7.9944673377874675
5 ‰∏ªÈ¢ò: -8.021047365440017
6 ‰∏ªÈ¢ò: -8.066933663748015
7 ‰∏ªÈ¢ò: -8.12944171575294
8 ‰∏ªÈ¢ò: -8.209081939540175
9 ‰∏ªÈ¢ò: -8.320238222659734


In [10]:
# ÂàõÂª∫ËØçË¢ãÊ®°Âûã
dictionary = corpora.Dictionary(filtered_text)
# Â∞ÜÊñáÊú¨Êï∞ÊçÆËΩ¨Âåñ‰∏∫ÊñáÊ°£-ËØçÈ¢ëÁü©Èòµ
corpus = [dictionary.doc2bow(text) for text in filtered_text]

In [11]:
# ËÆ≠ÁªÉLDAÊ®°Âûã
num_topics = 10
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

In [12]:
# Êü•Áúã‰∏ªÈ¢ò
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.031*"like" + 0.015*"dont" + 0.014*"really" + 0.013*"good" + 0.011*"im" + 0.010*"would" + 0.010*"get" + 0.010*"one" + 0.010*"play" + 0.008*"first"')
(1, '0.073*"shit" + 0.049*"fucking" + 0.040*"trash" + 0.035*"like" + 0.034*"fuck" + 0.032*"na" + 0.031*"garbage" + 0.026*"ass" + 0.023*"sucks" + 0.019*"gon"')
(2, '0.058*"planet" + 0.052*"stutters" + 0.046*"cons" + 0.044*"pros" + 0.020*"dog" + 0.019*"mid" + 0.019*"simulator" + 0.015*"easy" + 0.011*"grind" + 0.010*"recover"')
(3, '0.024*"combat" + 0.017*"story" + 0.013*"like" + 0.012*"design" + 0.011*"feels" + 0.010*"good" + 0.009*"gameplay" + 0.008*"feel" + 0.008*"world" + 0.008*"boring"')
(4, '0.029*"coop" + 0.022*"play" + 0.019*"multiplayer" + 0.014*"online" + 0.013*"friend" + 0.012*"app" + 0.012*"origin" + 0.011*"steam" + 0.010*"friends" + 0.010*"player"')
(5, '0.109*"camera" + 0.021*"scene" + 0.020*"hack" + 0.019*"slash" + 0.019*"unresponsive" + 0.015*"controls" + 0.014*"slow" + 0.011*"motion" + 0.009*"lag" + 0.009*"e"')
(6, '0.0

In [13]:
pyLDAvis.enable_notebook()
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

In [14]:
pyLDAvis.save_html(lda_display, 'output_eng_n.html')

#### 

In [15]:
import pickle

# ÂÅáËÆæ‰Ω†ÁöÑLDAÊ®°ÂûãÂëΩÂêç‰∏∫lda_model
with open('lda_model_en_t10.pkl', 'wb') as f:
    pickle.dump(lda_model, f)

In [16]:
import pickle

# Âä†ËΩΩLDAÊ®°Âûã
with open('lda_model_en_t10.pkl', 'rb') as f:
    lda_model = pickle.load(f)

In [17]:
from steam_reviews import ReviewLoader

appid_list = [
211420,
570940,
236430,
335300,
374320,
814380,
1245620,
485510,
1325200,
1448440,
1172380,
1774580,
265300,
678960,
606280,
378540,
644830
]

# language_list = ['english', 'schinese', 'russian']
language_list = ['english']

topic_mapping = {0: 2, 1: 8, 2: 10, 3: 3, 4: 7, 5: 9, 6: 6, 7: 4, 8: 5, 9: 1}
group_mapping = {2: 0, 6: 0, 10: 0, 4: 1, 5: 1, 9: 1, 7: 2, 1: 3, 3: 4, 8: 5}
index = 0
results = []

for appid in appid_list:
    for language in language_list:
        result_list = [0] * 6
        file_path_n = 'reviews_' + str(appid) + '_' + language + '_n.json'
        reviews_n = ReviewLoader().load_from_local(file_path_n)
        review_list_n = reviews_n.review_list()
        for review in review_list_n:
            topics = lda_model[corpus[index]]
            top_topic = max(topics, key=lambda x: x[1])  # ÊâæÂà∞Ê¶ÇÁéáÊúÄÈ´òÁöÑ‰∏ªÈ¢ò
            topic_index = topic_mapping[top_topic[0]]
            result_list[group_mapping[topic_index]] += 1
            index += 1
            
        print(result_list)    
        results.append(result_list)

[1849, 1614, 232, 411, 196, 172]
[1692, 637, 589, 463, 333, 163]
[1297, 522, 218, 379, 305, 105]
[2783, 622, 278, 1209, 1182, 417]
[3593, 1959, 917, 1404, 725, 432]
[1772, 594, 32, 1315, 555, 209]
[8297, 9383, 905, 3991, 2885, 1136]
[479, 476, 6, 414, 248, 57]
[284, 246, 13, 243, 94, 34]
[335, 444, 10, 206, 128, 25]
[2876, 1105, 235, 1303, 1870, 274]
[1970, 6887, 121, 309, 296, 241]
[1157, 589, 23, 745, 637, 110]
[951, 160, 77, 524, 690, 108]
[472, 104, 3, 377, 261, 15]
[310, 71, 7, 515, 241, 31]
[215, 124, 5, 140, 121, 26]


In [19]:
import csv

# ÊåáÂÆöË¶Å‰øùÂ≠òÁöÑÊñá‰ª∂Âêç
filename = "output.csv"

# ‰ΩøÁî® csv Ê®°ÂùóÂàõÂª∫ CSV Êñá‰ª∂Âπ∂ÂÜôÂÖ•Êï∞ÊçÆ
with open(filename, mode="w", newline="") as file:
    writer = csv.writer(file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # ÈÅçÂéÜ‰∫åÁª¥ÂàóË°®Âπ∂ÂÜôÂÖ• CSV Êñá‰ª∂
    for row in results:
        writer.writerow(row)