# Sentiment analysis & keyword extraction for user reviews on Steam

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, confusion_matrix

import yake as yk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Dark seaborn style with the axes grid switched off.
sns.set_style(style='dark', rc={'axes.grid': False})

## Sentiment analysis

In [36]:
# Load the Steam reviews CSV and preview its first five rows.
reviews_csv = '../../datasets/steam_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head(5)

Unnamed: 0,game_id,review,sentiment
0,292030,one greatest rpgs date triumph genre even 7 ye...,1
1,292030,stunning upgraded graphic best rpg game ever,1
2,292030,rowch,1
3,292030,leave cat friend pas pet give thumb フ l ミ xノ ヽ...,1
4,292030,consumed,1


In [37]:
# training and testing data
#
# Positional split: the first TRAIN_SIZE rows are used for training and the
# remaining rows for testing. Rows are taken in file order (no shuffling).
# Each split is sliced first and then copied, so only the rows that are kept
# get duplicated instead of copying the whole dataframe twice.
TRAIN_SIZE = 1500

df_train = df.iloc[:TRAIN_SIZE].copy()
df_test = df.iloc[TRAIN_SIZE:].copy()
print(df_train.shape, df_test.shape)

(1500, 3) (500, 3)


In [38]:
# Number of reviews that are missing (NaN) after cleaning.
df['review'].isna().sum()

15

In [39]:
# Discard rows that contain missing values from both splits.
df_train, df_test = (split.dropna() for split in (df_train, df_test))
print(df_train.shape, df_test.shape)

(1488, 3) (497, 3)


### SVM model

In [40]:
# Bag-of-words encoding: the vocabulary is learned from the training reviews
# only and then reused, unchanged, to encode the test reviews.
docs_train = df_train['review'].tolist()
docs_test = df_test['review'].tolist()

vectorizer = CountVectorizer()
vectors_train = vectorizer.fit_transform(docs_train)
vectors_test = vectorizer.transform(docs_test)

# Mapping built by the vectorizer while fitting on the training reviews.
vocab_train = vectorizer.vocabulary_

In [41]:
# Vocabulary container type and size, followed by the shape of the
# training count matrix.
vocab_type, vocab_size = type(vocab_train), len(vocab_train)
print(vocab_type, vocab_size)
vectors_train.shape

<class 'dict'> 5145


(1488, 5145)

In [42]:
# Dense feature arrays and label vectors for each split.
X_train = vectors_train.toarray()
X_test = vectors_test.toarray()

Y_train = df_train['sentiment'].to_numpy()
Y_test = df_test['sentiment'].to_numpy()

print(X_train.shape, Y_train.shape)

(1488, 5145) (1488,)


In [43]:
# Class balance of the training labels, printed as: positive negative.
# Counting on the label vector avoids building two boolean-indexed copies of
# the dense feature matrix just to take their lengths.
print(np.count_nonzero(Y_train == 1), np.count_nonzero(Y_train == 0))

1320 168


In [44]:
# need to upsample the negative sentiment category
#
# Draw extra negative rows (resample samples with replacement by default) so
# that the negative class reaches the size of the positive class. The fixed
# random_state makes the draw, and everything trained on it, repeatable
# between runs.
RESAMPLE_SEED = 42

minority_samples_needed = np.count_nonzero(Y_train == 1) - np.count_nonzero(Y_train == 0)
X_re_neg = resample(
    X_train[Y_train == 0],
    n_samples=minority_samples_needed,
    random_state=RESAMPLE_SEED,
)
print(type(X_re_neg), X_re_neg.shape)

<class 'numpy.ndarray'> (1152, 5145)


In [45]:
# Append the resampled negative rows and their labels (0) to the training set.
# The new labels are created with Y_train's own dtype: np.zeros defaults to
# float64, which would otherwise promote the whole label vector to float.
# NOTE: this cell rebinds X_train / Y_train to the enlarged arrays, so running
# it a second time appends the resampled rows again.
X_train = np.concatenate((X_train, X_re_neg))
Y_train = np.concatenate((Y_train, np.zeros(len(X_re_neg), dtype=Y_train.dtype)))
print(X_train.shape, Y_train.shape)

(2640, 5145) (2640,)


In [46]:
# Fit an RBF-kernel support vector classifier on the training arrays and
# predict a label for every test review.
classifier = svm.SVC(kernel='rbf').fit(X_train, Y_train)

predictions = classifier.predict(X_test)
predictions.shape

(497,)

In [47]:
# Score the test predictions: confusion matrix and plain accuracy.
test_confusion = confusion_matrix(Y_test, predictions)
test_accuracy = accuracy_score(Y_test, predictions)

print(f"Confusion matrix:\n{test_confusion}\nAccuracy_score: {test_accuracy}")

Confusion matrix:
[[  7  36]
 [  8 446]]
Accuracy_score: 0.9114688128772636


## Keyword extraction

In [48]:
# The keyword-extraction cells join review strings together, so rows with
# missing values are removed from the full dataframe first.
df = df.dropna(axis='index', how='any')

In [49]:
# YAKE extractor settings: language code, maximum n-gram size and the number
# of keywords to return per document.
language, max_ngram, keywords_num = 'en', 1, 25

extractor = yk.KeywordExtractor(
    lan=language,
    n=max_ngram,
    top=keywords_num,
)

In [50]:
# keywords by sentiment
#
# All reviews of one sentiment class are joined into a single document and
# the extractor is run once per class.
is_positive = df["sentiment"] == 1
is_negative = df["sentiment"] == 0

pos_revs = ' '.join(df.loc[is_positive, "review"].tolist())
neg_revs = ' '.join(df.loc[is_negative, "review"].tolist())

pos_kws = extractor.extract_keywords(pos_revs)
neg_kws = extractor.extract_keywords(neg_revs)

# Each result is unpacked as a pair; only its first item (the keyword) is shown.
print(f"Positive:\n {[kw for kw, _ in pos_kws]}")
print(f"Negative:\n {[kw for kw, _ in neg_kws]}")

Positive:
 ['game', 'story', 'good', 'great', 'play', 'time', 'played', 'world', 'graphic', 'amazing', 'character', 'fun', 'gameplay', 'love', 'playing', 'experience', 'make', 'recommend', 'feel', 'run', 'lot', 'beautiful', 'hour', 'worth', 'part']
Negative:
 ['game', 'play', 'story', 'time', 'woman', 'hour', 'crash', 'run', 'good', 'men', 'world', 'issue', 'character', 'boring', 'graphic', 'people', 'played', 'make', 'port', 'playing', 'rockstar', 'setting', 'combat', 'buy', 'feel']


In [51]:
# keywords by game
#
# One extraction per game id (ids in order of first appearance): every review
# of that game is joined into a single document before extraction.
game_kws = {
    game_id: extractor.extract_keywords(
        ' '.join(df.loc[df["game_id"] == game_id, "review"].tolist())
    )
    for game_id in df.game_id.unique()
}

# Print only the keyword (first item) of each extracted pair, per game.
for game_id, kws in game_kws.items():
  print(f'{game_id}:{[w for w, _ in kws]}\n')

292030:['game', 'story', 'good', 'rpg', 'witcher', 'world', 'play', 'played', 'great', 'time', 'quest', 'hour', 'character', 'playing', 'fun', 'make', 'combat', 'experience', 'side', 'feel', 'amazing', 'made', 'gameplay', 'rpgs', 'geralt']

1888930:['game', 'story', 'good', 'play', 'time', 'run', 'played', 'graphic', 'great', 'issue', 'part', 'bug', 'crash', 'performance', 'amazing', 'port', 'fps', 'setting', 'playing', 'recommend', 'character', 'experience', 'gameplay', 'worth', 'thing']

1151640:['game', 'story', 'good', 'great', 'world', 'play', 'graphic', 'time', 'gameplay', 'fun', 'character', 'amazing', 'played', 'combat', 'beautiful', 'machine', 'enemy', 'open', 'feel', 'love', 'lot', 'woman', 'quest', 'aloy', 'recommend']

1174180:['game', 'story', 'good', 'play', 'world', 'time', 'rockstar', 'dead', 'love', 'great', 'arthur', 'red', 'character', 'played', 'online', 'fun', 'redemption', 'hour', 'player', 'amazing', 'make', 'horse', 'mission', 'open', 'feel']

