In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

In [30]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
from typing import Dict
from matplotlib import pyplot as plt

import spacy
from textacy.preprocessing.replace import urls, hashtags, numbers, emails, emojis, currency_symbols
from textacy.preprocessing.remove import punctuation
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [3]:
data = pd.read_csv('./Data/steam_final.csv')
data.head()

Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,votes_funny,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
0,346110,ARK: Survival Evolved,34229965,english,"Great Game, terrible optimisation. Hopefully t...",1502693270,1502693270,False,5,0,...,True,False,True,76561198027465920,147,18,18112.0,1031.0,10328.0,1611217000.0
1,583470,The End Is Nigh,33494894,english,Huh. I actually don't like it. I've 100%-ed ea...,1499918663,1500003674,False,374,6,...,True,False,False,76561198024153634,625,40,81.0,0.0,81.0,1499918000.0
2,306130,The Elder Scrolls Online,38018491,english,very trashy talking about the tsev skyrim le,1512259531,1512259531,False,1,0,...,False,False,False,76561198372062226,67,2,2312.0,0.0,31.0,1541884000.0
3,275850,No Man's Sky,24850717,english,Lag as shit,1471028227,1471028227,False,0,0,...,True,False,False,76561198123479228,47,2,28.0,0.0,28.0,1471026000.0
4,271590,Grand Theft Auto V,33214384,english,Precarious policies.,1499142004,1499142004,False,4,0,...,True,False,False,76561197984388338,74,3,2332.0,0.0,2120.0,1517301000.0


## data preprocess

> encode label: `Recommended` -> `1`, `Not Recommended` -> `0`

In [4]:
data['label'] = np.ones(len(data))
data.loc[data['recommended']==False, 'label'] = 0

In [5]:
data = data[['review', 'label']]
data.head()

Unnamed: 0,review,label
0,"Great Game, terrible optimisation. Hopefully t...",0.0
1,Huh. I actually don't like it. I've 100%-ed ea...,0.0
2,very trashy talking about the tsev skyrim le,0.0
3,Lag as shit,0.0
4,Precarious policies.,0.0


In [6]:
data['review'].str.len().describe()

count    69909.000000
mean       302.586677
std        644.887908
min          1.000000
25%         26.000000
50%         91.000000
75%        291.000000
max       8000.000000
Name: review, dtype: float64

In [7]:
data[data['review'].isnull()]['label'].describe()

count    91.000000
mean      0.857143
std       0.351866
min       0.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       1.000000
Name: label, dtype: float64

drop NA reviews

In [8]:
data.dropna(how='any', inplace=True)
data.shape

(69909, 2)

drop duplicated reviews

In [9]:
data.drop_duplicates('review', inplace=True)
data.shape

(64249, 2)

In [10]:
data

Unnamed: 0,review,label
0,"Great Game, terrible optimisation. Hopefully t...",0.0
1,Huh. I actually don't like it. I've 100%-ed ea...,0.0
2,very trashy talking about the tsev skyrim le,0.0
3,Lag as shit,0.0
4,Precarious policies.,0.0
...,...,...
69994,good game i like it,1.0
69995,It just runs really well and is super streamli...,1.0
69996,I like tthis game iot have gun and it have a b,1.0
69998,Bought it again for the 3rd time on PC (had no...,1.0


### text preprocess

#### clean text

In [11]:
data['review'] = data['review'].\
                  apply(urls).\
                  apply(hashtags).\
                  apply(numbers).\
                  apply(currency_symbols).\
                  apply(emojis).\
                  apply(emails)

In [12]:
data['review'] = data['review'].apply(punctuation)

In [13]:
data['review']

0        Great Game  terrible optimisation  Hopefully t...
1        Huh  I actually don t like it  I ve  NUMBER   ...
2             very trashy talking about the tsev skyrim le
3                                              Lag as shit
4                                     Precarious policies 
                               ...                        
69994                                  good game i like it
69995    It just runs really well and is super streamli...
69996       I like tthis game iot have gun and it have a b
69998    Bought it again for the 3rd time on PC  had no...
69999                   damn good game and super addicting
Name: review, Length: 64249, dtype: object

In [14]:
data['review'] = data['review'].str.replace("  ", " ", regex=True)
data['review'] = data['review'].str.replace(r'(<br />)', " ", regex=True)
data['review'] = data['review'].str.replace(r'[.!?\-\"\\]', ' ', regex=True)
data['review'] = data['review'].str.replace("\n", " ", regex=True)

In [15]:
data['review'] = data['review'].str.replace("EMOJI", "", regex=True)

some data-orientated RegEx cleaning

In [15]:
def regex_cleaning(review):
    review = re.sub(r'\baa*(?:h*)?\b', 'ah', review)
    review = re.sub(r'\ba+n+d+\b', 'and', review)
    review = re.sub(r'\babso?u?lo?u?e?te?l*e?y\b', 'absolutely', review)
    review = re.sub(r'\baf+\b', 'as fuck', review)
    review = re.sub(r'\banyt+hing*s?\b', 'anything', review)
    review = re.sub(r'\bbec+u?a*s*u?(?:s|c|z)*e?\b', 'because', review)
    review = re.sub(r'\bboi+\b', 'boy', review)
    review = re.sub(r'\bbo+r+i+n+g+\b', 'boring', review)
    review = re.sub(r'\bchall(?:e|a)n?a?ge?ing\b', 'challenging', review)
    review = re.sub(r'\bcoo+l\b', 'cool', review)
    review = re.sub(r'\bdam+(?:it)?\b', 'damn', review)
    review = re.sub(r'\b(defina[a-z]+|definet[a-z]+|defini[a-z]+)\b', 'definitely', review)
    review = re.sub(r'\beh+\b', 'eh', review)
    review = re.sub(r'\bes+entu?i?ai?l+y\b', 'essentially', review)
    review = re.sub(r'\bever+\b', 'ever', review)
    review = re.sub(r'\bfavou?i?r?(?:i|0)ti?es?\b', 'favorite', review)
    review = re.sub(r'\bfu+n*\b', 'fun', review)
    review = re.sub(r'\bfu+c+k+(?:i+n+g+)?\b', 'fuck', review)
    review = re.sub(r'\bg+o+d+\b', 'god', review)
    review = re.sub(r'\b(h+a+)+\b', 'haha', review)
    review = re.sub(r'\bh+m+\b', 'hmm', review)
    review = re.sub(r'\blmao+\b', 'laugh', review)
    review = re.sub(r'\b(lo+l+|(?:lo)+l)\b', 'laugh', review)
    review = re.sub(r'\bl(?:o+|u+)v+(?:e+)?\b', 'love', review)
    review = re.sub(r'\bmuli?t?i?pla?(?:y|i)?a?er?s?\b', 'multiplayer', review)
    review = re.sub(r'\bn+i+c+e+\b', 'nice', review)
    review = re.sub(r'\bno+(?:pe)?\b', 'no', review)
    review = re.sub(r'\bo+h+\b', 'oh', review)
    review = re.sub(r'\breco(?:me)?mm*(?:e|a|o)?n*d?o??(?:ed)?\b', 'recommend', review)
    review = re.sub(r'\bre+a+l*y+\b', 'really', review)
    review = re.sub(r'\b(shi+t+)+\b', 'shit', review)
    review = re.sub(r'\bso+\b', 'so', review)
    review = re.sub(r'\bto{2,}\b', 'too', review)   
    review = re.sub(r'\bve+r+y+\b', 'very', review)   
    review = re.sub(r'\bw+a+y+\b', 'way', review)
    review = re.sub(r'\b(wh+y+)+\b', 'why', review)
    review = re.sub(r'\b(wo+w+)+\b', 'wow', review)
    review = re.sub(r'\b(ya+y+)+\b', 'yay', review)
    review = re.sub(r'\bye+a+h?\b', 'yeah', review)
    review = re.sub(r'\bz{2,}\b', 'z', review)
    return review

In [16]:
data['review'] = data['review'].apply(regex_cleaning)

#### removing stopwords

In [17]:
docs = data['review']
labels = data['label']

In [18]:
nlp = spacy.load('en_core_web_sm', disable=["ner", "tagger"])
words_set = set(nlp.vocab.strings)
docs_cleaned = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) 
                                      if (not token.is_stop) and (token.text in words_set)]), docs))



In [19]:
docs_cleaned

['Great Game terrible Hopefully fix instead trying cash grab',
 'Huh actually don t like ve   ed game Edmund ve ah fan ah minute End came quickly t expectations focus far precision Meat Boy s charm s fluidity speed precision miles ah minute far methodical drawn mention ah lot forced trial error pretty meh cartridges ah big ticket item merely stages lo fi graphics Music great s odd charm fact     public domain music rock Isaac graphics fantastic Controls perfect s ah self contained game map stage real yes backtrack world entirety search mileage   real charm ah bland color scheme world little variety waiting jumping right spot repeat opposed Meat Boy s dynamic levels far obstacles variety different mechanics speed kept things fresh End basically Meat Boy distilled main mechanics remove wall jumping sprint switch focus fluidity precision dull stages feel like change feel',
 'trashy talking le',
 'Lag shit',
 'policies',
 'liked game',
 'need Chinese party party Chinese experience Paradox 

remove extra white spaces

In [20]:
docs_cleaned = [re.sub(' +', ' ', i) for i in docs_cleaned]
# docs_cleaned = [sentence for sentence in docs_cleaned if len(sentence)>1]
docs_cleaned

['Great Game terrible Hopefully fix instead trying cash grab',
 'Huh actually don t like ve ed game Edmund ve ah fan ah minute End came quickly t expectations focus far precision Meat Boy s charm s fluidity speed precision miles ah minute far methodical drawn mention ah lot forced trial error pretty meh cartridges ah big ticket item merely stages lo fi graphics Music great s odd charm fact public domain music rock Isaac graphics fantastic Controls perfect s ah self contained game map stage real yes backtrack world entirety search mileage real charm ah bland color scheme world little variety waiting jumping right spot repeat opposed Meat Boy s dynamic levels far obstacles variety different mechanics speed kept things fresh End basically Meat Boy distilled main mechanics remove wall jumping sprint switch focus fluidity precision dull stages feel like change feel',
 'trashy talking le',
 'Lag shit',
 'policies',
 'liked game',
 'need Chinese party party Chinese experience Paradox Chinese 

remove consecutively repeated patterns

In [21]:
docs_cleaned = [re.sub(r'^(.+?)\1+', r'\1', i) for i in docs_cleaned]

#### lemmatization

In [22]:
data_cleaned = pd.DataFrame(docs_cleaned, columns=['review'])
# data_cleaned['label'] = labels # otherwise, index problems
data_cleaned['label'] = labels.tolist()
data_cleaned

Unnamed: 0,review,label
0,Great Game terrible Hopefully fix instead tryi...,0.0
1,Huh actually don t like ve ed game Edmund ve a...,0.0
2,trashy talking le,0.0
3,Lag shit,0.0
4,policies,0.0
...,...,...
64244,god game like,1.0
64245,runs super streamlined,1.0
64246,like game iot gun ah b,1.0
64247,Bought 3rd time PC normal legendary edition pi...,1.0


In [28]:
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [24]:
lemmatizer = WordNetLemmatizer()

In [31]:
data_cleaned['review'] = data_cleaned['review'].apply(lemmatize_sentence)

In [32]:
data_cleaned

Unnamed: 0,review,label
0,Great Game terrible Hopefully fix instead try ...,0.0
1,Huh actually don t like ve ed game Edmund ve a...,0.0
2,trashy talk le,0.0
3,Lag shit,0.0
4,policy,0.0
...,...,...
64244,god game like,1.0
64245,run super streamline,1.0
64246,like game iot gun ah b,1.0
64247,Bought 3rd time PC normal legendary edition pi...,1.0


In [33]:
data_cleaned.to_csv('./Data/steam_cleaned_v2.csv')