In [29]:
import ast
import re
from re import finditer

import nltk
import numpy as np
import pandas as pd
from flair.data import Sentence
from flair.models import TextClassifier
from langdetect import DetectorFactory, detect
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from profanity_check import predict, predict_prob

In [2]:
# Read the raw train/test splits from the working directory and report their sizes.
train, test = pd.read_csv("train_ds.csv"), pd.read_csv("test_ds.csv")
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

Train shape: (17494, 9)
Test shape: (8045, 8)


## Feature Engineering

### Language Detection

Our datasets may contain reviews written in different languages.

We'll use the *langdetect* library to detect the language of each review and add it as a feature to our dataset.

In [3]:
# langdetect is non-deterministic unless the factory is seeded; fixing the seed
# makes the detected languages (and the row filter built on them) reproducible.
DetectorFactory.seed = 0


def detect_language(review):
    """Return the langdetect language code for ``review``, or 'error' on failure.

    The value is passed through ``str`` before detection, so non-string cells
    are analysed as their string representation instead of raising.
    """
    try:
        return detect(str(review))
    except Exception:
        # Best-effort: any detection failure tags the row as 'error' (such rows
        # are not 'en', so they are removed by the English-only filter).
        # Unlike a bare ``except``, this lets KeyboardInterrupt stop the loop.
        return 'error'


languages_train = [detect_language(review) for review in train.user_review]
languages_test = [detect_language(review) for review in test.user_review]

train['language'] = languages_train
test['language'] = languages_test

In [4]:
# Fraction of TRAIN reviews detected as English; computed once and reused for
# both messages. The rows are game reviews, so the message says "reviews".
english_ratio_train = train[train.language=='en'].shape[0]/train.shape[0]
print(f"There are {english_ratio_train*100}% reviews that are in TRAIN English language")
print(f"There are {(1 - english_ratio_train)*100}% reviews that are NOT in TRAIN English language")

There are 98.77672344803933% songs that are in TRAIN English language
There are 1.2232765519606725% songs that are NOT in TRAIN English language


In [5]:
# Fraction of TEST reviews detected as English; computed once and reused for
# both messages. The rows are game reviews, so the message says "reviews".
english_ratio_test = test[test.language=='en'].shape[0]/test.shape[0]
print(f"There are {english_ratio_test*100}% reviews that are in TEST English language")
print(f"There are {(1 - english_ratio_test)*100}% reviews that are NOT in TEST English language")

There are 98.91858297078932% songs that are in TEST English language
There are 1.081417029210685% songs that are NOT in TEST English language


Since only about 1% of the reviews in each split are non-English, we'll drop them from our dataset.

In [6]:
# Keep only the reviews detected as English.
# `.copy()` detaches the filtered frames from the originals so that the column
# assignments in later cells write to an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
train = train[train.language == 'en'].copy()
test = test[test.language == 'en'].copy()
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (17280, 10)
Test shape: (7958, 9)


### Profanity

We'll add another feature, `has_swearing`, that tells us whether a review contains profanity. This feature may correlate with a negative sentiment in the user's review.

In [7]:
# `predict` accepts a batch of texts and returns one 0/1 flag per text, so each
# split is scored in a single call instead of one model invocation per row.
# The result is assigned positionally, one flag per review.
train["has_swearing"] = predict(train.user_review.tolist()).astype("int64")
test["has_swearing"] = predict(test.user_review.tolist()).astype("int64")

In [8]:
# Count the reviews flagged as profane in each split.
profane_train = int((train.has_swearing == 1).sum())
profane_test = int((test.has_swearing == 1).sum())
print(f"There are {profane_train} from train reviews that contain profanity")
print(f"There are {profane_test} from test reviews that contain profanity")

There are 445 from train reviews that contain profanity
There are 184 from test reviews that contain profanity


In [9]:
# Peek at a few profane reviews; the fixed random_state makes the displayed
# sample identical on every run.
train[train.has_swearing == 1].sample(5, random_state=42)

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview,language,has_swearing
14291,20235,Infestation: The New Z,2017.0,Admin has no responsibility with their suck se...,0,Fredaikis AB,Fredaikis AB,"['Zombies', 'Free to Play', 'Survival', 'Multi...",Infestation: The New Z is a FREE TO PLAY multi...,en,1
1947,2411,Dota 2,2014.0,"This one is an awesome game! Beautiful, balanc...",1,Valve,Valve,"['Free to Play', 'MOBA', 'Strategy', 'Multipla...","The most-played game on Steam.Every day, milli...",en,1
6300,8675,Heroes & Generals,2017.0,It's got 2 main ingredients that can wreck any...,0,RETO MOTO,RETO MOTO,"['Free to Play', 'World War II', 'Multiplayer'...","Heroes & Generals is a full on, all-out WAR ex...",en,1
6710,9085,Heroes & Generals,2014.0,"Early Access Reviewkill someone : ""NOOOB""get k...",1,RETO MOTO,RETO MOTO,"['Free to Play', 'World War II', 'Multiplayer'...","Heroes & Generals is a full on, all-out WAR ex...",en,1
17092,25061,Cuisine Royale,2018.0,Early Access ReviewIt has a pop up game launch...,0,Darkflow Software,Gaijin Distribution KFT,"['Early Access', 'Free to Play', 'Battle Royal...",Cuisine Royale is an all-kitchen-warfare Battl...,en,1


### Sentiment Analysis

This feature could also be considered as the target value (user_suggestion).

We'll make use of a Flair model that predicts if a review is positive (1) or negative (0). 

We can use this Flair model as a baseline to compare against the actual labels.

In [10]:
# Pre-trained Flair English sentiment classifier, loaded once and shared by
# every call to sentiment_prediction.
sia = TextClassifier.load('en-sentiment')


def sentiment_prediction(x):
    """Classify a review with the Flair sentiment model.

    Parameters
    ----------
    x : str
        Review text.

    Returns
    -------
    int
        1 when the model's first label is POSITIVE, otherwise 0. This covers
        NEGATIVE, any other label, and the case where no label was attached.
    """
    sentence = Sentence(x)
    sia.predict(sentence)

    labels = sentence.labels
    if not labels:
        # No prediction attached (presumably empty / token-less text);
        # indexing labels[0] would raise IndexError, so fall back to the same
        # value the non-positive branches return.
        return 0

    # Compare the label value itself rather than searching the label's string
    # form, which may contain more than the class name.
    return 1 if labels[0].value == "POSITIVE" else 0
    
# Attach the model's 0/1 sentiment prediction to every review in both splits.
for frame in (train, test):
    frame["sentiment"] = frame["user_review"].apply(sentiment_prediction)

2022-03-07 21:50:28,333 loading file C:\Users\Raluca\.flair\models\sentiment-en-mix-distillbert_4.pt


In [43]:
# Renumber the rows 0..n-1 after the language filter. reset_index() keeps the
# previous index as a column ('index' on the first run of this cell).
train, test = train.reset_index(), test.reset_index()

### One-Hot Encoding of the `tags` column

In [45]:
# Each `tags` cell holds the string form of a Python list; parse it back into a list.
train_tag_lists = [ast.literal_eval(raw_tags) for raw_tags in train.tags]

# Distinct tags in order of first appearance -> one indicator column per tag.
_, train_tag_names = pd.factorize(np.concatenate(train_tag_lists))

# One row per review with 1 where the review carries the tag. Reusing
# train.index guarantees the concat below lines rows up with `train` even if
# the frame does not have a fresh 0..n-1 index.
train_tag_indicators = pd.DataFrame(
    [np.isin(train_tag_names, tags) for tags in train_tag_lists],
    columns=train_tag_names,
    index=train.index,
).astype(int)

dataframe_train = (
    pd.concat([train, train_tag_indicators], axis = 1)
    .drop(columns=['tags', 'language', 'year', 'title', 'developer', 'publisher', 'overview'])
    # 'index' only exists when the frame went through reset_index() without
    # drop=True, so its absence must not abort the cell.
    .drop(columns=['index'], errors='ignore')
)
dataframe_train.head(3)

Unnamed: 0,level_0,review_id,user_review,user_suggestion,has_swearing,sentiment,clean_review,Horror,Free to Play,Cute,...,City Builder,Resource Management,Gun Customization,Education,Puzzle,America,Masterpiece,Family Friendly,Mod,Classic
0,0,1,I'm scared and hearing creepy voices. So I'll...,1,0,0,scared hearing creepy voice pause moment write...,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,2,"Best game, more better than Sam Pepper's YouTu...",1,0,1,best game better sam pepper tube account need ...,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,3,"A littly iffy on the controls, but once you kn...",1,0,1,littly iffy control know play easy master made...,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Each `tags` cell holds the string form of a Python list; parse it back into a list.
test_tag_lists = [ast.literal_eval(raw_tags) for raw_tags in test.tags]

# Distinct tags in order of first appearance -> one indicator column per tag.
# NOTE(review): the tag vocabulary is built from the test split alone, so the
# indicator columns (and their order) differ from the ones created for train;
# align the two column sets before feeding both frames to the same model.
_, test_tag_names = pd.factorize(np.concatenate(test_tag_lists))

# One row per review with 1 where the review carries the tag. Reusing
# test.index guarantees the concat below lines rows up with `test` even if the
# frame does not have a fresh 0..n-1 index.
test_tag_indicators = pd.DataFrame(
    [np.isin(test_tag_names, tags) for tags in test_tag_lists],
    columns=test_tag_names,
    index=test.index,
).astype(int)

dataframe_test = (
    pd.concat([test, test_tag_indicators], axis = 1)
    .drop(columns=['tags', 'title', 'language', 'year', 'developer', 'publisher', 'overview'])
    # 'index' only exists when the frame went through reset_index() without
    # drop=True, so its absence must not abort the cell.
    .drop(columns=['index'], errors='ignore')
)
dataframe_test.head(3)

Unnamed: 0,level_0,review_id,user_review,has_swearing,sentiment,clean_review,FPS,Multiplayer,Shooter,Action,...,Superhero,Comic Book,Magic,Board Game,Third-Person Shooter,Battle Royale,Zombies,Violent,Walking Simulator,Gore
0,0,1603,"Nice graphics, new maps, weapons and models. B...",0,0,nice graphic new map weapon model developer li...,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1604,I would not recommend getting into this at its...,0,0,would recommend getting current state csgo hit...,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,1605,Edit 11/12/18I have tried playing CS:GO recent...,0,0,edit tried playing c go recently dramatically ...,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing the data

We'll create a new column, *clean_review*, in which we store the cleaned data.

The preprocessing steps are:
- removing Early Access Review string from some of the users' reviews
- handling camelcase: "heRocks" -> "he Rocks"
- removing extra whitespaces
- removing weird symbols
- lowercasing
- tokenization
- stopwords removal
- removal of words shorter than 2 characters
- lemmatization

In [47]:
def remove_EAR(X):
    """Return ``X`` with every "Early Access Review" marker removed."""
    return X.replace("Early Access Review", "")

def handle_camelcase(X):
    """Insert a space at camelCase word boundaries, e.g. "heRocks" -> "he Rocks".

    A boundary is either a lowercase letter directly followed by an uppercase
    one, or the point inside an uppercase run where the last capital starts a
    capitalised word ("HTMLParser" -> "HTML Parser").

    The boundaries are zero-width, so the rest of the text is left untouched.
    In particular, text containing line breaks is kept in full; a
    ``.+?``-based split cannot cross a newline and silently drops every line
    but the last.
    """
    return re.sub(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', X)

def handling_whitespaces(X):
    """Collapse every run of whitespace in ``X`` to one space and trim both ends."""
    words = X.split()
    # Joining the whitespace-split words leaves no leading/trailing blanks.
    return " ".join(words)

# Characters that remove_waste_symbols replaces with a space: Braille and
# block/box glyphs, combining marks, CJK / full-width symbols (presumably from
# ASCII-art style reviews) and ordinary punctuation.
# NOTE(review): the set also contains the digit "7" and the apostrophe —
# confirm that stripping them is intended.
waste_symbols = "人̳⣟⣦̪⠓▒͎¸⠟⣅>⡾ ⠻⣀⣛„ͭ⣮⡻⠦⡀͐‘̨⣆̤⣿<／丶⣞͇⣵͞⠹ͩ⢒̯⢸⣤̗̫ͯ͆̔͠⠛⢻⠏-́☐̺͛̋⠸⣥⠄̷＼͟·⌒͗⠁́｀⢹\\⢄͈̌ͨ⢤彡~¯/⠶⠲ˆ⡥̮̻͔☉⣻̣ゝ⡞̧͙̿̒̊̑ノ⠭ͤ_⠐⣇҉̚–⡄´̓█▄☑⣧̴͖̍｜⣷̭͘͝｡⠴̜̄ʖ¨̵̏͢⢂͋;͒:⢉つ̾＿̈⣴⣌ͫ⢛⡹⣈へ⢯,̅⣭̩̬̕⡈ム͡⣼ͦ)̛͜ヽ̝̥⣠⢟̶⠤̡͉⠘̹̈́⡴̠⢀）⠇⣾͊⢰̞ͮ̇`⠑⡿\u3000⠃⣸⠾͍̆ͅ￣⢚̓⠂⡵─⢬ー⠿(⠆⠉̦*͕ﾉ⣹⡟⣬⠙▓⡐7͏̟̲⢿⢦（̰♥̸̢⣙͓̂▀くﾌ⠀.⠰⡒°̖̎､⣒⣰̼⢅⣁⠒͑⢾⡂͌̀ͧ…̃▐ﾚ、丿⢌|̱⢴⡠⣩▌⣉͚ͪ'⢆⢠⡇⡛⣏⡶⣜⣄⡸⠈̘ͣ⣽̉̽̐ͥ⡏ͬ⣗⣶░⠋⠔̙͂^"

def remove_waste_symbols(X):
    """Replace every character listed in ``waste_symbols`` with a single space.

    Parameters
    ----------
    X : str
        Text to clean.

    Returns
    -------
    str
        ``X`` with each listed character turned into a space; all other
        characters are unchanged. Whitespace is not collapsed here.
    """
    # One translation pass over the text instead of one full str.replace scan
    # per symbol. Every key is a single character mapped to a space, so the
    # result is the same as the sequential replaces.
    to_space = str.maketrans(dict.fromkeys(waste_symbols, " "))
    return X.translate(to_space)

def eliminate_chars(X):
    """Run the character-level clean-up steps on ``X`` and return the result.

    Order: drop the "Early Access Review" marker, blank out waste symbols,
    split camelCase words, then normalise whitespace.
    """
    cleanup_steps = (
        remove_EAR,
        remove_waste_symbols,
        handle_camelcase,
        handling_whitespaces,
    )
    for step in cleanup_steps:
        X = step(X)
    return X

def text_preprocessing(text, lemmatize, min_token_len=2):
    """Normalise a review into a space-separated string of cleaned tokens.

    Steps: keep ASCII letters only, lowercase, collapse whitespace, tokenize
    with NLTK, drop English stopwords and tokens shorter than
    ``min_token_len``, then lemmatize or stem each remaining token.

    Parameters
    ----------
    text : str or object with ``decode``
        Raw text. Anything that is not a ``str`` is decoded as ISO-8859-1.
    lemmatize : bool
        True -> WordNet lemmatization; False -> Porter stemming.
    min_token_len : int, optional
        Minimum number of characters a token needs to be kept. The default of
        2 removes only single-character tokens. The length check runs before
        lemmatization/stemming, so a resulting lemma or stem can still be
        shorter than this.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = text.decode('ISO-8859-1')

    # Everything that is not an ASCII letter becomes a space; then lowercase
    # and collapse the resulting whitespace runs.
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = ' '.join(text.lower().split())

    # Tokenize
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # Build the stopword set once per call rather than once per token, then
    # drop stopwords and tokens shorter than min_token_len in a single pass.
    stop_words = set(stopwords.words('english'))
    tokens = [
        token for token in tokens
        if token not in stop_words and len(token) >= min_token_len
    ]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    else:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    # Preprocessed text
    return ' '.join(tokens)

# Pass 1: character-level clean-up of the raw review text for both splits.
for frame in (dataframe_train, dataframe_test):
    frame['clean_review'] = frame['user_review'].apply(eliminate_chars)

# Pass 2: token-level normalisation (lemmatizing variant) of the cleaned text.
for frame in (dataframe_train, dataframe_test):
    frame['clean_review'] = [text_preprocessing(doc, True) for doc in frame['clean_review']]

### Conclusion

We'll save the datasets created in order to continue with the Machine Learning approaches.

In [54]:
# 'level_0' only shows up when reset_index() has been run on a frame that
# already had an 'index' column (e.g. the reset cell was executed twice).
# Drop it when present instead of failing with KeyError on a clean run.
dataframe_train = dataframe_train.drop(columns=['level_0'], errors='ignore')
dataframe_test = dataframe_test.drop(columns=['level_0'], errors='ignore')

# Persist the engineered datasets for the modelling notebooks.
dataframe_train.to_csv("final_train.csv", index = False)
dataframe_test.to_csv("final_test.csv", index = False)