In [14]:
import pandas as pd
import numpy as np
import re

# Merge the review data

In [15]:
# Import the dataset
pos = pd.read_csv("er-pos.csv")
neg = pd.read_csv("er-neg.csv")

In [4]:
# Drop NA reviews
pos.dropna(inplace=True)
neg.dropna(inplace=True)

In [5]:
# Reshuffle rows
pos = pos.sample(frac=1, random_state=19991222)
neg = neg.sample(frac=1, random_state=19991222)

In [6]:
# Concatenate
df = pd.concat([pos, neg])

In [7]:
# Change from boolean to int
df["voted_up"] = [1 if boolean else 0 for boolean in df.voted_up.tolist()]
df["received_for_free"] = [1 if boolean else 0 for boolean in df.received_for_free.tolist()]

# Text preprocessing

In [8]:
# Drops anything that is not alphanumeric or an ASCII character.
def drop_strange_symbols(string):
    return ''.join([i for i in string if i.isalnum() or ord(i)<128])

# Remove strange characters, links, formatting tags and line breaks. 
def clean_reviews(text):
    text = drop_strange_symbols(text)
    text = re.sub(r'\n', ' ', text) # remove line breaks
    text = re.sub(r'\[[^\[\]]*\]', '', text) # remove [b]format tags[\b]
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) # remove links
    text = re.sub(r' +', ' ', text) # remove extra spaces
    return text

df["clean"] = df["review"].apply(lambda x : clean_reviews(x))

# Remove reviews that don't contain any letters/numbers at all.
def only_symbols(string):
    return all([not i.isalnum() for i in string])

df = df[[not only_symbols(i) for i in df.review]]

# Drop reviews with fewer than 3 characters
df = df[[len(text) >= 3 for text in df.clean]]

# Reset index
df.reset_index(inplace=True, drop=True)

In [9]:
df.sample(10)

Unnamed: 0,review,voted_up,received_for_free,clean
24081,fantastic game \n10/10 story and It's difficult,1,0,fantastic game 10/10 story and It's difficult
43132,It has been a whole week and the game still wo...,0,0,It has been a whole week and the game still wo...
11528,Game is extremely engaging. Love to struggle t...,1,0,Game is extremely engaging. Love to struggle t...
44220,Game runs poorly. I have a good PC. Even if I ...,0,0,Game runs poorly. I have a good PC. Even if I ...
30255,Non existent story.,0,0,Non existent story.
14752,"The PvP scene is the most respectful, respecta...",1,0,"The PvP scene is the most respectful, respecta..."
7017,L + You're Tarnished,1,0,L + You're Tarnished
12049,"Highly addicting and frustrating, at least unt...",1,0,"Highly addicting and frustrating, at least unt..."
7326,delicious like moms home cooked spaghetti,1,0,delicious like moms home cooked spaghetti
11348,Best single player game of all time no argument,1,0,Best single player game of all time no argument


In [10]:
df.shape

(49374, 4)

In [11]:
df.voted_up.value_counts()

0    24725
1    24649
Name: voted_up, dtype: int64

In [12]:
df.received_for_free.value_counts()

0    48614
1      760
Name: received_for_free, dtype: int64

In [13]:
df.review.apply(lambda x: len(x)).describe()

count    49374.000000
mean       407.818265
std        891.780362
min          3.000000
25%         29.000000
50%        104.000000
75%        383.000000
max       8000.000000
Name: review, dtype: float64

# Saving changes

In [12]:
final = df[["clean", "voted_up"]]
final.rename(columns={"clean": "review"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.rename(columns={"clean": "review"}, inplace=True)


In [13]:
final.sample(10)

Unnamed: 0,review,voted_up
42247,You would think that with the recommended hard...,0
36623,squid games,0
574,"Turns out fromsoft make good games, who knew?",1
16945,Amazing. Worth every penny.,1
27265,THE GAME SUCKS. pC GAME WITH pS CONTROLS. tHE ...,0
14179,Secret dung eater sex scene makes it worth the...,1
23649,A true Masterpiece. The best game From Softwar...,1
37209,This is a masterpiece and will be Game of the ...,0
8054,"very engaging, difficult but fun",1
16359,Elden Ring has saved gaming.,1


In [14]:
final.to_csv("er-reviews.csv", index=False)