In [1]:
#pip install scikit-learn
#pip install pandas

# Main imports pandas and sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# We need to do some web scraping to get the review text
import urllib.request
from bs4 import BeautifulSoup

#Other imports
import time
import sys
from os import system
from IPython.display import clear_output

# Load in data
### There are a few steps to getting clean data
- Read data from kaggle into dataframes
- Get rid of unnecessary features of the data
- (For general data) get the review text from the web
- Clean data using dropna

In [2]:
# File locations used throughout the notebook.
# Raw GameSpot reviews, download from:
#   https://www.kaggle.com/datasets/joyshil0599/multi-decade-video-game-review-dataset
GENERAL_DATASET_PATH = 'data/general_reviews.csv'
# Raw Elden Ring Steam reviews, download from:
#   https://www.kaggle.com/datasets/noahx1/elden-ring-steam-reviews
ELDEN_RING_DATASET_PATH = 'data/elden_ring_steam_reviews.csv'
# Destination for the cleaned general data file
GENERAL_CLEAN_DATA_PATH = 'data/general_data_clean.csv'

# Read the two raw datasets into dataframes:
#   general_data    -> GameSpot reviews
#   elden_ring_data -> Elden Ring Steam reviews
general_data = pd.read_csv(GENERAL_DATASET_PATH)
elden_ring_data = pd.read_csv(ELDEN_RING_DATASET_PATH)


In [3]:
# Clean and format the general data:
# keep only the review URL and the score, everything else is unused
general_columns = ['Review link', 'Rating/10']
general_data = general_data[general_columns]
general_data.head()

Unnamed: 0,Review link,Rating/10
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0
1,https://www.gamespot.com/reviews/kirbys-return...,7.0
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0


In [4]:
# We define 3 sentiments to classify
GOOD = 'Good'
NUETRAL = 'Nuetral'  # kept for completeness; GetSentiment below only assigns GOOD or BAD
BAD = 'Bad'


# General data
# I tried to choose values for rating that made sense and split the data kind of evenly
def GetSentiment(rating):
    """Map a numeric review score to a sentiment label.

    Parameters
    ----------
    rating : float or None
        Score taken from the 'Rating/10' column.

    Returns
    -------
    str or None
        GOOD when ``rating > 5``, BAD when ``0 < rating <= 5``, and None when
        the rating is missing (None, NaN, pd.NA) or not positive. Rows that
        receive None are removed by the dropna call that follows the labelling.
    """
    # Missing values read from a CSV arrive as NaN rather than None, so an
    # identity check against None is not enough; pd.isna covers None, NaN
    # and pd.NA in one test.
    if pd.isna(rating):
        return None
    if rating > 5:
        return GOOD
    if rating > 0:
        return BAD
    # Zero or negative scores are not valid ratings: leave them unlabelled.
    return None
# Label every review, then discard rows that are missing a value in any column
general_data['Sentiment'] = [
    GetSentiment(score) for score in general_data['Rating/10'].to_list()
]
general_data = general_data.dropna()

# Show how many reviews ended up in each class
print("DATA DISTRIBUTION:")
SENTIMENTS = general_data['Sentiment'].unique()
for label in SENTIMENTS:
    label_count = (general_data['Sentiment'] == label).sum()
    print(f"{label}: {label_count}")
general_data.head()

DATA DISTRIBUTION:
Bad: 1984
Good: 10608


Unnamed: 0,Review link,Rating/10,Sentiment
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,Bad
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,Good
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,Good
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,Good
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,Good


In [5]:
# Now lets define how to get the text of the reviews from the web
def _FetchReviewText(link, timeout):
    """Download one review page and return its text.

    The text of every <p> element inside the element with class
    "article-body typography-format" is concatenated, each paragraph
    followed by a single space.

    Raises
    ------
    ValueError
        If the page has no such container, or the container has no paragraphs.
    Exception
        Whatever urllib raises while downloading is propagated unchanged.
    """
    # The context manager closes the HTTP response even if read() fails
    with urllib.request.urlopen(link, timeout=timeout) as response:
        html = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    page = BeautifulSoup(html, "html.parser")
    body = page.find(class_="article-body typography-format")
    paragraphs = body.find_all("p") if body is not None else []
    if not paragraphs:
        raise ValueError('NO REVIEW TEXT FOUND')
    return "".join(p.text + " " for p in paragraphs)


def GetReviewsFromWeb(timeout=30):
    """Scrape the review text for every link in general_data and cache the result.

    Side effects on the module-level ``general_data`` frame:
    a 'Review' column is added, rows with any missing value (including
    reviews that could not be downloaded) are dropped in place, and the
    frame is written to GENERAL_CLEAN_DATA_PATH.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before counting the link as
        failed. Without a timeout a single stalled connection would block
        the whole run.

    Returns
    -------
    None
    """
    check_interval = 5  # refresh the progress display every N links
    num_links = len(general_data.index)
    reviews = []
    failed_links = []
    prev_time = time.time()
    for link in general_data['Review link']:
        # Try to get the review text
        try:
            reviews.append(_FetchReviewText(link, timeout))
        # If we could not get the review text.
        # `except Exception` (not a bare except) so that Ctrl-C / kernel
        # interrupt still stops the scrape.
        except Exception:
            reviews.append(None)
            failed_links.append(link)
        # Display progress. The count is derived from the results list so it
        # always equals the number of links actually handled.
        num_processed = len(reviews)
        if num_processed % check_interval == 0:
            now = time.time()
            prediction = (now - prev_time) / check_interval * (num_links - num_processed)
            prev_time = now
            # clear_output is enough inside a notebook; no shell 'cls' call,
            # which only exists on Windows
            clear_output(wait=True)
            print(f"Reviews lost: {len(failed_links)}")
            print(f"processed: {num_processed} / {num_links}")
            print(f"Time left: {int(prediction)} s")
    #Update dataframe and write to file
    general_data['Review'] = reviews
    # Now lets finish cleaning the data by dropping any rows where we do not have a review
    old_size = general_data.shape
    # In place on purpose: the caller keeps using the module-level frame
    general_data.dropna(inplace=True)
    general_data.to_csv(GENERAL_CLEAN_DATA_PATH)
    new_size = general_data.shape
    print("Dropping invalid reviews.")
    print(f"Old count: {old_size[0]}")
    print(f"New count: {new_size[0]}")
    print(f"Removed {old_size[0]-new_size[0]} reviews.")

In [6]:
# If we don't already have the clean data we need to get it from the web
try:
    general_data = pd.read_csv(GENERAL_CLEAN_DATA_PATH)
    file_found = True
except FileNotFoundError:
    # Only a missing cache file should trigger the (slow) scrape; any other
    # read error, e.g. a corrupt CSV, is left to surface instead of being
    # silently turned into a full re-download.
    file_found = False
if not file_found:
    GetReviewsFromWeb()


In [7]:
# Hold out a fixed share of the general reviews for testing
TEST_SIZE = 0.3
general_X = general_data['Review']
general_y = general_data['Sentiment']
general_split = train_test_split(
    general_X,
    general_y,
    test_size=TEST_SIZE,
    random_state=1,
)
# Convert each of the four splits to a plain Python list
general_X_train, general_X_test, general_y_train, general_y_test = [
    part.to_list() for part in general_split
]

# Creating svm
#### Helpful link to understand how this works:
- https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1



## Vectorize the data
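We will be using a method called **Term Frequency–Inverse Document Frequency (TF-IDF)**, which weights each word by how often it appears in a review and down-weights words that appear in most reviews.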
Here is how it works: https://medium.com/@vasista/preparing-the-text-data-with-scikit-learn-b31a3df567e

#### sklearn supplies an easy way to implement this

In [8]:
# Fit the TF-IDF vocabulary on the training reviews only,
# then encode the test reviews with that same vocabulary
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)
train_vectors = vectorizer.fit_transform(general_X_train)
test_vectors = vectorizer.transform(general_X_test)

## Create a Linear SVM model

In [9]:
# Fit a linear-kernel SVM on the TF-IDF training vectors and time the fit
svm_linear = svm.SVC(kernel='linear')
print('training SVM...')
start_time = time.time()
svm_linear.fit(train_vectors, general_y_train)
end_time = time.time()
elapsed = end_time - start_time
print(f"Trained SVM in {elapsed} s")

training SVM...
Trained SVM in 9.3456289768219 s


In [10]:
# Predict labels for the held-out general reviews and time the prediction
print('Testing SVM ...')
start_time = time.time()
prediction = svm_linear.predict(test_vectors)
end_time = time.time()
elapsed = end_time - start_time
print(f"Tested SVM in {elapsed} s")

Testing SVM ...
Tested SVM in 3.81997013092041 s


## Linear SVM general data results

In [11]:
# Per-class and aggregate metrics for the linear SVM on the held-out general reviews
report_linear = classification_report(
    general_y_test,
    prediction,
    output_dict=True,
    zero_division=0,
)
for key, scores in report_linear.items():
    print(f'{key}: {scores}\n')


Bad: {'precision': 0.729903536977492, 'recall': 0.822463768115942, 'f1-score': 0.7734241908006815, 'support': 276}

Good: {'precision': 0.7336956521739131, 'recall': 0.6164383561643836, 'f1-score': 0.6699751861042184, 'support': 219}

accuracy: 0.7313131313131314

macro avg: {'precision': 0.7317995945757025, 'recall': 0.7194510621401629, 'f1-score': 0.7216996884524499, 'support': 495}

weighted avg: {'precision': 0.731581260670454, 'recall': 0.7313131313131314, 'f1-score': 0.7276558432683069, 'support': 495}



 # Create a Decision tree model

In [12]:
# Let's make a decision tree
TREE_DEPTH = 2
# Seed the tree so that repeated runs give the same model; the value matches
# the random_state already used for train_test_split.
dtc = DecisionTreeClassifier(max_depth=TREE_DEPTH, random_state=1)

print('training Decision Tree...')
start_time = time.time()
dtc.fit(train_vectors, general_y_train)
end_time = time.time()
print(f"Trained Decision Tree in {end_time-start_time} s")

training Decision Tree...
Trained Decision Tree in 0.24613380432128906 s


In [13]:
# Predict labels for the held-out general reviews with the tree and time the prediction
print('Testing Decision Tree ...')
start_time = time.time()
prediction = dtc.predict(test_vectors)
end_time = time.time()
elapsed = end_time - start_time
print(f"Tested Decision Tree in {elapsed} s")

Testing Decision Tree ...
Tested Decision Tree in 0.00400090217590332 s


## Decision Tree general data results

In [14]:
# Per-class and aggregate metrics for the decision tree on the held-out general reviews
report_tree = classification_report(
    general_y_test,
    prediction,
    output_dict=True,
    zero_division=0,
)
for key, scores in report_tree.items():
    print(f'{key}: {scores}\n')


Bad: {'precision': 0.7232704402515723, 'recall': 0.4166666666666667, 'f1-score': 0.5287356321839081, 'support': 276}

Good: {'precision': 0.5208333333333334, 'recall': 0.7990867579908676, 'f1-score': 0.6306306306306307, 'support': 219}

accuracy: 0.5858585858585859

macro avg: {'precision': 0.6220518867924528, 'recall': 0.6078767123287672, 'f1-score': 0.5796831314072695, 'support': 495}

weighted avg: {'precision': 0.633707356584715, 'recall': 0.5858585858585859, 'f1-score': 0.5738164496785186, 'support': 495}



# Test SVM on Elden Ring Reviews

In [15]:
# For elden ring data
# Keep only the review text and the up-vote flag
elden_ring_data = elden_ring_data[['review','voted_up']]


# Deliberately NOT named GetSentiment: that name already belongs to the
# rating-based labeller defined for the general data, and redefining it here
# would silently change what it does for the rest of the session.
def GetVoteSentiment(up_vote):
    """Return GOOD when the up-vote flag is truthy, BAD otherwise."""
    return GOOD if up_vote else BAD


# Label every review, then discard rows that are missing a value in any column
elden_ring_data['Sentiment'] = [
    GetVoteSentiment(vote) for vote in elden_ring_data['voted_up'].to_list()
]
elden_ring_data = elden_ring_data.dropna()

# Show how many reviews ended up in each class
print("DATA DISTRIBUTION:")
SENTIMENTS = elden_ring_data['Sentiment'].unique()
for label in SENTIMENTS:
    label_count = (elden_ring_data['Sentiment'] == label).sum()
    print(f"{label}: {label_count}")
elden_ring_data.head()

DATA DISTRIBUTION:
Good: 9174
Bad: 591


Unnamed: 0,review,voted_up,Sentiment
0,being killed over and over again is fun,True,Good
1,I write this review as I have 100% completed E...,True,Good
2,Fun,True,Good
3,pretty cool.,True,Good
4,AMAZINGGGGGGGGGGGGG,True,Good


In [16]:
# Encode the Elden Ring reviews with the vectorizer that was fitted on the
# general training reviews (transform only, no refit)
elden_ring_test_X, elden_ring_test_y = (
    elden_ring_data[column].to_list() for column in ('review', 'Sentiment')
)
elden_ring_vectors = vectorizer.transform(elden_ring_test_X)

In [17]:
# Run the SVM trained on the general reviews against the Elden Ring reviews
# and time the prediction
print('Testing SVM on Elden Ring Reviews...')
start_time = time.time()
prediction = svm_linear.predict(elden_ring_vectors)
end_time = time.time()
elapsed = end_time - start_time
print(f"Tested SVM in {elapsed} s")

Testing SVM on Elden Ring Reviews...
Tested SVM in 10.122633934020996 s


## Elden ring SVM results

In [18]:
# Per-class and aggregate metrics for the SVM on the Elden Ring reviews
report_elden_ring = classification_report(
    elden_ring_test_y,
    prediction,
    output_dict=True,
    zero_division=0,
)
# Iterate over this report's own entries. Borrowing the key list from the
# decision-tree report would make this cell depend on that cell having run
# and would fail if the two reports ever had different labels.
for key, scores in report_elden_ring.items():
    print(f'{key}: {scores}\n')


Bad: {'precision': 0.06591380502419912, 'recall': 0.9678510998307953, 'f1-score': 0.12342215988779803, 'support': 591}

Good: {'precision': 0.9825206991720331, 'recall': 0.11641595814257685, 'f1-score': 0.20816684533671181, 'support': 9174}

accuracy: 0.1679467485919099

macro avg: {'precision': 0.5242172520981161, 'recall': 0.5421335289866861, 'f1-score': 0.16579450261225492, 'support': 9765}

weighted avg: {'precision': 0.9270455661007202, 'recall': 0.1679467485919099, 'f1-score': 0.20303790431261473, 'support': 9765}

