In [1]:
#imports
#pip install scikit-learn
#pip install pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
# We need to do some web scraping to get the review text
import urllib.request
from bs4 import BeautifulSoup
#Other imports
import time
import sys
from os import system
from IPython.display import clear_output

In [2]:
# Lets define our file paths for reading in data
# download dataset at: https://www.kaggle.com/datasets/joyshil0599/multi-decade-video-game-review-dataset
GENERAL_DATASET_PATH = "data/general_reviews.csv"
# download dataset at: https://www.kaggle.com/datasets/noahx1/elden-ring-steam-reviews
ELDEN_RING_DATASET_PATH = "data/elden_ring_steam_reviews.csv"

# Other paths to save cleaned data files
GENERAL_CLEAN_DATA_PATH = '/data/general_data_clean.csv'

#load in datasets
# We have two datasets:
# one using gamespot reviews (general_data)
# the other Elden Ring Steam reviews (elden_ring_data)
general_data = pd.read_csv(GENERAL_DATASET_PATH)
elden_ring_data = pd.read_csv(ELDEN_RING_DATASET_PATH)


In [3]:
# Clean and format General Data
# Lets only keep the columns we want
general_data = general_data[['Review link','Rating/10','Rating Comments']]
general_data.head()

Unnamed: 0,Review link,Rating/10,Rating Comments
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,Mediocre
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,Good
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,Fair
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,Fair
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,Great


In [4]:
# We can get all the comment types with:
# print(general_data['Rating Comments'].unique())
# This gives the following list:
# ['Mediocre' 'Good' 'Fair' 'Great' 'Superb' 'Poor' 'Essential' 'Abysmal'
#  'Early Access' 'Terrible' 'Bad' 'Good\x1a' nan 'Essential"width-100 src']
# Lets convert these into just good and bad. We will make a dictionary showing what to convert to in each case
GOOD = 0
NUETRAL = 1
BAD = 2
#In case we want to use rating to indicate sentiment
# def GetSentiment(rating):
#     if(rating < 5):
#         return BAD
#     else:
#         return GOOD
comment_dict = {'Mediocre':NUETRAL,'Good':GOOD,'Fair':NUETRAL,'Great':GOOD,'Superb':GOOD,
                'Poor':BAD,'Essential':GOOD,'Abysmal':BAD,'Early Access':NUETRAL,
                'Terrible':BAD,'Bad':BAD,'Good\x1a':GOOD,'Essential"width-100 src':None}
#Convert rating comments using dict
general_data.replace({'Rating Comments':comment_dict}, inplace=True)
general_data.dropna(inplace=True)
print(general_data['Rating Comments'].unique())
print(f"shape={general_data.shape}")
#Lets rename Rating Comments to Sentiment
general_data.rename(columns={'Rating Comments':'Sentiment'},inplace=True)
general_data.head()

[1. 0. 2.]
shape=(12591, 3)


Unnamed: 0,Review link,Rating/10,Sentiment
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,1.0
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,0.0
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,1.0
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,1.0
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,0.0


In [5]:
# Now lets define how to get the text of the reviews from the web
def GetReviewsFromWeb():
    num_processed = 1
    prev_time = time.time()
    num_links = len(general_data.index)
    check_interval = 5
    reviews = []
    failed_links = []
    for link in general_data['Review link']:
        # Try to get the review text
        try:
            page = urllib.request.urlopen(link).read()
            page = BeautifulSoup(page)
            review = ""
            body = page.find(class_="article-body typography-format")
            paragraphs = body.find_all("p")
            if(len(paragraphs)==0):
                raise Exception('NO REVIEW TEXT FOUND')
            for p in paragraphs:
                review+=p.text+" "
            reviews.append(review)
        # If we could not get the review text
        except:
            reviews.append(None)
            failed_links.append(link)
        # Display progress
        num_processed = num_processed + 1
        if(num_processed % check_interval == 0):
            prediction = (time.time() - prev_time)/(check_interval) * (num_links-len(reviews))
            prev_time = time.time()
            system('cls')
            clear_output(wait=True)
            print(f"Reviews lost: {len(failed_links)}")
            print(f"processed: {num_processed} / {num_links}")
            print(f"Time left: {int(prediction)} s")  
    #Update dataframe and write to file
    general_data['Review'] = reviews
    # Now lets finish cleaning the data by dropping any rows where we do not have a review
    old_size = general_data.shape
    general_data.dropna(inplace=True)
    new_size = general_data.shape
    print("Dropping invalid reviews.")
    print(f"Old count: {old_size[0]}")
    print(f"New count: {new_size[0]}")
    print(f"Removed {old_size[0]-new_size[0]} reviews.")

In [6]:
# If we don't already have the clean data we need to get it from the web
file_found = True
try:
    general_data = pd.read_csv(GENERAL_CLEAN_DATA_PATH)
except:
    file_found = False
if(not file_found):
    GetReviewsFromWeb()


Reviews lost: 36
processed: 12590 / 12591
Time left: 1 s
Dropping invalid reviews.
Old count: 12591
New count: 12555
Removed 36 reviews.


In [7]:
general_data.to_csv(GENERAL_CLEAN_DATA_PATH)

https://www.gamespot.com/reviews/hogwarts-legacy-review-sleight-of-hand/1900-6418032/


In [8]:
#split datasets
TEST_SIZE = 0.3
general_X = 
general_Y = 
general_X_train, general_X_test, general_y_train, general_y_test = train_test_split(general_X, general_y, test_size = TEST_SIZE, random_state=1)

SyntaxError: invalid syntax (3864098717.py, line 3)

In [None]:
#create svm
print('h')

In [None]:
#train svm with general reviews data set


In [None]:
#Test accuracy of svm
