In [2]:
#imports
#pip install scikit-learn
#pip install pandas
#pip install beautifulsoup4

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
# We need to do some web scraping to get the review text
import urllib.request
from bs4 import BeautifulSoup
#Other imports
import time
from os import system
from IPython.display import clear_output

In [3]:
# Load the raw datasets.
# Paths are relative to the notebook's working directory.
GENERAL_DATASET_PATH = "data/general_reviews.csv"
ELDEN_RING_DATASET_PATH = "data/elden_ring_steam_reviews.csv"

# Read both CSV files (general reviews first, then the Elden Ring Steam reviews)
# and preview the general reviews frame as the cell's output.
general_data, elden_ring_data = map(
    pd.read_csv, (GENERAL_DATASET_PATH, ELDEN_RING_DATASET_PATH)
)
general_data.head()

Unnamed: 0,Gaming Console name,Review link,Review Title,Time Since Review,Comment Count,Likes Received,Rating/10,Rating Comments
0,NS,https://www.gamespot.com/reviews/tales-of-symp...,Tales Of Symphonia Remastered Review - A Class...,20 days ago,23,6,5.0,Mediocre
1,NS,https://www.gamespot.com/reviews/kirbys-return...,Kirby's Return To Dream Land Deluxe Review - K...,22 days ago,2,2,7.0,Good
2,PS5,https://www.gamespot.com/reviews/hogwarts-lega...,Hogwarts Legacy Review - Sleight Of Hand,23 days ago,0,593,6.0,Fair
3,XBSX,https://www.gamespot.com/reviews/atomic-heart-...,Atomic Heart Review - Crispy Critters,23 days ago,226,23,6.0,Fair
4,PC,https://www.gamespot.com/reviews/like-a-dragon...,Like A Dragon: Ishin Review - Rewriting History,27 days ago,11,4,8.0,Great


In [4]:
# Clean and format data
# Let's keep only the columns we want: the review URL, the numeric score and
# the textual rating label.
# The explicit .copy() makes this an independent frame. Later cells modify
# general_data in place, and doing that on a frame derived from a column
# selection is what pandas' SettingWithCopyWarning guards against.
general_data = general_data[['Review link','Rating/10','Rating Comments']].copy()
general_data.head()

Unnamed: 0,Review link,Rating/10,Rating Comments
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,Mediocre
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,Good
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,Fair
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,Fair
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,Great


In [5]:
# We can get all the comment types with:
# print(general_data['Rating Comments'].unique())
# This gives the following list:
# ['Mediocre' 'Good' 'Fair' 'Great' 'Superb' 'Poor' 'Essential' 'Abysmal'
#  'Early Access' 'Terrible' 'Bad' 'Good\x1a' nan 'Essential"width-100 src']
# Let's collapse these labels into three sentiment classes (good / neutral / bad).
# The dictionary below says which class each raw label converts to.
GOOD = 0
NEUTRAL = 1
NUETRAL = NEUTRAL  # original (misspelled) name, kept so existing references still work
BAD = 2
#In case we want to use rating to indicate sentiment
# def GetSentiment(rating):
#     if(rating < 5):
#         return BAD
#     else:
#         return GOOD
# 'Good\x1a' is a corrupted copy of 'Good'; 'Essential"width-100 src' is a
# scraping artefact and is mapped to None so that its rows get dropped below.
comment_dict = {'Mediocre':NEUTRAL,'Good':GOOD,'Fair':NEUTRAL,'Great':GOOD,'Superb':GOOD,
                'Poor':BAD,'Essential':GOOD,'Abysmal':BAD,'Early Access':NEUTRAL,
                'Terrible':BAD,'Bad':BAD,'Good\x1a':GOOD,'Essential"width-100 src':None}
# Convert the rating comments using the dict and store them as 'Sentiment'.
# Series.map (rather than DataFrame.replace) yields a numeric column directly and
# turns any label that is NOT in comment_dict into NaN, so unexpected labels are
# dropped together with the other incomplete rows instead of lingering as raw
# strings inside the label column.
# New frames are built instead of editing in place, so this cell never mutates a
# frame that may be a slice of the loaded data.
general_data = (
    general_data
    .assign(Sentiment=general_data['Rating Comments'].map(comment_dict))
    .drop(columns='Rating Comments')
    .dropna()
)
print(general_data['Sentiment'].unique())
print(f"shape={general_data.shape}")
general_data.head()

[1. 0. 2.]
shape=(12591, 3)


Unnamed: 0,Review link,Rating/10,Sentiment
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,1.0
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,0.0
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,1.0
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,1.0
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,0.0


In [6]:
# Now let's go get the text of the reviews
def _FetchReviewText(link, timeout=30):
    """Download one review page and return its article text.

    Args:
        link: URL of the review page.
        timeout: Seconds to wait on the connection before giving up, so one
            stalled request cannot hang the whole scrape.

    Returns:
        The text of every ``<p dir="ltr">`` inside the article body, each
        followed by a single space, or ``None`` when the expected containers
        are missing or contain no text.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        invalid URL; the caller decides how to handle it.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(link, timeout=timeout) as response:
        html = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    page = BeautifulSoup(html, "html.parser")
    article = page.find(class_="content-body review-article article article--review article--one-column content-body--buffer")
    if article is None:
        return None
    body = article.find(class_="article-body typography-format")
    if body is None:
        return None
    review = "".join(p.text + " " for p in body.find_all("p", dir="ltr"))
    # An empty review is a failed lookup: "" would survive dropna() here but
    # come back as NaN once the CSV is read again.
    return review if review.strip() else None


def GetReviewsFromWeb():
    """Scrape the text of every review linked from ``general_data``.

    Reads the module-level ``general_data`` frame, requests each URL in its
    'Review link' column and stores the scraped text in a new 'Review' column
    (``None`` where the page could not be fetched or parsed). Progress and a
    rough remaining-time estimate are printed every few links.

    Side effects:
        * ``general_data`` is modified in place: the 'Review' column is added
          and rows containing any missing value are then dropped.
        * 'general_data_clean.csv' is written before the rows are dropped and
          'general_data_clean_2.csv' after.
    """
    num_links = len(general_data.index)
    check_interval = 5  # links between progress reports
    reviews = []
    prev_time = time.monotonic()
    for i, link in enumerate(general_data['Review link']):
        if i > 0 and i % check_interval == 0:
            now = time.monotonic()
            # Average seconds per link over the last interval, times links left.
            prediction = (now - prev_time) / check_interval * (num_links - len(reviews))
            clear_output(wait=True)
            print(f"processed: {i} / {num_links}")
            print(f"Time left: {int(prediction)} s")
            prev_time = now
        try:
            reviews.append(_FetchReviewText(link))
        except Exception:
            # Best effort: a failing page is recorded as missing and dropped
            # below. KeyboardInterrupt is deliberately NOT caught, so the run
            # can still be stopped by hand.
            reviews.append(None)
    # In-place edits are intentional: callers rely on the global frame
    # carrying the 'Review' column after this function returns.
    general_data['Review'] = reviews
    general_data.to_csv('general_data_clean.csv',index=False)
    old_size = general_data.shape
    general_data.dropna(inplace=True)
    new_size = general_data.shape
    print(f"Old size: {old_size}")
    print(f"New size: {new_size}")
    # shape is a (rows, columns) tuple; only the row counts can be subtracted.
    print(f"We failed to find {old_size[0] - new_size[0]} reviews")
    general_data.to_csv('general_data_clean_2.csv',index=False)
    
# Run the scrape. This issues one HTTP request per row of general_data and
# writes the result CSVs, so expect this cell to take a long time.
GetReviewsFromWeb()

processed: 5 / 12591
Time left: 1689 s
processed: 10 / 12591
Time left: 1082 s
processed: 15 / 12591
Time left: 526 s
processed: 20 / 12591
Time left: 549 s
processed: 25 / 12591
Time left: 649 s
processed: 30 / 12591
Time left: 374 s
processed: 35 / 12591
Time left: 412 s
processed: 40 / 12591
Time left: 241 s
processed: 45 / 12591
Time left: 260 s
processed: 50 / 12591
Time left: 345 s
processed: 55 / 12591
Time left: 172 s
processed: 60 / 12591
Time left: 167 s
processed: 65 / 12591
Time left: 179 s
processed: 70 / 12591
Time left: 135 s
processed: 75 / 12591
Time left: 170 s
processed: 80 / 12591
Time left: 144 s
processed: 85 / 12591
Time left: 134 s
processed: 90 / 12591
Time left: 105 s
processed: 95 / 12591
Time left: 110 s
processed: 100 / 12591
Time left: 90 s
processed: 105 / 12591
Time left: 105 s
processed: 110 / 12591
Time left: 109 s
processed: 115 / 12591
Time left: 86 s
processed: 120 / 12591
Time left: 82 s
processed: 125 / 12591
Time left: 82 s
processed: 130 / 12591

processed: 1055 / 12591
Time left: 7 s
processed: 1060 / 12591
Time left: 12 s
processed: 1065 / 12591
Time left: 9 s
processed: 1070 / 12591
Time left: 11 s
processed: 1075 / 12591
Time left: 8 s
processed: 1080 / 12591
Time left: 12 s
processed: 1085 / 12591
Time left: 9 s
processed: 1090 / 12591
Time left: 11 s
processed: 1095 / 12591
Time left: 10 s
processed: 1100 / 12591
Time left: 8 s
processed: 1105 / 12591
Time left: 8 s
processed: 1110 / 12591
Time left: 9 s
processed: 1115 / 12591
Time left: 10 s
processed: 1120 / 12591
Time left: 7 s
processed: 1125 / 12591
Time left: 9 s
processed: 1130 / 12591
Time left: 9 s
processed: 1135 / 12591
Time left: 9 s
processed: 1140 / 12591
Time left: 7 s
processed: 1145 / 12591
Time left: 9 s
processed: 1150 / 12591
Time left: 9 s
processed: 1155 / 12591
Time left: 8 s
processed: 1160 / 12591
Time left: 9 s
processed: 1165 / 12591
Time left: 10 s
processed: 1170 / 12591
Time left: 12 s
processed: 1175 / 12591
Time left: 9 s
processed: 1180 /

processed: 2105 / 12591
Time left: 4 s
processed: 2110 / 12591
Time left: 4 s
processed: 2115 / 12591
Time left: 4 s
processed: 2120 / 12591
Time left: 4 s
processed: 2125 / 12591
Time left: 5 s
processed: 2130 / 12591
Time left: 5 s
processed: 2135 / 12591
Time left: 6 s
processed: 2140 / 12591
Time left: 5 s
processed: 2145 / 12591
Time left: 4 s
processed: 2150 / 12591
Time left: 4 s
processed: 2155 / 12591
Time left: 4 s
processed: 2160 / 12591
Time left: 4 s
processed: 2165 / 12591
Time left: 4 s
processed: 2170 / 12591
Time left: 4 s
processed: 2175 / 12591
Time left: 4 s
processed: 2180 / 12591
Time left: 4 s
processed: 2185 / 12591
Time left: 3 s
processed: 2190 / 12591
Time left: 4 s
processed: 2195 / 12591
Time left: 4 s
processed: 2200 / 12591
Time left: 6 s
processed: 2205 / 12591
Time left: 4 s
processed: 2210 / 12591
Time left: 5 s
processed: 2215 / 12591
Time left: 4 s
processed: 2220 / 12591
Time left: 4 s
processed: 2225 / 12591
Time left: 4 s
processed: 2230 / 12591
T

processed: 3160 / 12591
Time left: 2 s
processed: 3165 / 12591
Time left: 2 s
processed: 3170 / 12591
Time left: 3 s
processed: 3175 / 12591
Time left: 3 s
processed: 3180 / 12591
Time left: 2 s
processed: 3185 / 12591
Time left: 3 s
processed: 3190 / 12591
Time left: 2 s
processed: 3195 / 12591
Time left: 2 s
processed: 3200 / 12591
Time left: 2 s
processed: 3205 / 12591
Time left: 3 s
processed: 3210 / 12591
Time left: 2 s
processed: 3215 / 12591
Time left: 2 s
processed: 3220 / 12591
Time left: 3 s
processed: 3225 / 12591
Time left: 2 s
processed: 3230 / 12591
Time left: 3 s
processed: 3235 / 12591
Time left: 2 s
processed: 3240 / 12591
Time left: 3 s
processed: 3245 / 12591
Time left: 3 s
processed: 3250 / 12591
Time left: 2 s
processed: 3255 / 12591
Time left: 3 s
processed: 3260 / 12591
Time left: 3 s
processed: 3265 / 12591
Time left: 2 s
processed: 3270 / 12591
Time left: 2 s
processed: 3275 / 12591
Time left: 2 s
processed: 3280 / 12591
Time left: 2 s
processed: 3285 / 12591
T

processed: 4215 / 12591
Time left: 2 s
processed: 4220 / 12591
Time left: 2 s
processed: 4225 / 12591
Time left: 2 s
processed: 4230 / 12591
Time left: 2 s
processed: 4235 / 12591
Time left: 1 s
processed: 4240 / 12591
Time left: 2 s
processed: 4245 / 12591
Time left: 1 s
processed: 4250 / 12591
Time left: 2 s
processed: 4255 / 12591
Time left: 2 s
processed: 4260 / 12591
Time left: 2 s
processed: 4265 / 12591
Time left: 1 s
processed: 4270 / 12591
Time left: 2 s
processed: 4275 / 12591
Time left: 2 s
processed: 4280 / 12591
Time left: 2 s
processed: 4285 / 12591
Time left: 2 s
processed: 4290 / 12591
Time left: 2 s
processed: 4295 / 12591
Time left: 2 s
processed: 4300 / 12591
Time left: 1 s
processed: 4305 / 12591
Time left: 2 s
processed: 4310 / 12591
Time left: 1 s
processed: 4315 / 12591
Time left: 1 s
processed: 4320 / 12591
Time left: 1 s
processed: 4325 / 12591
Time left: 2 s
processed: 4330 / 12591
Time left: 2 s
processed: 4335 / 12591
Time left: 2 s
processed: 4340 / 12591
T

processed: 5270 / 12591
Time left: 1 s
processed: 5275 / 12591
Time left: 1 s
processed: 5280 / 12591
Time left: 1 s
processed: 5285 / 12591
Time left: 1 s
processed: 5290 / 12591
Time left: 1 s
processed: 5295 / 12591
Time left: 2 s
processed: 5300 / 12591
Time left: 1 s
processed: 5305 / 12591
Time left: 1 s
processed: 5310 / 12591
Time left: 1 s
processed: 5315 / 12591
Time left: 1 s
processed: 5320 / 12591
Time left: 1 s
processed: 5325 / 12591
Time left: 2 s
processed: 5330 / 12591
Time left: 1 s
processed: 5335 / 12591
Time left: 1 s
processed: 5340 / 12591
Time left: 1 s
processed: 5345 / 12591
Time left: 2 s
processed: 5350 / 12591
Time left: 1 s
processed: 5355 / 12591
Time left: 2 s
processed: 5360 / 12591
Time left: 1 s
processed: 5365 / 12591
Time left: 1 s
processed: 5370 / 12591
Time left: 1 s
processed: 5375 / 12591
Time left: 1 s
processed: 5380 / 12591
Time left: 1 s
processed: 5385 / 12591
Time left: 1 s
processed: 5390 / 12591
Time left: 1 s
processed: 5395 / 12591
T

processed: 6325 / 12591
Time left: 1 s
processed: 6330 / 12591
Time left: 1 s
processed: 6335 / 12591
Time left: 1 s
processed: 6340 / 12591
Time left: 1 s
processed: 6345 / 12591
Time left: 1 s
processed: 6350 / 12591
Time left: 1 s
processed: 6355 / 12591
Time left: 1 s
processed: 6360 / 12591
Time left: 1 s
processed: 6365 / 12591
Time left: 1 s
processed: 6370 / 12591
Time left: 1 s
processed: 6375 / 12591
Time left: 1 s
processed: 6380 / 12591
Time left: 1 s
processed: 6385 / 12591
Time left: 1 s
processed: 6390 / 12591
Time left: 1 s
processed: 6395 / 12591
Time left: 1 s
processed: 6400 / 12591
Time left: 1 s
processed: 6405 / 12591
Time left: 1 s
processed: 6410 / 12591
Time left: 1 s
processed: 6415 / 12591
Time left: 1 s
processed: 6420 / 12591
Time left: 1 s
processed: 6425 / 12591
Time left: 1 s
processed: 6430 / 12591
Time left: 1 s
processed: 6435 / 12591
Time left: 1 s
processed: 6440 / 12591
Time left: 1 s
processed: 6445 / 12591
Time left: 1 s
processed: 6450 / 12591
T

processed: 7380 / 12591
Time left: 1 s
processed: 7385 / 12591
Time left: 1 s
processed: 7390 / 12591
Time left: 1 s
processed: 7395 / 12591
Time left: 1 s
processed: 7400 / 12591
Time left: 1 s
processed: 7405 / 12591
Time left: 1 s
processed: 7410 / 12591
Time left: 1 s
processed: 7415 / 12591
Time left: 1 s
processed: 7420 / 12591
Time left: 1 s
processed: 7425 / 12591
Time left: 1 s
processed: 7430 / 12591
Time left: 1 s
processed: 7435 / 12591
Time left: 1 s
processed: 7440 / 12591
Time left: 1 s
processed: 7445 / 12591
Time left: 1 s
processed: 7450 / 12591
Time left: 1 s
processed: 7455 / 12591
Time left: 1 s
processed: 7460 / 12591
Time left: 1 s
processed: 7465 / 12591
Time left: 1 s
processed: 7470 / 12591
Time left: 1 s
processed: 7475 / 12591
Time left: 1 s
processed: 7480 / 12591
Time left: 1 s
processed: 7485 / 12591
Time left: 1 s
processed: 7490 / 12591
Time left: 1 s
processed: 7495 / 12591
Time left: 1 s
processed: 7500 / 12591
Time left: 1 s
processed: 7505 / 12591
T

processed: 8435 / 12591
Time left: 0 s
processed: 8440 / 12591
Time left: 1 s
processed: 8445 / 12591
Time left: 1 s
processed: 8450 / 12591
Time left: 1 s
processed: 8455 / 12591
Time left: 0 s
processed: 8460 / 12591
Time left: 1 s
processed: 8465 / 12591
Time left: 1 s
processed: 8470 / 12591
Time left: 0 s
processed: 8475 / 12591
Time left: 1 s
processed: 8480 / 12591
Time left: 0 s
processed: 8485 / 12591
Time left: 0 s
processed: 8490 / 12591
Time left: 0 s
processed: 8495 / 12591
Time left: 0 s
processed: 8500 / 12591
Time left: 1 s
processed: 8505 / 12591
Time left: 1 s
processed: 8510 / 12591
Time left: 0 s
processed: 8515 / 12591
Time left: 1 s
processed: 8520 / 12591
Time left: 0 s
processed: 8525 / 12591
Time left: 1 s
processed: 8530 / 12591
Time left: 0 s
processed: 8535 / 12591
Time left: 1 s
processed: 8540 / 12591
Time left: 1 s
processed: 8545 / 12591
Time left: 1 s
processed: 8550 / 12591
Time left: 1 s
processed: 8555 / 12591
Time left: 1 s
processed: 8560 / 12591
T

processed: 9490 / 12591
Time left: 0 s
processed: 9495 / 12591
Time left: 4 s
processed: 9500 / 12591
Time left: 0 s
processed: 9505 / 12591
Time left: 0 s
processed: 9510 / 12591
Time left: 0 s
processed: 9515 / 12591
Time left: 1 s
processed: 9520 / 12591
Time left: 0 s
processed: 9525 / 12591
Time left: 0 s
processed: 9530 / 12591
Time left: 0 s
processed: 9535 / 12591
Time left: 1 s
processed: 9540 / 12591
Time left: 0 s
processed: 9545 / 12591
Time left: 0 s
processed: 9550 / 12591
Time left: 0 s
processed: 9555 / 12591
Time left: 0 s
processed: 9560 / 12591
Time left: 1 s
processed: 9565 / 12591
Time left: 0 s
processed: 9570 / 12591
Time left: 0 s
processed: 9575 / 12591
Time left: 0 s
processed: 9580 / 12591
Time left: 0 s
processed: 9585 / 12591
Time left: 0 s
processed: 9590 / 12591
Time left: 1 s
processed: 9595 / 12591
Time left: 0 s
processed: 9600 / 12591
Time left: 0 s
processed: 9605 / 12591
Time left: 0 s
processed: 9610 / 12591
Time left: 0 s
processed: 9615 / 12591
T

processed: 10530 / 12591
Time left: 0 s
processed: 10535 / 12591
Time left: 0 s
processed: 10540 / 12591
Time left: 0 s
processed: 10545 / 12591
Time left: 0 s
processed: 10550 / 12591
Time left: 0 s
processed: 10555 / 12591
Time left: 0 s
processed: 10560 / 12591
Time left: 0 s
processed: 10565 / 12591
Time left: 0 s
processed: 10570 / 12591
Time left: 0 s
processed: 10575 / 12591
Time left: 0 s
processed: 10580 / 12591
Time left: 0 s
processed: 10585 / 12591
Time left: 0 s
processed: 10590 / 12591
Time left: 0 s
processed: 10595 / 12591
Time left: 0 s
processed: 10600 / 12591
Time left: 0 s
processed: 10605 / 12591
Time left: 0 s
processed: 10610 / 12591
Time left: 0 s
processed: 10615 / 12591
Time left: 0 s
processed: 10620 / 12591
Time left: 0 s
processed: 10625 / 12591
Time left: 0 s
processed: 10630 / 12591
Time left: 0 s
processed: 10635 / 12591
Time left: 0 s
processed: 10640 / 12591
Time left: 0 s
processed: 10645 / 12591
Time left: 0 s
processed: 10650 / 12591
Time left: 0 s


processed: 11555 / 12591
Time left: 0 s
processed: 11560 / 12591
Time left: 0 s
processed: 11565 / 12591
Time left: 0 s
processed: 11570 / 12591
Time left: 0 s
processed: 11575 / 12591
Time left: 0 s
processed: 11580 / 12591
Time left: 0 s
processed: 11585 / 12591
Time left: 0 s
processed: 11590 / 12591
Time left: 0 s
processed: 11595 / 12591
Time left: 0 s
processed: 11600 / 12591
Time left: 0 s
processed: 11605 / 12591
Time left: 0 s
processed: 11610 / 12591
Time left: 0 s
processed: 11615 / 12591
Time left: 0 s
processed: 11620 / 12591
Time left: 0 s
processed: 11625 / 12591
Time left: 0 s
processed: 11630 / 12591
Time left: 0 s
processed: 11635 / 12591
Time left: 0 s
processed: 11640 / 12591
Time left: 0 s
processed: 11645 / 12591
Time left: 0 s
processed: 11650 / 12591
Time left: 0 s
processed: 11655 / 12591
Time left: 0 s
processed: 11660 / 12591
Time left: 0 s
processed: 11665 / 12591
Time left: 0 s
processed: 11670 / 12591
Time left: 0 s
processed: 11675 / 12591
Time left: 0 s


processed: 12580 / 12591
Time left: 0 s
processed: 12585 / 12591
Time left: 0 s
processed: 12590 / 12591
Time left: 0 s


In [7]:
# Preview the frame after scraping; it now carries the 'Review' text column.
general_data.head()

Unnamed: 0,Review link,Rating/10,Sentiment,Review
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,1.0,Tales of Symphonia was a formative experience ...
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,0.0,Mario is the most versatile character in the N...
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,1.0,It's difficult to find someone oblivious to th...
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,1.0,"In the alternate history of Atomic Heart, a sc..."
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,0.0,"Take the faces, voices, and over-the-top theat..."


In [28]:
# Spot-check the review stored under index label 58. The recorded output is
# `nan`, i.e. this row has no review text.
# NOTE(review): rows without review text should be removed before training —
# confirm which version of the frame was loaded when this cell was run.
print(general_data['Review'][58])

nan


In [None]:
#split datasets
TEST_SIZE = 0.3
# Features are the scraped review text and labels are the sentiment classes.
# NOTE(review): the original cell left both right-hand sides blank; 'Review' and
# 'Sentiment' are the feature/label columns built earlier in this notebook — confirm.
# Rows missing either value are dropped first so the split never sees NaN
# text or labels.
general_labelled = general_data.dropna(subset=['Review', 'Sentiment'])
general_X = general_labelled['Review']
general_y = general_labelled['Sentiment']
# random_state is fixed so the split is reproducible between runs.
general_X_train, general_X_test, general_y_train, general_y_test = train_test_split(general_X, general_y, test_size = TEST_SIZE, random_state=1)

In [None]:
#create svm
# TODO: placeholder — no SVM is constructed in this cell yet; it only prints 'h'.
print('h')

In [None]:
#train svm with general reviews data set


In [None]:
#Test accuracy of svm
