In [45]:
#pip install scikit-learn
#pip install pandas

# Main imports pandas and sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# We need to do some web scraping to get the review text
import urllib.request
from bs4 import BeautifulSoup

#Other imports
import time
import sys
from os import system
from IPython.display import clear_output

# Load in data
### There are a few steps to getting clean data
- Read data from Kaggle into dataframes
- Get rid of unnecessary features of the data
- (For general data) get the review text from the web
- Clean data using `dropna`

## Read in data from csv

In [46]:
# Lets define our file paths for reading in data
# download dataset at: https://www.kaggle.com/datasets/joyshil0599/multi-decade-video-game-review-dataset
GENERAL_DATASET_PATH = "data/general_reviews.csv"
# download dataset at: https://www.kaggle.com/datasets/noahx1/elden-ring-steam-reviews
ELDEN_RING_DATASET_PATH = "data/elden_ring_steam_reviews.csv"
# Path to save cleaned general data file
GENERAL_CLEAN_DATA_PATH = 'data/general_data_clean.csv'

#load in dataset
# We have two datasets:
# one using gamespot reviews (general_data)
# the other Elden Ring Steam reviews (elden_ring_data)
general_data = pd.read_csv(GENERAL_DATASET_PATH)
elden_ring_data = pd.read_csv(ELDEN_RING_DATASET_PATH)


In [47]:
# Lets only keep the columns we want
general_data = general_data[['Review link','Rating/10']]
elden_ring_data = elden_ring_data[['voted_up','review']]

# Read in reviews from web (for general data)

In [48]:
# Now lets define how to get the text of the reviews from the web
def GetReviewsFromWeb():
    num_processed = 1
    prev_time = time.time()
    num_links = len(general_data.index)
    check_interval = 5
    reviews = []
    failed_links = []
    for link in general_data['Review link']:
        # Try to get the review text
        try:
            page = urllib.request.urlopen(link).read()
            page = BeautifulSoup(page)
            review = ""
            body = page.find(class_="article-body typography-format")
            paragraphs = body.find_all("p")
            if(len(paragraphs)==0):
                raise Exception('NO REVIEW TEXT FOUND')
            for p in paragraphs:
                review+=p.text+" "
            reviews.append(review)
        # If we could not get the review text
        except:
            reviews.append(None)
            failed_links.append(link)
        # Display progress
        num_processed = num_processed + 1
        if(num_processed % check_interval == 0):
            prediction = (time.time() - prev_time)/(check_interval) * (num_links-len(reviews))
            prev_time = time.time()
            system('cls')
            clear_output(wait=True)
            print(f"Reviews lost: {len(failed_links)}")
            print(f"processed: {num_processed} / {num_links}")
            print(f"Time left: {int(prediction)} s")  
    #Update dataframe and write to file
    general_data['Review'] = reviews
    # Now lets finish cleaning the data by dropping any rows where we do not have a review
    old_size = general_data.shape
    general_data.dropna(inplace=True)
    general_data.to_csv(GENERAL_CLEAN_DATA_PATH)
    new_size = general_data.shape
    print("Dropping invalid reviews.")
    print(f"Old count: {old_size[0]}")
    print(f"New count: {new_size[0]}")
    print(f"Removed {old_size[0]-new_size[0]} reviews.")

In [49]:
# If we don't already have the clean data we need to get it from the web
file_found = True
try:
    general_data = pd.read_csv(GENERAL_CLEAN_DATA_PATH)
except FileNotFoundError as e:
    file_found = False
if(not file_found):
    GetReviewsFromWeb()


# Now let's define our classifications (Sentiment)

In [50]:
# We define 2 sentiments to classify
GOOD = 'Good'
BAD = 'Bad'


In [51]:
# General data
# I tried to choose values for rating that made sense and split the data kind of evenly
def GetSentimentFromRating(rating):
    if(rating is None):
        return None
    if(rating >= 7):
        return GOOD
    if(rating > 0):
        return BAD
s = []
for rating in general_data['Rating/10'].to_list():
    s.append(GetSentimentFromRating(rating))
general_data['Sentiment'] = s
general_data.dropna(inplace=True)
print("DATA DISTRIBUTION:")
SENTIMENTS = general_data['Sentiment'].unique()
for s in SENTIMENTS:
    print(f"{s}: {sum(1 for i in general_data['Sentiment'] if i == s)}")
general_data.head()

DATA DISTRIBUTION:
Bad: 487
Good: 1160


Unnamed: 0,Review link,Rating/10,Sentiment,Review
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,Bad,Tales of Symphonia was a formative experience ...
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,Good,Mario is the most versatile character in the N...
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,Bad,It's difficult to find someone oblivious to th...
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,Bad,"In the alternate history of Atomic Heart, a sc..."
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,Good,"Take the faces, voices, and over-the-top theat..."


In [53]:
#elden ring data
elden_ring_data = elden_ring_data[['review','voted_up']]
elden_ring_data.rename(columns={'review':'Review'},inplace=True)
def GetSentimentFromUpVote(up_vote):
    if(up_vote):
        return GOOD
    else:
        return BAD
s = []
for upvote in elden_ring_data['voted_up'].to_list():
    s.append(GetSentimentFromUpVote(upvote))
elden_ring_data['Sentiment'] = s
elden_ring_data.dropna(inplace=True)
print("DATA DISTRIBUTION:")
SENTIMENTS = elden_ring_data['Sentiment'].unique()
for s in SENTIMENTS:
    print(f"{s}: {sum(1 for i in elden_ring_data['Sentiment'] if i == s)}")
elden_ring_data.head()

DATA DISTRIBUTION:
Good: 9174
Bad: 591


Unnamed: 0,Review,voted_up,Sentiment
0,being killed over and over again is fun,True,Good
1,I write this review as I have 100% completed E...,True,Good
2,Fun,True,Good
3,pretty cool.,True,Good
4,AMAZINGGGGGGGGGGGGG,True,Good


# Now let's create an SVM
## We do this in a function so that we can reuse it later

In [None]:
def CreateSVM(df):
    #split datasets
    TEST_SIZE = 0.3
    X = df['Review']
    y = df['Sentiment'] 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE, random_state=1) 
    X_train.to_list()
    X_test = X_test.to_list()
    y_train = y_train.to_list()
    y_test = y_test.to_list()

In [8]:
#split datasets
TEST_SIZE = 0.3
general_X = general_data['Review']
general_y = general_data['Sentiment']
general_X_train, general_X_test, general_y_train, general_y_test = train_test_split(general_X, general_y, test_size = TEST_SIZE, random_state=1)
general_X_train = general_X_train.to_list()
general_X_test = general_X_test.to_list()
general_y_train = general_y_train.to_list()
general_y_test = general_y_test.to_list()

# Creating svm
#### Helpful link to understand how this works:
- https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1



## Vectorize the data
We will be using a method called **Term Frequency–Inverse Document Frequency (TF-IDF)**.
Here is how it works: https://medium.com/@vasista/preparing-the-text-data-with-scikit-learn-b31a3df567e

#### sklearn supplies an easy way to implement this

In [9]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,max_df = 0.8,sublinear_tf = True,use_idf = True)
train_vectors = vectorizer.fit_transform(general_X_train)
test_vectors = vectorizer.transform(general_X_test)

## Create a Linear SVM model

In [10]:
svm_linear=svm.SVC(kernel='linear')
print('training SVM...')
start_time = time.time()
svm_linear.fit(train_vectors, general_y_train)
end_time = time.time()
print(f"Trained SVM in {end_time-start_time} s")

training SVM...
Trained SVM in 4.233425617218018 s


In [11]:
print('Testing SVM ...')
start_time = time.time()
prediction = svm_linear.predict(test_vectors)
end_time = time.time()
print(f"Tested SVM in {end_time-start_time} s")

Testing SVM ...
Tested SVM in 1.7265186309814453 s


## Linear SVM general data results

In [12]:
report_linear = classification_report(general_y_test, prediction,output_dict=True,zero_division=0)
for key in report_linear.keys():
    print(f'{key}: {report_linear[key]}\n')


Bad: {'precision': 0.729903536977492, 'recall': 0.822463768115942, 'f1-score': 0.7734241908006815, 'support': 276}

Good: {'precision': 0.7336956521739131, 'recall': 0.6164383561643836, 'f1-score': 0.6699751861042184, 'support': 219}

accuracy: 0.7313131313131314

macro avg: {'precision': 0.7317995945757025, 'recall': 0.7194510621401629, 'f1-score': 0.7216996884524499, 'support': 495}

weighted avg: {'precision': 0.731581260670454, 'recall': 0.7313131313131314, 'f1-score': 0.7276558432683069, 'support': 495}



# Test SVM on Elden Ring Reviews

In [13]:
# Preview the first rows of the Elden Ring dataframe before vectorizing it
elden_ring_data.head()

Unnamed: 0,review,voted_up,Sentiment
0,being killed over and over again is fun,True,Good
1,I write this review as I have 100% completed E...,True,Good
2,Fun,True,Good
3,pretty cool.,True,Good
4,AMAZINGGGGGGGGGGGGG,True,Good


In [14]:
# Create vectors
elden_ring_test_X = elden_ring_data['review'].to_list()
elden_ring_test_y = elden_ring_data['Sentiment'].to_list()
elden_ring_vectors = vectorizer.transform(elden_ring_test_X)

In [15]:
# Test on elden ring reviews
print('Testing SVM on Elden Ring Reviews...')
start_time = time.time()
prediction = svm_linear.predict(elden_ring_vectors)
end_time = time.time()
print(f"Tested SVM in {end_time-start_time} s")

Testing SVM on Elden Ring Reviews...
Tested SVM in 5.646575689315796 s


## Elden ring SVM results

In [16]:
report_elden_ring = classification_report(elden_ring_test_y, prediction,output_dict=True,zero_division=0)
for key in report_elden_ring.keys():
    print(f'{key}: {report_elden_ring[key]}\n')


Bad: {'precision': 0.06591380502419912, 'recall': 0.9678510998307953, 'f1-score': 0.12342215988779803, 'support': 591}

Good: {'precision': 0.9825206991720331, 'recall': 0.11641595814257685, 'f1-score': 0.20816684533671181, 'support': 9174}

accuracy: 0.1679467485919099

macro avg: {'precision': 0.5242172520981161, 'recall': 0.5421335289866861, 'f1-score': 0.16579450261225492, 'support': 9765}

weighted avg: {'precision': 0.9270455661007202, 'recall': 0.1679467485919099, 'f1-score': 0.20303790431261473, 'support': 9765}



# Now let's do the opposite: train on Elden Ring and test on General

In [29]:
#split dataset
TEST_SIZE = 0.3
elden_ring_SVM_X = elden_ring_data['review']
elden_ring_SVM_y = elden_ring_data['Sentiment']
elden_ring_SVM_X_train, elden_ring_SVM_X_test, elden_ring_SVM_y_train, elden_ring_SVM_y_test = train_test_split(elden_ring_SVM_X, elden_ring_SVM_y, test_size = TEST_SIZE, random_state=1)
elden_ring_SVM_X_train = elden_ring_X_train.to_list()
elden_ring_SVM_X_test = elden_ring_X_test.to_list()
elden_ring_SVM_y_train = elden_ring_y_train.to_list()
elden_ring_SVM_y_test = elden_ring_y_test.to_list()

AttributeError: 'list' object has no attribute 'to_list'

## Vectorize the data again

In [18]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,max_df = 0.8,sublinear_tf = True,use_idf = True)
train_vectors = vectorizer.fit_transform(general_X_train)
test_vectors = vectorizer.transform(general_X_test)

## Create a Linear SVM model

In [19]:
svm_linear=svm.SVC(kernel='linear')
print('training SVM...')
start_time = time.time()
svm_linear.fit(train_vectors, general_y_train)
end_time = time.time()
print(f"Trained SVM in {end_time-start_time} s")

training SVM...
Trained SVM in 4.2284369468688965 s


In [20]:
print('Testing SVM ...')
start_time = time.time()
prediction = svm_linear.predict(test_vectors)
end_time = time.time()
print(f"Tested SVM in {end_time-start_time} s")

Testing SVM ...
Tested SVM in 1.7260348796844482 s


# Let's test it on the test Elden Ring data

In [25]:
# Test on elden ring reviews
print('Testing SVM on Elden Ring Reviews...')
start_time = time.time()
prediction = svm_linear.predict(test_vectors)
end_time = time.time()
print(f"Tested SVM in {end_time-start_time} s")

Testing SVM on Elden Ring Reviews...
Tested SVM in 1.7238216400146484 s


## Results

In [26]:
report_elden_ring = classification_report(elden_ring_y_test, prediction, output_dict=True,zero_division=0)
for key in report_elden_ring.keys():
    print(f'{key}: {report_elden_ring[key]}\n')


ValueError: Found input variables with inconsistent numbers of samples: [2930, 495]