In [1]:
#pip install scikit-learn
#pip install pandas

# Main imports pandas and sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# We need to do some web scraping to get the review text
import urllib.request
from bs4 import BeautifulSoup

#Other imports
import time
import sys
from os import system
from IPython.display import clear_output

# Load in data
### There are a few steps to getting clean data
- Read the data from Kaggle into dataframes
- Get rid of unnecessary features of the data
- (For general data) get the review text from the web
- Clean the data using `dropna`

## Read in data from csv

In [2]:
# File locations used throughout the notebook.
# Raw GameSpot reviews, download from:
#   https://www.kaggle.com/datasets/joyshil0599/multi-decade-video-game-review-dataset
GENERAL_DATASET_PATH = "data/general_reviews.csv"
# Raw Elden Ring Steam reviews, download from:
#   https://www.kaggle.com/datasets/noahx1/elden-ring-steam-reviews
ELDEN_RING_DATASET_PATH = "data/elden_ring_steam_reviews.csv"
# Where the cleaned general data (with scraped review text) is saved
GENERAL_CLEAN_DATA_PATH = "data/general_data_clean.csv"

# Load the two raw datasets:
#   general_data    -> GameSpot reviews
#   elden_ring_data -> Elden Ring Steam reviews
general_data = pd.read_csv(GENERAL_DATASET_PATH)
elden_ring_data = pd.read_csv(ELDEN_RING_DATASET_PATH)


In [3]:
# Let's only keep the columns we want.
# .copy() gives each frame its own data: later cells add columns to these
# frames and rename in place, which on a plain column slice triggers pandas'
# SettingWithCopyWarning.
general_data = general_data[['Review link','Rating/10']].copy()
elden_ring_data = elden_ring_data[['voted_up','review']].copy()

# Read in reviews from web (for general data)

In [4]:
# Now let's define how to get the text of the reviews from the web
def GetReviewsFromWeb(check_interval=5, timeout=30):
    """Download the review text for every link in ``general_data``.

    Each URL in ``general_data['Review link']`` is fetched and the text of all
    ``<p>`` elements inside the ``article-body typography-format`` container
    is joined into one string. Links that cannot be fetched or parsed get
    ``None`` so the result stays aligned with the dataframe rows.

    The result is stored in a new ``general_data['Review']`` column; nothing
    is written to disk here.

    Parameters
    ----------
    check_interval : int, default 5
        Print a progress update every ``check_interval`` links (must be > 0).
    timeout : float, default 30
        Seconds to wait for each HTTP request before counting the link as
        failed, so that one unresponsive server cannot stall the whole run.
    """
    num_links = len(general_data.index)
    reviews = []
    failed_links = []
    prev_time = time.time()
    # Counting from 1 makes num_processed the number of links actually handled
    for num_processed, link in enumerate(general_data['Review link'], start=1):
        # Try to get the review text
        try:
            # The context manager closes the connection even if reading fails
            with urllib.request.urlopen(link, timeout=timeout) as response:
                html = response.read()
            # Name the parser explicitly: avoids bs4's "no parser specified"
            # warning and keeps results independent of which parsers are installed
            page = BeautifulSoup(html, "html.parser")
            body = page.find(class_="article-body typography-format")
            paragraphs = body.find_all("p") if body is not None else []
            if not paragraphs:
                raise ValueError('NO REVIEW TEXT FOUND')
            review = "".join(p.text + " " for p in paragraphs)
        # If we could not get the review text.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        except Exception:
            review = None
            failed_links.append(link)
        reviews.append(review)
        # Display progress
        if num_processed % check_interval == 0:
            now = time.time()
            # Average seconds per link over the last interval, times links remaining
            prediction = (now - prev_time) / check_interval * (num_links - num_processed)
            prev_time = now
            # clear_output is enough inside a notebook; the previous
            # system('cls') call only exists on Windows shells
            clear_output(wait=True)
            print(f"Reviews lost: {len(failed_links)}")
            print(f"processed: {num_processed} / {num_links}")
            print(f"Time left: {int(prediction)} s")
    # Attach the scraped text to the dataframe (None where scraping failed)
    general_data['Review'] = reviews
    

In [5]:
def CleanData():
    """Drop incomplete rows from ``general_data`` and save the cleaned frame.

    Rows containing any missing value (for example a review that could not be
    scraped) are removed in place, the result is written to
    ``GENERAL_CLEAN_DATA_PATH`` without the index, and a short summary of how
    many rows were removed is printed.
    """
    rows_before = general_data.shape[0]
    general_data.dropna(inplace=True)
    general_data.to_csv(GENERAL_CLEAN_DATA_PATH, index=False)
    rows_after = general_data.shape[0]
    print("Dropping invalid reviews.")
    print(f"Old count: {rows_before}")
    print(f"New count: {rows_after}")
    print(f"Removed {rows_before - rows_after} reviews.")
    

In [6]:
# Reuse the cleaned data if it was saved by an earlier run; otherwise
# build it by scraping the review text from the web and cleaning it
try:
    general_data = pd.read_csv(GENERAL_CLEAN_DATA_PATH)
except FileNotFoundError:
    file_found = False
else:
    file_found = True

if not file_found:
    GetReviewsFromWeb()
    CleanData()

# Now let's define our classifications (Sentiment)

In [7]:
# The two sentiment labels every review is classified into
GOOD, BAD = 'Good', 'Bad'

In [8]:
# General data
# I tried to choose values for rating that made sense and split the data kind of evenly
def GetSentimentFromRating(rating, good_threshold=7):
    """Map a numeric rating out of 10 to a sentiment label.

    Parameters
    ----------
    rating : number or missing value
        The review score. ``None``, ``NaN`` and ``pd.NA`` count as missing.
    good_threshold : number, default 7
        Ratings strictly above this value are labelled ``GOOD``.

    Returns
    -------
    ``GOOD`` if ``rating > good_threshold``, ``BAD`` if
    ``0 < rating <= good_threshold``, and ``None`` when the rating is missing
    or not positive (callers drop those rows with ``dropna``).
    """
    # pd.isna handles None, float NaN and pd.NA alike; a plain `is None`
    # check lets pd.NA through, which then raises when used in an `if`
    if pd.isna(rating):
        return None
    if rating > good_threshold:
        return GOOD
    if rating > 0:
        return BAD
    # Zero or negative ratings carry no usable label
    return None
# Label every review from its rating, then drop rows left without a value
general_data['Sentiment'] = [
    GetSentimentFromRating(rating)
    for rating in general_data['Rating/10'].to_list()
]
general_data.dropna(inplace=True)

# Show how many reviews fall into each sentiment
print("DATA DISTRIBUTION:")
SENTIMENTS = general_data['Sentiment'].unique()
for label in SENTIMENTS:
    label_count = (general_data['Sentiment'] == label).sum()
    print(f"{label}: {label_count}")
general_data.head()

DATA DISTRIBUTION:
Bad: 6743
Good: 5823


Unnamed: 0,Review link,Rating/10,Sentiment,Review
0,https://www.gamespot.com/reviews/tales-of-symp...,5.0,Bad,Tales of Symphonia was a formative experience ...
1,https://www.gamespot.com/reviews/kirbys-return...,7.0,Bad,Mario is the most versatile character in the N...
2,https://www.gamespot.com/reviews/hogwarts-lega...,6.0,Bad,Hogwarts Legacy is developed by Avalanche Soft...
3,https://www.gamespot.com/reviews/atomic-heart-...,6.0,Bad,Atomic Heart doesn't hide its BioShock Infinit...
4,https://www.gamespot.com/reviews/like-a-dragon...,8.0,Good,"Take the faces, voices, and over-the-top theat..."


In [9]:
#elden ring data
# Rename first and select afterwards: rename ignores a missing 'review'
# column, so this cell can be re-run after the column has already been
# renamed (selecting 'review' first would raise KeyError on a second run).
# .copy() avoids SettingWithCopyWarning when 'Sentiment' is added below.
elden_ring_data = (
    elden_ring_data
    .rename(columns={'review': 'Review'})[['Review', 'voted_up']]
    .copy()
)
def GetSentimentFromUpVote(up_vote):
    """Return ``GOOD`` when ``up_vote`` is truthy, otherwise ``BAD``."""
    return GOOD if up_vote else BAD
# Label every review from its up-vote flag, then drop rows with missing values
elden_ring_data['Sentiment'] = [
    GetSentimentFromUpVote(upvote)
    for upvote in elden_ring_data['voted_up'].to_list()
]
elden_ring_data.dropna(inplace=True)

# Show how many reviews fall into each sentiment
print("DATA DISTRIBUTION:")
SENTIMENTS = elden_ring_data['Sentiment'].unique()
for label in SENTIMENTS:
    label_count = (elden_ring_data['Sentiment'] == label).sum()
    print(f"{label}: {label_count}")
elden_ring_data.head()

DATA DISTRIBUTION:
Good: 9174
Bad: 591


Unnamed: 0,Review,voted_up,Sentiment
0,being killed over and over again is fun,True,Good
1,I write this review as I have 100% completed E...,True,Good
2,Fun,True,Good
3,pretty cool.,True,Good
4,AMAZINGGGGGGGGGGGGG,True,Good


# Now we split our data

In [10]:
def SplitData(df, test_split=0.3, random_state=1):
    """Split a labelled review dataframe into train and test lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Review'`` column (text) and a ``'Sentiment'``
        column (label).
    test_split : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test : list
        All four are plain Python lists.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df['Review'],
        df['Sentiment'],
        test_size=test_split,
        random_state=random_state,
    )
    # Convert all four pieces; previously the X_train conversion result was
    # discarded, so X_train came back as a Series while the rest were lists
    return (
        X_train.to_list(),
        X_test.to_list(),
        y_train.to_list(),
        y_test.to_list(),
    )

In [11]:
# Split both datasets using SplitData's default test_split
general_X_train, general_X_test, general_y_train, general_y_test = SplitData(general_data)
elden_ring_X_train, elden_ring_X_test, elden_ring_y_train, elden_ring_y_test = SplitData(elden_ring_data)

## Vectorize the data
We will be using a method called **Term Frequency and Inverse Document Frequency (TF-IDF)**. 
Here is how it works: https://medium.com/@vasista/preparing-the-text-data-with-scikit-learn-b31a3df567e

#### sklearn supplies an easy way to implement this

In [12]:
def CreateVectorizer(X_train):
    """Fit a TF-IDF vectorizer on the training texts.

    Returns a ``(vectorizer, train_vectors)`` pair: the fitted
    ``TfidfVectorizer`` and the result of ``fit_transform`` on ``X_train``.
    """
    tfidf_options = {
        'min_df': 5,
        'max_df': 0.8,
        'sublinear_tf': True,
        'use_idf': True,
    }
    tfidf = TfidfVectorizer(**tfidf_options)
    fitted_vectors = tfidf.fit_transform(X_train)
    return tfidf, fitted_vectors

# Creating svm
#### Helpful link to understand how this works:
- https://medium.com/@vasista/sentiment-analysis-using-svm-338d418e3ff1



In [13]:
def TestSVM(svm_linear, vectorizer, X_test, y_test):
    """Evaluate a trained classifier on a test set and print the results.

    ``X_test`` is transformed with the given (already fitted) ``vectorizer``,
    ``svm_linear`` predicts on the result, and the prediction time is printed.
    Every entry of the classification report is then printed and the report
    dictionary is returned.
    """
    features = vectorizer.transform(X_test)
    print('Testing SVM ...')
    started = time.time()
    predicted = svm_linear.predict(features)
    finished = time.time()
    print(f"Tested SVM in {finished-started} s")
    report = classification_report(y_test, predicted, output_dict=True, zero_division=0)
    for label, scores in report.items():
        print(f'{label}: {scores}\n')
    return report

In [14]:
def CreateSVM(x_train, y_train):
    """Train a linear-kernel SVM on TF-IDF features of the training texts.

    The vectorizer is built and fitted with ``CreateVectorizer``; the time
    spent in ``fit`` is printed. Returns ``(classifier, vectorizer)`` so the
    same vectorizer can be reused to transform test data.
    """
    vectorizer, train_vectors = CreateVectorizer(x_train)
    classifier = svm.SVC(kernel='linear')
    print('training SVM...')
    started = time.time()
    classifier.fit(train_vectors, y_train)
    finished = time.time()
    print(f"Trained SVM in {finished-started} s")
    return classifier, vectorizer

# Train SVM with general data

In [15]:
# Fit the SVM and its TF-IDF vectorizer on the general training split
general_svm, general_vectorizer = CreateSVM(general_X_train, general_y_train)

training SVM...
Trained SVM in 161.44585418701172 s


# Test general_svm on general test data

In [16]:
# In-domain evaluation: general model on the general test split
# (`report` is reassigned by every evaluation cell)
report = TestSVM(general_svm, general_vectorizer, general_X_test, general_y_test)

Testing SVM ...
Tested SVM in 67.00135326385498 s
Bad: {'precision': 0.8589935226706528, 'recall': 0.8534653465346534, 'f1-score': 0.8562205115470574, 'support': 2020}

Good: {'precision': 0.8321043675553035, 'recall': 0.8382857142857143, 'f1-score': 0.8351836037574722, 'support': 1750}

accuracy: 0.8464190981432361

macro avg: {'precision': 0.8455489451129781, 'recall': 0.8458755304101839, 'f1-score': 0.8457020576522648, 'support': 3770}

weighted avg: {'precision': 0.8465118193677718, 'recall': 0.8464190981432361, 'f1-score': 0.8464553686739077, 'support': 3770}



# Test general_svm on elden ring data

In [17]:
# Cross-domain evaluation: general model (and its vectorizer) on the Elden Ring test split
report = TestSVM(general_svm, general_vectorizer, elden_ring_X_test, elden_ring_y_test)

Testing SVM ...
Tested SVM in 9.404067993164062 s
Bad: {'precision': 0.06494960806270997, 'recall': 1.0, 'f1-score': 0.12197686645636174, 'support': 174}

Good: {'precision': 1.0, 'recall': 0.09107402031930334, 'f1-score': 0.16694379780512136, 'support': 2756}

accuracy: 0.14505119453924914

macro avg: {'precision': 0.5324748040313549, 'recall': 0.5455370101596517, 'f1-score': 0.14446033213074155, 'support': 2930}

weighted avg: {'precision': 0.9444714101716422, 'recall': 0.14505119453924914, 'f1-score': 0.16427340666017795, 'support': 2930}



# Train SVM with elden ring data

In [18]:
# Fit a second SVM and TF-IDF vectorizer on the Elden Ring training split
elden_ring_svm, elden_ring_vectorizer = CreateSVM(elden_ring_X_train, elden_ring_y_train)

training SVM...
Trained SVM in 0.9890315532684326 s


# Test elden_ring_svm on elden ring test data

In [19]:
# In-domain evaluation: Elden Ring model on the Elden Ring test split
report = TestSVM(elden_ring_svm, elden_ring_vectorizer, elden_ring_X_test, elden_ring_y_test)

Testing SVM ...
Tested SVM in 0.3330652713775635 s
Bad: {'precision': 0.8043478260869565, 'recall': 0.21264367816091953, 'f1-score': 0.33636363636363636, 'support': 174}

Good: {'precision': 0.95249653259362, 'recall': 0.9967343976777939, 'f1-score': 0.974113475177305, 'support': 2756}

accuracy: 0.9501706484641638

macro avg: {'precision': 0.8784221793402882, 'recall': 0.6046890379193567, 'f1-score': 0.6552385557704706, 'support': 2930}

weighted avg: {'precision': 0.9436986230604597, 'recall': 0.9501706484641638, 'f1-score': 0.9362402765583362, 'support': 2930}



# Test elden_ring_svm on general data

In [20]:
# Cross-domain evaluation: Elden Ring model (and its vectorizer) on the general test split
report = TestSVM(elden_ring_svm, elden_ring_vectorizer, general_X_test, general_y_test)

Testing SVM ...
Tested SVM in 2.3496575355529785 s
Bad: {'precision': 0.7410714285714286, 'recall': 0.20544554455445543, 'f1-score': 0.32170542635658916, 'support': 2020}

Good: {'precision': 0.5, 'recall': 0.9171428571428571, 'f1-score': 0.6471774193548386, 'support': 1750}

accuracy: 0.5358090185676393

macro avg: {'precision': 0.6205357142857143, 'recall': 0.5612942008486563, 'f1-score': 0.4844414228557139, 'support': 3770}

weighted avg: {'precision': 0.6291682455475559, 'recall': 0.5358090185676393, 'f1-score': 0.47278659021519304, 'support': 3770}

