# Sample Testing

In [4]:
import numpy as np
import pandas as pd
import arff
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from urllib.request import urlretrieve

def load_game_data():
    """Download the PC-Games-2020 ARFF dataset from OpenML and load it.

    The download is saved as ``pc_game_dataset.arff`` in the working
    directory, parsed with ``arff.load`` and filtered with ``clean_data``.

    Returns:
        tuple: ``(data, attributes)`` -- the cleaned data rows and the
        dataset's ``'attributes'`` entry, both as NumPy arrays.
    """
    url = 'https://api.openml.org/data/v1/download/22102514/PC-Games-2020.arff'
    filename = 'pc_game_dataset.arff'
    # urlretrieve also returns the response headers, which are not needed here.
    file, _ = urlretrieve(url, filename)
    # Parse inside a context manager so the file handle is closed as soon as
    # loading finishes instead of being left open.
    with open(file, 'r') as arff_file:
        dataset = arff.load(arff_file)
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    data = clean_data(data)
    return data, attributes

# Use this to save bandwidth and time if the project has the data file already downloaded
def load_game_data_from_file():
    """Load the dataset from the local ``pc_game_dataset.arff`` file.

    Performs the same parsing and cleaning as ``load_game_data`` but skips
    the download, so the file must already exist in the working directory.

    Returns:
        tuple: ``(data, attributes)`` -- the cleaned data rows and the
        dataset's ``'attributes'`` entry, both as NumPy arrays.
    """
    file = 'pc_game_dataset.arff'
    # Parse inside a context manager so the file handle is closed as soon as
    # loading finishes instead of being left open.
    with open(file, 'r') as arff_file:
        dataset = arff.load(arff_file)
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    data = clean_data(data)
    return data, attributes

def clean_data(data):
    """Keep only the rows whose columns 6 and 25 are both populated.

    A value counts as populated when it is neither the empty string nor
    ``None``. Elsewhere in this notebook column 6 is read as the
    comma-separated genre list and column 25 as the text fed to the TF-IDF
    vectorizer.

    Args:
        data: Iterable of indexable rows with at least 26 columns.

    Returns:
        numpy.ndarray: The surviving rows, in their original order.
    """
    def is_populated(value):
        return value != "" and value is not None

    kept_rows = [
        row for row in data
        if is_populated(row[6]) and is_populated(row[25])
    ]
    return np.array(kept_rows)

# Load from the local ARFF copy (no download): A holds the cleaned data rows,
# b the dataset's attribute descriptors.
A, b = load_game_data_from_file()

import string

def process_string(subject):
    """Normalise a raw term for use as a label.

    Surrounding whitespace is trimmed first, then the text is lower-cased and
    every ASCII punctuation character (``string.punctuation``) is removed.

    Args:
        subject (str): The raw term.

    Returns:
        str: The normalised term.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    return subject.strip().lower().translate(punctuation_remover)

# Collect the distinct normalised genre names found in column 6; each entry
# there is a comma-separated list of genres.
results = set()
genres = A[:,6]
for entry in genres:
    terms = str(entry).split(',')
    for term in terms:
        results.add(process_string(term))
# Sort the labels: a set of strings has no stable iteration order between
# interpreter sessions, so list(results) would shuffle the label columns of y
# (and the order of predicted genres) from one kernel run to the next.
y_labels = sorted(results)

# Multi-hot encode the genres: one row per game and one column per entry of
# y_labels, holding 1 where the game lists that genre and 0 elsewhere.
y = []
for entry in A:
    entry_genres = [process_string(genre) for genre in str(entry[6]).split(',')]
    hot_columns = {y_labels.index(name) for name in entry_genres}
    y.append([1 if column in hot_columns else 0 for column in range(len(y_labels))])

y = np.array(y)

# Build the TF-IDF feature matrix from column 25, casting every value to str
# before it is handed to the vectorizer.
docs = list(map(str, A[:,25]))
tv = TfidfVectorizer(
    smooth_idf=True,
    sublinear_tf=True,
    max_df=0.5,
    min_df=5,
    lowercase=True,
    stop_words='english',
)
X = tv_result = tv.fit_transform(docs)

# Show both shapes so the row counts of X and y can be compared at a glance.
print(X.shape, y.shape, sep="\n")

(27094, 22979)
(27094, 20)


In [6]:
# Accumulates one result row per tested game; the prediction cell appends to
# it, so re-running this cell clears the collected results.
sample_testing_results = list()

In the cell below, enter the title and description of any game in the Steam library that was published after 2020, together with the genres listed on its store page (`actual`). The cells that follow compare those listed genres with the genres predicted here.

In [7]:
# Game under test: its title, its description text and the genres to compare
# the prediction against.
title = "Noita"
# Kept as a one-element list: it is passed as-is to the vectorizer, and the
# text itself is read back with description[0].
description = [
    "Noita is a magical action roguelite set in a world where every pixel is "
    "physically simulated. Fight, explore, melt, burn, freeze and evaporate "
    "your way through the procedurally generated world using spells you've "
    "created yourself."
]
actual = [
    "action",
    "adventure",
    "indie",
    "rpg",
    "simulation",
]

In [8]:
# Train one linear SVM per genre (one-vs-rest) on the TF-IDF features.
# random_state is pinned so the solver's pseudo-random data shuffling, and
# therefore the fitted model, is the same on every Restart & Run All.
classifier = OneVsRestClassifier(
    LinearSVC(C=0.9, loss='hinge', max_iter=50000, random_state=0)
)
classifier.fit(X, y)
print("Classifier Trained")

Classifier Trained


In [10]:
# Vectorise the sample with the vectorizer that was fitted on the training
# corpus. Building a second TfidfVectorizer and calling fit_transform on the
# single description re-learned the IDF weights from that one document, so the
# sample's features did not match the ones the classifier was trained on.
sample_result = tv.transform(description)
y_pred = classifier.predict(sample_result)

# Translate the indicator row of the single sample back into genre names.
result = [label for label, flag in zip(y_labels, y_pred[0]) if flag != 0]

# Whether this exact title occurs in column 2 of the dataset (presumably the
# title column -- confirm against the dataset attributes). Coerced to a plain
# bool so the table holds True/False rather than a 0-d NumPy array.
trained = bool(np.isin(title, A[:, 2]))

# Compare on normalised names so genres typed as shown on the store page
# (e.g. "Action", "RPG") still match the lower-cased, punctuation-free labels.
actual_normalised = {process_string(genre) for genre in actual}
correct = sum(1 for genre in result if genre in actual_normalised)
incorrect = len(result) - correct
correct_ratio = f"{correct}/{len(actual)}"
test_output = [trained, title, description[0], actual, result, correct_ratio, incorrect]

# Results accumulate across runs so several games can be compared in one table.
sample_testing_results.append(test_output)

columns = ['Already in training set', 'Title', 'Description', 'Actual genres', 'Predicted Genres', 'Correct Genres', 'Incorrect Genres']
pd.DataFrame(sample_testing_results, columns=columns)

Unnamed: 0,Already in training set,Title,Description,Actual genres,Predicted Genres,Correct Genres,Incorrect Genres
0,True,Noita,Noita is a magical action roguelite set in a w...,"[action, adventure, indie, rpg, simulation]","[action, adventure, indie]",3/5,0
