# Video Game Classifier Project
Tyler Short and Gideon Keith-Stanley

### Background
The PC Games 2020 dataset contains the results of scraping and sorting the entire catalog of Valve's "Steam" video game store, and includes data on over 27,000 titles. These data include title, description, genre, price points, several success metrics, and more. We hypothesize that using the "bag of words" method as seen in email spam filters, we can train a machine learning model using the digested description of video games, and use that information to classify games by genre.

In [1]:
import numpy as np
import pandas as pd

### Data Loader
This routine downloads the dataset from OpenML.org and processes it with liac-arff. 

In [15]:
import arff

from urllib.request import urlretrieve

def load_game_data():
    url = 'https://api.openml.org/data/v1/download/22102514/PC-Games-2020.arff'
    filename = 'pc_game_dataset.arff'
    file, http_response = urlretrieve(url, filename)
    dataset = arff.load(open(file, 'r'))
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    return data, attributes

# Use this to save bandwidth and time if the project has the data file in the /data folder
def load_game_data_from_file():
    file = 'pc_game_dataset.arff'
    dataset = arff.load(open(file, 'r'))
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    return data, attributes
    

In [3]:
data, attributes = load_game_data()

## Preprocessing
This code digests the dataset into the form we need and prepares it for use by the model.

In [4]:
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [5]:
import string

def process_string(subject):
    term = subject.strip()
    term = str.lower(term)
    term = term.translate(str.maketrans("","", string.punctuation))
    return term

In [6]:
results = set([])
genres = data[:,6]
for entry in genres:
    terms = str(entry).split(',')
    for term in terms:
        results.add(process_string(term))
y_headers = list(results)

y = []
for entry in data:
    y_row = [0] * len(y_headers)
    for genre in str(entry[6]).split(','):
        y_row[y_headers.index(process_string(genre))] = 1
    y.append(y_row)
    
y = np.array(y)

print("y is now our label vector")

In [7]:
bag = {}
descriptions = data[:,25]

for entry in descriptions:
    terms = str(entry).split()
    for term in terms:
        term = process_string(term)
        if term not in stopwords:  
            if term in bag:
                bag[term] = bag[term] + 1
            else:
                bag[term] = 1

In [8]:
sorted_bag = sorted(bag, reverse=True, key=bag.get)
x_headers = sorted_bag[:20000]

This next cell takes almost ten minutes to run

In [9]:
X = []
for entry in data:
    x_row = [0] * len(x_headers)
    for word in str(entry[25]).split():
        word = process_string(word)
        if word in x_headers:
            x_row[x_headers.index(word)] += 1
    X.append(x_row)

X = np.array(X)
print("X is now our feature vector")

X is now our feature vector


In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import pandas as pd

A, b = load_game_data_from_file()

In [66]:
labels = [str(n).split(',') for n in A[:,6]]
text = [str(n).split() for n in A[:,25]]

X_train, X_test, y_train, y_test = train_test_split(text, labels, random_state=0, test_size=0.3)

In [67]:
labels[0]

['Action', ' Free to Play']

In [65]:
cv = CountVectorizer(lowercase=True, stop_words='english')
X_train_counts = cv.fit_transform(X_train)
tf = TfidfTransformer().fit(X_train_counts)
X_train_transformed = tf.transform(X_train_counts)

X_test_counts = cv.transform(X_test)
X_test_transformed = tf.transform(X_test_counts)

le = LabelEncoder()
y_train_labels_fit = le.fit(y_train)
y_train_lables_trf = le.transform(y_train)

print(le.classes_)

AttributeError: 'list' object has no attribute 'lower'

In [58]:
genre_docs = [str(n) for n in A[:,6]]
cv = CountVectorizer(lowercase=True, stop_words='english')
cv_result_a = cv.fit_transform(genre_docs)
y = cv_result_a.toarray()

In [59]:
docs = [str(n) for n in A[:,25]]
cv = CountVectorizer(lowercase=True, stop_words='english')
cv_result = cv.fit_transform(docs)
X = cv_result

## Multi-Class LinearSVC


In [60]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, t_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

svm = LinearSVC(multi_class='ovr')

In [61]:
svm.fit(x_train, y_train)

ValueError: y should be a 1d array, got an array of shape (21175, 26) instead.

In [None]:
y_pred = svm.predict(x_test)

print(np.mean(y_test == y_pred)