## Data Capture

In [None]:
import numpy as np
import pandas as pd
import arff
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

from urllib.request import urlretrieve

def load_game_data():
    url = 'https://api.openml.org/data/v1/download/22102514/PC-Games-2020.arff'
    filename = 'pc_game_dataset.arff'
    file, http_response = urlretrieve(url, filename)
    dataset = arff.load(open(file, 'r'))
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    data = clean_data(data)
    return data, attributes

# Use this to save bandwidth and time if the project has the data file already downloaded
def load_game_data_from_file():
    file = 'pc_game_dataset.arff'
    dataset = arff.load(open(file, 'r'))
    attributes = np.array(dataset['attributes'])
    data = np.array(dataset['data'])
    data = clean_data(data)
    return data, attributes

def clean_data(data):
    result = []
    for element in data:
        if element[6] != "" and element[6] is not None and element[25] != "" and element[25] is not None:
            result.append(element)
    return np.array(result)

A, b = load_game_data_from_file()

## Preprocessing

In [42]:
import string

def process_string(subject):
    term = subject.strip()
    term = str.lower(term)
    term = term.translate(str.maketrans("","", string.punctuation))
    return term

results = set([])
genres = A[:,6]
for entry in genres:
    terms = str(entry).split(',')
    for term in terms:
        results.add(process_string(term))
y_labels = list(results)

y = []
for entry in A:
    y_row = [0] * len(y_labels)
    for genre in str(entry[6]).split(','):
        y_row[y_labels.index(process_string(genre))] = 1
    y.append(y_row)
    
    
y = np.array(y)

docs = [str(n) for n in A[:,25]]
tv = TfidfVectorizer(smooth_idf=True, sublinear_tf=True, max_df=0.5, min_df=5, lowercase=True, stop_words='english')
tv_result = tv.fit_transform(docs)
X = tv_result

print(X.shape)
print(y.shape)

(27094, 22979)
(27094, 20)


## Analysis

### LinearSVC C=0.9

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

#LinearSVC
results = []
for state in range(10):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = state)
    classifier = OneVsRestClassifier(LinearSVC(C=0.9, loss='hinge', max_iter=50000))
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    results.append(np.mean(y_pred == y_test))
print(np.mean(results))



0.9232402509533768


By Genre (~45sec)

In [None]:
genres = []
accuracy = []

for z in range(len(y_labels)):
    tmp = y[:,z]
    results = []
    for state in range(10):
        x_train, x_test, y_train, y_test = train_test_split(X, tmp, test_size = 0.3, random_state = state)
        classifier = OneVsRestClassifier(LinearSVC(C=0.9, loss='hinge', max_iter=50000))
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        results.append(np.mean(y_pred == y_test))
    genres.append(y_labels[z])
    accuracy.append(np.mean(results))



['indie', 'design  illustration', 'action', 'software training', 'early access', 'gore', 'audio production', 'simulation', 'movie', 'racing', 'strategy', 'massively multiplayer', 'violent', 'free to play', 'sports', 'adventure', 'rpg', 'utilities', 'casual', 'education']
[np.float64(0.7955345060893099), np.float64(0.9999507934555296), np.float64(0.7826054865297084), np.float64(0.9999876983638825), np.float64(0.9116004428589001), np.float64(0.9999876983638825), np.float64(0.9999753967277648), np.float64(0.8637347767253045), np.float64(0.9999384918194121), np.float64(0.9788042809693689), np.float64(0.8655062123262394), np.float64(0.9737360068889164), np.float64(0.9999876983638825), np.float64(0.9354164103825809), np.float64(0.9696395620617542), np.float64(0.7542256120063968), np.float64(0.8906753598228565), np.float64(0.9999753967277648), np.float64(0.7435354902201993), np.float64(0.9999876983638825)]


In [None]:
df = pd.DataFrame(accuracy, genres)
df

### LinearSVC - Bagging

In [111]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC

#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = state)
#lsvc = LinearSVC(C=0.9, loss='hinge', max_iter=50000)
#bg = BaggingClassifier(estimator=lsvc, n_estimators=100)
genres = []
accuracy = []
results = []
    

tmp = y[:,8]
results = []
x_train, x_test, y_train, y_test = train_test_split(X, tmp, test_size = 0.1, random_state = 6)
lsvc = OneVsRestClassifier(LinearSVC(C=.09, loss='hinge', max_iter=50000))
bg = BaggingClassifier(estimator=lsvc, n_estimators=100)    
lsvc.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(y_labels[8])
print(np.mean(y_pred == y_test))


#genres.append(y_labels[z])
#accuracy.append(np.mean(results))
#param_grid = {
#    'max_samples' : [0.05, 0.1, 0.2, 0.5]
#}
#
#clf = GridSearchCV(estimator=bg, param_grid=param_grid)
#clf.fit(x_train, y_train)

simulation
0.6162361623616236


In [87]:
df = pd.DataFrame(accuracy, genres)
df

Unnamed: 0,0
utilities,0.681439
rpg,0.563432
free to play,0.654945
violent,0.681402
design illustration,0.681365
action,0.442435
massively multiplayer,0.662177
audio production,0.681439
simulation,0.62428
early access,0.629779


### RandomForest n_estimators = 150 (30min~ to run)

In [None]:
from sklearn.ensemble import RandomForestClassifier

genres = []
accuracy = []
for z in range(len(y_labels)):
    tmp = y[:,z]
    results = []
    for state in range(10):
        x_train, x_test, y_train, y_test = train_test_split(X, tmp, test_size = 0.3, random_state = state)
        classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        classifier.fit(x_train,y_train)
        y_pred = classifier.predict(x_test)
        results.append(np.mean(y_pred == y_test))
    genres.append(y_labels[z])
    accuracy.append(np.mean(results))

In [54]:
df = pd.DataFrame(accuracy, genres)
df

Unnamed: 0,0
utilities,0.999951
rpg,0.859565
free to play,0.942022
violent,0.999988
design illustration,0.999951
action,0.7837
massively multiplayer,0.970427
audio production,0.999951
simulation,0.841838
early access,0.909718


RandomForest Hyperparamater tuning (over 3 hours runtime)

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
param_grid = { 
    'n_estimators': [25, 50, 100, 150], 
    'max_features': ['sqrt', 'log2', None], 
    'max_depth': [3, 6, 9, None], 
    'max_leaf_nodes': [3, 6, 9, None], 
}
grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), 
                           param_grid=param_grid) 
grid_search.fit(x_train, y_train) 
print(grid_search.best_estimator_) 

RandomForestClassifier(max_features=None, n_estimators=150, n_jobs=-1)
