In [1]:
import gzip
from collections import defaultdict
import numpy as np
import pandas as pd
import string
import sklearn


li = []
def readJSON(path):
    '''
    This function reads in a filepath to convert JSON into the separate user, game, and data
    
    Params:
        path (string) - filepath that stores JSON data
    '''
    for l in gzip.open(path, 'rt'):
        d = eval(l)
        u = d['userID']
        try:
            g = d['gameID']
        except Exception as e:
            g = None
        yield u,g,d

In [2]:
gameCount = defaultdict(int)
totalPlayed = 0

for user,game,_ in readJSON("train.json.gz"):
    gameCount[game] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

In [3]:
# Task 1:

c = 0
X_train = []
X_val = []
userGames = defaultdict(set)
gameUsers = defaultdict(set)
users = set()
games = set()
for user, game, d in readJSON('train.json.gz'):
    userGames[user].add(game)
    gameUsers[game].add(user)
    users.add(user)
    games.add(game)
    if c < 165000: 
        X_train.append(d)
    else:
        X_val.append(d)
    c+=1
    
X_train = pd.DataFrame(X_train)[['userID', 'gameID']]
X_val = pd.DataFrame(X_val)[['userID', 'gameID']]
X_val['y'] = np.array([1]*10000)

In [4]:
def user_didnt_play(uID):
    '''
    This function takes a userID and returns all games that the user didn't play
    
    Params:
        uID (string) - userID of the data
    '''
    g = list(userGames[uID])
    g_prime = games.copy()
    for i in g:
        g_prime.remove(i)
    return np.random.choice(list(g_prime))

negative = []
for i in list(X_val['userID']):
    negative.append({'userID': i, 'gameID': user_didnt_play(i), 'y': 0})

In [5]:
neg = pd.DataFrame(negative)
X_val = X_val.append(neg).reset_index(drop=True)

In [6]:
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2:
        break
        
predictions = []
for g in X_val['gameID']:
    if g in return1:
        predictions.append(1)
    else:
        predictions.append(0)
        
{'Accuracy - T1': np.mean(np.array(predictions) == X_val['y'])}

{'Accuracy - T1': 0.68245}

In [7]:
# Task 2:
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5:
        break
        
predictions = []
for g in X_val['gameID']:
    if g in return1:
        predictions.append(1)
    else:
        predictions.append(0)
        
{'Accuracy - T2': np.mean(np.array(predictions) == X_val['y'])}

{'Accuracy - T2': 0.703}

In [8]:
# Task 3:
def Jaccard(s1, s2):
    '''
    This function takes two sets and computes the Jaccard Similarity between them
    
    Params:
        s1 (set) - first set to compare
        s2 (set) - second set to compare
    '''
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [9]:
parameter = {}
for thresh in [0.019, 0.020, 0.021, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029]:
    preds = []
    for i in range(20000):
        jaccard_values = []
        user, game = X_val.iloc[i, 0], X_val.iloc[i, 1]
        gUsers = gameUsers[game]
        uGames = userGames[user]
        for x in uGames:
            if x == game:
                continue
            jaccard_values.append(Jaccard(gUsers, gameUsers[x]))
        if max(jaccard_values) > thresh:
            preds.append(1)
        else:
            preds.append(0)
    fpr = np.sum((np.array(preds)==1) & (X_val['y']==0)) / np.sum(X_val['y']==0)
    fnr = np.sum((np.array(preds)==0) & (X_val['y']==1)) / np.sum(X_val['y']==1)
    parameter[thresh] = {'Accuracy': np.mean(np.array(preds)==X_val['y']), \
                         'FNR': fnr, 'FPR': fpr, 'BER': 0.5 * (fpr + fnr)}
pd.DataFrame(parameter)

Unnamed: 0,0.019,0.020,0.021,0.023,0.024,0.025,0.026,0.027,0.028,0.029
Accuracy,0.6267,0.6436,0.6569,0.6895,0.7064,0.7202,0.7303,0.73985,0.75185,0.75855
FNR,0.0034,0.007,0.0108,0.0216,0.0284,0.0402,0.0505,0.062,0.0774,0.0974
FPR,0.7432,0.7058,0.6754,0.5994,0.5588,0.5194,0.4889,0.4583,0.4189,0.3855
BER,0.3733,0.3564,0.3431,0.3105,0.2936,0.2798,0.2697,0.26015,0.24815,0.24145


In [10]:
# Task 4:
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/2:
        break
        
        
preds = []
for i in range(20000):
    jaccard_values = []
    user, game = X_val.iloc[i, 0], X_val.iloc[i, 1]
    gUsers = gameUsers[game]
    uGames = userGames[user]
    for x in uGames:
        if x == game:
            continue
        jaccard_values.append(Jaccard(gUsers, gameUsers[x]))
    if (max(jaccard_values) > 0.024) & (game in return1):
        preds.append(1)
    else:
        preds.append(0)

In [184]:
np.mean(np.array(preds) == X_val['y'])

0.6873

In [185]:
# Task 5:
predictions = open("predictions_Played.txt", 'w')
for l in open("pairs_Played.txt"):
    if l.startswith("userID"):
    #header
        predictions.write(l)
        continue
    u,g = l.strip().split('-')
    gUsers = gameUsers[g]
    uGames = userGames[u]
    jaccard_values = []
    for x in uGames:
        if x == game:
            continue
        else:
            jaccard_values.append(Jaccard(gUsers, gameUsers[x]))
    if len(jaccard_values) == 0:
        predictions.write(u + '-' + g + ",0\n")
    elif (max(jaccard_values) > 0.024) & (g in return1):
        predictions.write(u + '-' + g + ",1\n")
    else:
        predictions.write(u + '-' + g + ",0\n")
        
predictions.close()

In [76]:
# Task 6:
c = 0
X_train = []
X_val = []
userGames = defaultdict(set)
gameUsers = defaultdict(set)
users = set()
games = set()
for user, game, d in readJSON('train_Category.json.gz'):
    userGames[user].add(game)
    gameUsers[game].add(user)
    users.add(user)
    games.add(game)
    if c < 165000: 
        X_train.append(d)
    else:
        X_val.append(d)
    c+=1
    
X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)

In [77]:
from collections import OrderedDict
def clean(s):
    '''
    This function takes a review and cleans the text by removing capitalization and punctuation
    
    Params:
        s (string) - review text
    '''
    punctuation = set(string.punctuation)
    for i in punctuation:
        s = s.replace(i, '')
    return s.lower()

X_train['text'] = X_train['text'].apply(clean)

wordCount = defaultdict(int)
for i in X_train['text']:
    for w in i.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
counts[:10]

[(544597, 'the'),
 (317620, 'and'),
 (305414, 'a'),
 (291882, 'to'),
 (245359, 'game'),
 (227234, 'of'),
 (208417, 'is'),
 (200633, 'you'),
 (195953, 'i'),
 (190966, 'it')]

In [78]:
# Task 7:

word = [w for c, w in counts]
words = {}
for i in range(1000):
    words[word[i]] = i

In [79]:
def feature(data, wo):
    '''
    This function creates a feature matrix out of our training data.
    
    Params:
        data (pandas.DataFrame) - our dataset
        wo (list) - list of top N words occuring in the corpus of reviews
    Return:
        Returns a feature matrix of the data passed in
    '''
    encoding = []
    for i in data:
        feat = [0]*len(wo)
        for w in i.split():
            if w in wo.keys():
                feat[wo[w]] += 1
            else:
                continue
        encoding.append(feat)
    return encoding

In [80]:
model = sklearn.linear_model.LogisticRegression(C=1, max_iter=10000, fit_intercept=True)
model.fit(feature(X_train['text'], words), np.array(X_train['genre']))

LogisticRegression(C=1, max_iter=10000)

In [81]:
np.mean(np.array(model.predict(feature(X_val['text'], words))) == X_val['genre'])

0.6372

In [82]:
# Task 8
word = [w for c,w in counts]
words = {}
for i in range(1500):
    words[word[i]] = i

In [83]:
model8 = sklearn.linear_model.LogisticRegression(C=10, max_iter=10000, fit_intercept=True)
model8.fit(feature(X_train['text'], words), np.array(X_train['genre']))

LogisticRegression(C=10, max_iter=10000)

In [84]:
np.mean(np.array(model8.predict(feature(X_val['text'], words))) == X_val['genre'])

0.649

In [191]:
predictions = open("predictions_Category.txt", 'w')
categories = {'Action': 1, 'Strategy': 2, 'RPG': 3, 'Adventure': 4, 'Sports': 5}
predictions.write('userID-reviewID,prediction\n')
test_data = []
users_and_reviewID = []
for user,game,d in readJSON('test_Category.json.gz'):
    test_data.append(d['text'])
    users_and_reviewID.append((user, d['reviewID']))
    
p = model8.predict(feature(test_data, words))

for i in range(len(p)):
    predictions.write(users_and_reviewID[i][0] + '-' + users_and_reviewID[i][1] + ',' + str(categories[p[i]]) + "\n")
        
predictions.close()