# Import Modules

In [1]:
import requests
import os
import json
import gzip
import pandas as pd
import numpy as np
from urllib.request import urlopen

# Amazon Video Game Data Set (URL)

In [2]:
video_game_data = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz"

video_game_data_request = requests.get(video_game_data)

video_game_data_request

<Response [200]>

# Load into Pandas DataFrame 

In [3]:
video_game_data = pd.read_json(
    "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz",
    compression = 'gzip',
    lines=True
)

# Examine Data

In [7]:
video_game_data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [40]:
video_game_data.overall.value_counts()

5    299759
4     93654
3     49146
1     30883
2     24135
Name: overall, dtype: int64

In [39]:
video_game_data.isna().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
reviewerName          76
reviewText           158
summary              109
unixReviewTime         0
vote              389784
style             208340
image             493943
dtype: int64

In [55]:
video_game_data[video_game_data.reviewText.isna()==True].overall.value_counts()

Series([], Name: overall, dtype: int64)

There are a very small number of reviews with no text, they make up a very small percentage of the overall reiews and don't look particularly biased, so they should be safe to drop

In [48]:
video_game_data.dropna(subset=['reviewText'], inplace=True)

Series([], Name: overall, dtype: int64)

In [56]:
video_game_data.isna().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
reviewerName          76
reviewText             0
summary              103
unixReviewTime         0
vote              389635
style             208296
image             493819
dtype: int64

Missing values in the other columns should not be as big of an issue, as they are not as integral to our analysis

# Set up NLP [IN PROGRESS]

In [45]:
import nltk
nltk.download('stopwords') 
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

[nltk_data] Downloading package stopwords to /Users/raja_/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/raja_/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Apply NLP Methods [BROKEN]

In [48]:
X_train = video_game_data[['reviewText']]

vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix
X_train = vectoriser.fit_transform(X_train['reviewText'])
# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
# Save mapping on which index refers to which words
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
# Rename each column using the mapping
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train

ValueError: np.nan is an invalid document, expected byte or unicode string.

Citation For data:
    
Justifying recommendations using distantly-labeled reviews and fined-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019