# Text Classification of Amazon Review Dataset on Video Games

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

import nltk
import string
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

### Load and investigate data

In [2]:
file = 'data/amazon_games.json'
df = pd.read_json(file, lines=True)

In [3]:
df.shape

(497577, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   overall         497577 non-null  int64 
 1   verified        497577 non-null  bool  
 2   reviewTime      497577 non-null  object
 3   reviewerID      497577 non-null  object
 4   asin            497577 non-null  object
 5   reviewerName    497501 non-null  object
 6   reviewText      497419 non-null  object
 7   summary         497468 non-null  object
 8   unixReviewTime  497577 non-null  int64 
 9   vote            107793 non-null  object
 10  style           289237 non-null  object
 11  image           3634 non-null    object
dtypes: bool(1), int64(2), object(9)
memory usage: 42.2+ MB


In [5]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [6]:
df['asin'].nunique()

17408

In [7]:
for i in list(df.columns[1:6]):
    df = df.drop(i, axis=1)

for i in list(df.columns[3:7]):
    df = df.drop(i, axis=1)

In [8]:
df = df.loc[~df.isna().any(axis=1)]
df.shape

(497316, 3)

In [9]:
df.groupby('overall').count()

Unnamed: 0_level_0,reviewText,summary
overall,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30872,30872
2,24129,24129
3,49138,49138
4,93636,93636
5,299541,299541


__Observation__: Dataset is imbalanced. Try random undersampling and SMOTE.

### Try text classification using undersampled data

1: Do random undersampling on the imbalanced dataset

In [10]:
df_us= [] 

for i in range(1,6):
    df_us.append(df[df['overall']==i].sample(24129))
    
df_us = pd.concat(df_us)

In [11]:
df_us.shape

(120645, 3)

In [12]:
df_us.head()

Unnamed: 0,overall,reviewText,summary
201993,1,I got this microphone for Disney's Sing It. I ...,This is not compatible with Disney's Sing It!!...
345228,1,"I connected my Turtle Beach X12 headphones, to...",DISCONNECTED CONTROLLER
368655,1,The graphics are great and the tanks look good...,"Good game graphics, rigged match making"
397568,1,I bought the game under pressure from a few of...,Why did I do this....
421318,1,Got it out of package put it on and mic broke ...,There are better ones out there.


2: Pre-process data for text classification

In [13]:
lemmatizer  = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words('english'))
punct = string.punctuation

reviews_processed = []

for review in df_us['reviewText']:
    result = ""
    for sent in nltk.sent_tokenize(review):
        for word in nltk.word_tokenize(sent):
            word = word.lower()
            word = lemmatizer.lemmatize(word)
            if word not in stopwords and word not in punct:
                result = result + " " + word
    reviews_processed.append(result)

3: Classify text based on overall ratings

In [14]:
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(reviews_processed, df_us['overall'])

count_vectorizer_us = CountVectorizer().fit(X_train_us)
X_train_counts_us = count_vectorizer_us.transform(X_train_us)

tfidf_transformer_us = TfidfTransformer().fit(X_train_counts_us)
X_train_tfidf_us = tfidf_transformer_us.transform(X_train_counts_us)

clf_us = LinearSVC().fit(X_train_tfidf_us, y_train_us)

In [15]:
X_test_counts_us = count_vectorizer_us.transform(X_test_us)

X_test_tfidf_us = tfidf_transformer_us.transform(X_test_counts_us)

y_pred_us = clf_us.predict(X_test_tfidf_us)

In [18]:
print(confusion_matrix(y_test_us, y_pred_us))

[[3936 1181  425  231  235]
 [1761 2257 1139  582  337]
 [ 789 1229 2010 1337  693]
 [ 351  540 1116 2384 1638]
 [ 264  230  387 1174 3936]]


In [19]:
print(classification_report(y_test_us, y_pred_us))

              precision    recall  f1-score   support

           1       0.55      0.66      0.60      6008
           2       0.42      0.37      0.39      6076
           3       0.40      0.33      0.36      6058
           4       0.42      0.40      0.41      6029
           5       0.58      0.66      0.61      5991

    accuracy                           0.48     30162
   macro avg       0.47      0.48      0.47     30162
weighted avg       0.47      0.48      0.47     30162



In [20]:
print(accuracy_score(y_test_us, y_pred_us))

0.4814999005370997


### Try text classification using oversampled data

1: Do random oversampling on the imbalanced dataset

In [33]:
df_os= [] 

for i in range(1,6):
    df_os.append(df[df['overall']==i].sample(299541, replace=True))
    
df_os = pd.concat(df_os)

In [34]:
df_os.shape

(1497705, 3)

In [35]:
df_os.head()

Unnamed: 0,overall,reviewText,summary
73572,1,Do not buy this game until they release a patc...,DO NOT BUY THIS GAME - CRASHES CONSTANTLY
373995,1,Ok so I waited forever to get this for my son ...,UPDATE 9/30 RAZER SUX
31153,1,this is awful. Most over-hyped product of the ...,What were they thinking?
462993,1,Gave this game about 3 hours of my time. It wa...,Boring game.
181633,1,Just go rent god of war 3. This is one of the ...,God of war but worse


2: Pre-process data for text classification

In [36]:
reviews_processed_t = []

for review in df_os['reviewText']:
    result = ""
    for sent in nltk.sent_tokenize(review):
        for word in nltk.word_tokenize(sent):
            word = word.lower()
            word = lemmatizer.lemmatize(word)
            if word not in stopwords and word not in punct:
                result = result + " " + word
    reviews_processed_t.append(result)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(reviews_processed_t, df_os['overall'])

count_vectorizer = CountVectorizer().fit(X_train)
X_train_counts = count_vectorizer.transform(X_train)

tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

3: Classify text based on overall ratings

In [40]:
clf_os = LinearSVC().fit(X_train_tfidf, y_train)

In [41]:
X_test_counts_os = count_vectorizer.transform(X_test)

X_test_tfidf_os = tfidf_transformer.transform(X_test_counts_os)

y_pred_os = clf_os.predict(X_test_tfidf_os)

In [42]:
print(confusion_matrix(y_test, y_pred_os))

[[65551  4472  1919  1089  1723]
 [ 8250 57012  5134  2142  2253]
 [ 5507  7168 47628  8538  6295]
 [ 2631  3602  8476 43724 16538]
 [ 2527  2060  3734 10827 55627]]


In [43]:
print(classification_report(y_test, y_pred_os))

              precision    recall  f1-score   support

           1       0.78      0.88      0.82     74754
           2       0.77      0.76      0.76     74791
           3       0.71      0.63      0.67     75136
           4       0.66      0.58      0.62     74971
           5       0.67      0.74      0.71     74775

    accuracy                           0.72    374427
   macro avg       0.72      0.72      0.72    374427
weighted avg       0.72      0.72      0.72    374427



In [44]:
print(accuracy_score(y_test, y_pred_os))

0.7198786412304669
