# Final

In [4]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.3 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-8.0.1-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 3.6 MB/s eta 0:00:01
Collecting regex
  Downloading regex-2021.4.4-cp39-cp39-manylinux2014_x86_64.whl (730 kB)
[K     |████████████████████████████████| 730 kB 4.0 MB/s eta 0:00:01
[?25hInstalling collected packages: regex, click, nltk
Successfully installed click-8.0.1 nltk-3.6.2 regex-2021.4.4


In [5]:
# data tools
import os
import re
import json 
import warnings 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Silence library warnings from this point on, including any raised while the
# packages below are being imported.
warnings.filterwarnings('ignore')
from subprocess import check_output

# nltk and sklearn
import nltk
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# tensorflow and keras
# Every Keras symbol is imported through `tensorflow.keras`. Mixing the
# standalone `keras` package with `tensorflow.keras` in one session can load two
# incompatible Keras builds, which is a common trigger for
# "module 'keras.utils.generic_utils' has no attribute
# 'populate_dict_with_module_objects'".
# NOTE(review): if that error persists, the installed keras/tensorflow versions
# are themselves mismatched and need to be aligned in the environment.
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

AttributeError: module 'keras.utils.generic_utils' has no attribute 'populate_dict_with_module_objects'

In [None]:
# Ask TensorFlow which physical GPU devices it can see and report how many.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Get the dataset (Amazon Video Game Reviews)

In [None]:
# One-time dataset download, left disabled. Uncomment both lines to fetch the
# gzip-compressed TSV and unpack it to `games.tsv` in the working directory.
#!curl https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz -o games.tsv.gz
#!gzip -d games.tsv.gz

# Load into Pandas DataFrame 

There are a small number of lines in this file that don't parse properly

In [None]:
# Read the tab-separated reviews file, skipping malformed rows rather than
# aborting. `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement (skip the row and report it).
try:
    games = pd.read_csv('games.tsv', delimiter='\t', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines` and rejects it with TypeError,
    # so fall back to the keyword that release understands.
    games = pd.read_csv('games.tsv', delimiter='\t', error_bad_lines=False)

# Examine Data

In [None]:
# Show the first five rows of the freshly loaded frame.
games.head(n=5)

In [None]:
# Column dtypes as inferred by pandas while parsing the TSV.
games.dtypes

The data types all seem to be correct

In [None]:
# Summary statistics; with default arguments describe() covers the numeric columns.
games.describe()

In [None]:
# Number of reviews at each star rating.
games['star_rating'].value_counts()

Star ratings tend to be fairly high on average, with a mean of 4. Most reviews have no helpful_votes, but some reviews have a lot of helpful votes.

In [None]:
# Count of missing values in each column.
games.isna().sum(axis=0)

There are a very small number of NA values. They make up less than 0.01% of the data, so dropping those rows should not have a substantial impact.

In [None]:
# Remove every row that has at least one missing value; this modifies `games` in place.
games.dropna(inplace=True)

In [None]:
# Re-count missing values per column after the rows were dropped.
games.isna().sum()

In order to get everything the customer wrote for the review, we are combining the 'review_headline' and 'review_body'. There are, however, a large number of reviews where the review headline is just a restatement of the star rating. This information is already captured elsewhere, so those headlines are being ignored.

In [None]:
# Combine headline and body into one text field, but leave the headline out
# when it only restates the rating (e.g. "Five Stars", "1 star").
# The pattern is anchored to the whole headline, so titles that merely contain
# the substring "star" ("Star Wars is great", "Starts slow") are still kept.
rating_only_headline = games['review_headline'].str.contains(
    r'^\s*(?:one|two|three|four|five|[1-5])\s+stars?\s*[.!]*\s*$',
    case=False,
    regex=True,
)
games['review_full'] = np.where(
    rating_only_headline,
    games['review_body'],
    games['review_headline'] + ' ' + games['review_body'],
)

In [None]:
# Preview the first rows of the frame.
games.head()

We are interested in analyzing whether the review was positive or negative, so a new column is being created which classifies a review as positive if it is 4 stars or greater, negative if it is 2 stars or less, and neutral otherwise.

In [None]:
# Map the star rating to a three-way label: above 3 is Positive, below 3 is
# Negative, and everything else (i.e. exactly 3) is Neutral.
games['Sentiment_target'] = np.select(
    [games['star_rating'] > 3, games['star_rating'] < 3],
    ['Positive', 'Negative'],
    default='Neutral',
)

In [None]:
# Number of reviews in each sentiment class.
games['Sentiment_target'].value_counts()

# Preprocessing and visualization

Define a pre-processing function to lemmatize the text and remove stopwords (this takes some time to run)

In [None]:
import string
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the lemmatizer and the set of tokens to discard, once per kernel.

    Returns:
        tuple: ``(lemmatizer, excluded)`` where ``lemmatizer`` is an
        ``nltk.stem.WordNetLemmatizer`` and ``excluded`` is a frozenset holding
        the NLTK English stopwords, the extra tokens ``'br'`` and ``'wa'``, and
        every character of ``string.punctuation``.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # 'br' is presumably left over from "<br />" markup and 'wa' is presumably
    # what the lemmatizer produces for "was" - TODO confirm against the data.
    excluded = frozenset(
        nltk.corpus.stopwords.words('english')
        + ['br', 'wa']
        + list(string.punctuation)
    )
    return lemmatizer, excluded


def preprocess(text):
    """Tokenize, lemmatize and filter one review, returning the cleaned text.

    The text is split with ``nltk.tokenize.word_tokenize``, each token is
    lemmatized, and tokens found in the exclusion set (stopwords, ``'br'``,
    ``'wa'``, single punctuation characters) are dropped. Matching is
    case-sensitive because the text is not lower-cased first.

    The NLTK data used by the tokenizer, WordNet and the stopword corpus must
    already be available (see ``nltk.download``).

    Args:
        text (str): Raw review text.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    # The resources are cached so they are built once rather than per review
    # (or, for the exclusion list, per token).
    lemmatizer, excluded = _preprocess_resources()
    tokens = nltk.tokenize.word_tokenize(text)
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    return ' '.join(lemma for lemma in lemmas if lemma not in excluded)

In [None]:
# Clean every review with preprocess() and store the result in a new column,
# then show ten random rows as a spot check.
games['review_clean'] = games['review_full'].map(preprocess)
games.sample(n=10)

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from the cleaned text of every review.
full_text = ' '.join(games['review_clean'])
wordcloud = WordCloud().generate(full_text)

# Set the default figure size before the image is drawn.
plt.rc('figure', figsize=(20, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

This is a word cloud showing word frequency across all reviews. The most frequent words here definitely trend towards the positive side, with 'great' showing up frequently. This could be because over 50% of all of the reviews are 5-star reviews, and the vast majority of reviews are positive.

In [None]:
# Word cloud built from the cleaned text of the Positive reviews only.
is_positive = games['Sentiment_target'] == 'Positive'
full_text = ' '.join(games.loc[is_positive, 'review_clean'])
wordcloud = WordCloud().generate(full_text)

# Set the default figure size before the image is drawn.
plt.rc('figure', figsize=(20, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

This is a word cloud showing word frequency in positive reviews. It looks very similar to the word cloud showing all reviews, but words expressing positive sentiment like 'good', 'great', and 'love' are more common.

In [None]:
# Word cloud built from the cleaned text of the Negative reviews only.
is_negative = games['Sentiment_target'] == 'Negative'
full_text = ' '.join(games.loc[is_negative, 'review_clean'])
wordcloud = WordCloud().generate(full_text)

# Set the default figure size before the image is drawn.
plt.rc('figure', figsize=(20, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Sentiment_target	review_clean
games = games[['review_clean','Sentiment_target']]
games.columns = ['text', 'sentiment']

test = games[games.sentiment != 'Neutral']