In [None]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=4a6f5d895b94f84006a936291711e4a2ebebacc89ee0597002b80f7f54b7e71d
  Stored in directory: /tmp/pip-ephem-wheel-cache-djq_u480/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [1]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import en_core_web_lg

In [2]:
nlp = spacy.load('en_core_web_lg')

In [24]:
#Read in the scrape results
rgames = pd.read_csv('rps5_comments.csv')#,sep = '\t')
rgames[:5]

Unnamed: 0,submission,author,body,score,created,url_of_post
0,i1rhdh,Semifreak,**Full list of 120fps Xbox Series X games**\n\...,1227,1596288000.0,/r/PS5/comments/i1rhdh/all_ps5_and_xbox_series...
1,i1rhdh,Task876,Don't most 4k TVs only run at 60hz when at 4k?,788,1596288000.0,/r/PS5/comments/i1rhdh/all_ps5_and_xbox_series...
2,i1rhdh,Tstinzy,When a developer says the game “supports” 120f...,163,1596291000.0,/r/PS5/comments/i1rhdh/all_ps5_and_xbox_series...
3,i1rhdh,DerpHog,"The way tech news keeps hearing ""supports up t...",72,1596296000.0,/r/PS5/comments/i1rhdh/all_ps5_and_xbox_series...
4,i1rhdh,PvtCMiller,This article seems like clickbait just to prom...,56,1596287000.0,/r/PS5/comments/i1rhdh/all_ps5_and_xbox_series...


###Attribute frequency to determine top five

In [15]:
import nltk
import re
nltk.download('stopwords')
#We need to filter out stopwords
from nltk.corpus import stopwords
#Use nltk's stopwords list
stopwords_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
#clean_comment accepts a string. It then cleans the comment by removing punctuation and stopwords.
#It returns a list of words in that string.
def clean_comment(comment):
    #For cleaning up the test comment. Remove punctuation and make everything lower case.
    comment = re.sub(r'[^\w\s]','',comment.lower())
    
    #Syntax translation:
    #word can be anything. I choose word since it's what we're essentially doing
    #comment.split() converts the comment into a list. 
    #Then see if the word in that list is in the drop_stop or not. 
    #Drops any word that is in the stopwords list.
    stopwords_list = stopwords.words('english')
    #Store this list and return it.
    cleaned = [word for word in comment.split() if word not in stopwords_list]
    return cleaned

In [25]:
#Use function to create a new cleaned comment column.
rgames['cleaned comment'] = rgames['body'].astype(str).apply(clean_comment)

In [18]:
#Accepts a list of words as a parameter.
#Counts word occurrences and stores it as a dictionary which is returned.
def create_word_count(word_list):
    #Create dictionary that acts as the values for the main frame.
    word_frequency={}

    for word in word_list: #Take each word in the words list.
        if word in word_frequency: #If I have seen this word, update number of times seen by 1
            word_frequency[word] +=1
        else: #I have not seen this word yet. Put it in as a key value.
            word_frequency[word]  = 1
    return word_frequency

In [26]:
#Create a new column that contains the word frequency dictionaries
rgames['count dictionary'] = rgames['cleaned comment'].map(create_word_count)

In [27]:
#Create a list of attributes that we think belong 
attribute_list = ['performance','graphics','affordable','favorite','buy','speed',
                 'price','security','fps','games','multiplayer','online','subscription',
                  'quality']
#Feel free to append or add more to the list
#attribute_list.append('Phrase here')

In [28]:
attribute_df = pd.DataFrame(index = attribute_list)

In [29]:
attribute_df['frequency'] = 0

In [30]:
#Count the number of times a brand has been mentioned in a message/comment throughout the entire df
for dictionary in rgames['count dictionary']: #Take each dictionary
    for key,value in dictionary.items(): #For each key (word) and value (frequency)
        if key in attribute_list: #The key matches a word in the attribute list.
            #Add 1 to the value of the word it matched in the dataframe.
            attribute_df.loc[key] += 1

In [32]:
attribute_df.sort_values(by='frequency',ascending=False)[:5]

Unnamed: 0,frequency
games,421
buy,168
price,129
graphics,42
quality,41


###General Word frequency analysis

In [23]:
#Import everything.
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from sklearn import manifold
from matplotlib.font_manager import FontProperties
import re

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords # Import the stop word list
from nltk.tokenize import sent_tokenize, word_tokenize

from collections import Counter
from itertools import chain

In [33]:
#create a new data frame for beers. Clean up the reviews even more for word tokenization.
rgames2 = rgames[['author','body']].copy()

In [34]:
#Replace punctuation with blanks. 
rgames2['new_cleaned_review'] = rgames2['body'].str.replace(r'[^\w\s]+', '')

In [35]:
#Convert to lower case
rgames2['new_cleaned_review'] = rgames2['new_cleaned_review'].str.lower()

In [36]:
stopwords_list = stopwords.words('english')

#Function to get the word tokens
def get_tokens(entry):
    #Use nltk to get word tokens.
    tokens = word_tokenize(entry)
    
    #Tokenize the word here. Returns the word if the word is a string. .isalpha() is a method that does this.
    token_words = [w for w in tokens if w.isalpha()]
    
    return token_words

#Function to remove the stopwords
def remove_stop_words(entry):
    
    no_stopwords = [w for w in entry if not w in stopwords_list]
    return (no_stopwords)


#Function to delist the entries and join the words together in a string.
def rejoin_words(entry):
    
    joined_words = ( " ".join(entry))
    return joined_words



In [39]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
#Apply functions
#Word tokenization first
rgames2['words'] = rgames2['new_cleaned_review'].apply(get_tokens)

#Remove stopwords next
rgames2['words'] = rgames2['words'].apply(remove_stop_words)

#Rejoin list to a sentence.
rgames2['processed_review'] = rgames2['words'].apply(rejoin_words)

#Iterate through the sentences.

#Use the Counter function from the collections module to count words.
#Use chain from itertools module to iterate.
tally = Counter(chain.from_iterable(map(str.split, rgames2['processed_review'].tolist()))) 

#Create a series for the word counts.
series = pd.Series(tally).sort_values(ascending=False)

#Create a term frequency dataframe for the words and their frequency
term_freq = series.reset_index()
term_freq.columns = ['words','freq']


#Top ten most mentioned words
term_freq.head(10)

Unnamed: 0,words,freq
0,like,697
1,games,631
2,game,582
3,one,406
4,sony,403
5,people,400
6,im,386
7,get,378
8,dont,375
9,would,351


Nothing useful

#Sentiment Analysis

###cosine similarity

In [45]:
#Use function to create a new cleaned body column.
rgames['cleaned body'] = rgames['body'].astype(str).apply(clean_comment)

In [43]:
attributes = ['games','buy','price', 'quality','graphics']

In [46]:
attributes_nlp = nlp('games, buy, price, quality, graphics')
def get_similarity(review_words):
    text_review = ' '.join(review_words)
    review_nlp = nlp(text_review)
    return review_nlp.similarity(attributes_nlp)

rgames['similarity'] =  rgames['cleaned body'].apply(get_similarity)

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [49]:
similarity = rgames.groupby('author')['similarity'].mean().reset_index().sort_values(by='similarity',ascending = False)
similarity['similarity'].mean()

0.5258203457489231

###Sentiment analysis using vader

In [50]:
pip install VADER

Collecting VADER
[?25l  Downloading https://files.pythonhosted.org/packages/be/0d/df60a0ae9ffb63c409849d2909883963855d10e2ee9a5a71c97be41da300/vader-0.0.2-py3-none-any.whl (45kB)
[K     |███████▏                        | 10kB 17.1MB/s eta 0:00:01[K     |██████████████▎                 | 20kB 1.8MB/s eta 0:00:01[K     |█████████████████████▍          | 30kB 2.3MB/s eta 0:00:01[K     |████████████████████████████▌   | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 1.6MB/s 
Collecting sonopy
  Downloading https://files.pythonhosted.org/packages/2b/4d/862855fb391bc30351f90d6c50ea913df9d18b0ae3b6b8ef3c7aa3ac976f/sonopy-0.1.2.tar.gz
Building wheels for collected packages: sonopy
  Building wheel for sonopy (setup.py) ... [?25l[?25hdone
  Created wheel for sonopy: filename=sonopy-0.1.2-cp36-none-any.whl size=2881 sha256=52c5742f703e47439b6d1903230473d83c5ddc89fe0918d8d554b46dbcd438d3
  Stored in directory: /root/.cache/pip/wheels/b6/39/ba/b2f21d4fbcb362658

In [51]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [52]:
from nltk.tokenize import sent_tokenize


In [53]:
#Create dataframe with original reviews
sent_games = rgames[['author','body']].copy()
#Add in columns to store sentiment for the attributes
sent_games['games_sentiment'] = 0
sent_games['buy_sentiment'] = 0
sent_games['price_sentiment'] = 0
sent_games['quality_sentiment'] = 0
sent_games['graphics_sentiment'] = 0


In [54]:
nltk.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [55]:
#Iterrate through the dataframe
for i in sent_games.index:
    #Get a review and split it into sentence tokens
    review = sent_games.loc[i,'body']
    review = sent_tokenize(review)
    
    #Store score values here
    #Resets every time loop runs. This is intentional.
    comp_games = []
    comp_buy = []
    comp_price = []
    comp_quality = []
    comp_graphics = []
    
    #Iterate through the sentence tokens. Get compound sentiment score if the word is present.
    for sent_token in review:
        if 'games' in sent_token.lower():
            games_score = vader.polarity_scores(sent_token).get('compound')
            comp_games.append(games_score)
            
        if 'buy' in sent_token.lower():
            buy_score = vader.polarity_scores(sent_token).get('compound')
            comp_buy.append(buy_score)
            
        if 'price' in sent_token.lower():
            price_score = vader.polarity_scores(sent_token).get('compound')
            comp_price.append(price_score)
        if 'quality' in sent_token.lower():
            quality_score = vader.polarity_scores(sent_token).get('compound')
            comp_quality.append(quality_score)
        if 'graphics' in sent_token.lower():
            graphics_score = vader.polarity_scores(sent_token).get('compound')
            comp_graphics.append(graphics_score)
    
    #Store each sentiment score in the array
    if len(comp_games) == 0: #Nothing is in the list. No sentiment for the review.
        sent_games.loc[i,'games_sentiment'] = None
    else: #Balanced is in the review. Take the average of the list and store it.
        #Taking average means to convert it into a numpy array and then using the np.mean
        sent_games.loc[i,'games_sentiment'] = np.mean(np.array(comp_games))
    
    #Repeat conditions for attributes.
    if len(comp_buy) == 0:
        sent_games.loc[i,'buy_sentiment'] = None
    else:
        sent_games.loc[i,'buy_sentiment'] = np.mean(np.array(comp_buy))
    if len(comp_price) == 0:
        sent_games.loc[i,'price_sentiment'] = None
    else:
        sent_games.loc[i,'price_sentiment'] = np.mean(np.array(comp_price))
    if len(comp_quality) == 0:
        sent_games.loc[i,'quality_sentiment'] = None
    else:
        sent_games.loc[i,'quality_sentiment'] = np.mean(np.array(comp_quality))
    if len(comp_graphics) == 0:
        sent_games.loc[i,'graphics_sentiment'] = None
    else:
        sent_games.loc[i,'graphics_sentiment'] = np.mean(np.array(comp_graphics))

In [56]:
#Group all the sentiments of the review together
avg_sent_games = sent_games.groupby(by=["author"])[["games_sentiment", "buy_sentiment", "price_sentiment","quality_sentiment","graphics_sentiment"]].mean()

In [57]:
#Sentiments that are not present are treated as Zero. 
#Want to pinpoint comments based on attributes.
#Mean is simply adding five columns together and dividng by 5
avg_sent_games['average_sentiment'] = (avg_sent_games['games_sentiment'].fillna(0)+avg_sent_games['buy_sentiment'].fillna(0)
 +avg_sent_games['price_sentiment'].fillna(0)+avg_sent_games['quality_sentiment'].fillna(0)+avg_sent_games['graphics_sentiment'].fillna(0))/5

In [58]:
avg_sent_games.sort_values(by='average_sentiment', ascending = False).fillna(0)

Unnamed: 0_level_0,games_sentiment,buy_sentiment,price_sentiment,quality_sentiment,graphics_sentiment,average_sentiment
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fyrael,0.929300,0.92930,0.92930,0.0000,0.0000,0.557580
diddaykong,0.531825,0.85775,0.00000,0.0000,0.8402,0.445955
LocusAintBad,0.128900,0.39650,0.58225,0.9341,0.0000,0.408350
DANK_BLUMPKIN,0.895700,0.00000,0.00000,0.8957,0.0000,0.358280
a_to_the_g79,0.851900,0.00000,0.00000,0.8519,0.0000,0.340760
...,...,...,...,...,...,...
JackStillAlive,-0.520100,0.00000,-0.61870,0.0000,0.0000,-0.227760
Voyager5555,-0.665400,-0.61240,0.00000,0.0000,0.0000,-0.255560
Xerosnake90,-0.670500,-0.67050,0.00000,0.0000,0.0000,-0.268200
RandyChimp,-0.680800,-0.68080,0.00000,0.0000,0.0000,-0.272320


In [59]:
avg_sent_games = avg_sent_games.apply('mean')
avg_sent_games

games_sentiment       0.153131
buy_sentiment         0.136190
price_sentiment       0.114231
quality_sentiment     0.202311
graphics_sentiment    0.306042
average_sentiment     0.009128
dtype: float64