In [361]:
import pandas as pd
#import numpy as np
from datetime import datetime
from textblob import TextBlob
import spacy

spacy.require_cpu()
nlp = spacy.load("en_core_web_sm")

# load data
df = pd.read_csv('data/processed_review_data.csv',parse_dates=['date'])

# Restrict to review >=10 words
df = df[df['review_length']>=10]

In [276]:
# clean review text
from functions import lower_case,expandContractions,alpha_num,consec_dup,lemma
import re
def clean(text):
    text = re.sub(r'[?!:]', '.', text) # all sentence ends with '.'
    text = re.sub('\d*\.\d+','', text) # remove all flots
    text = re.sub("[^a-zA-Z0-9. ]", '', text.lower()) # remove all not listed chars and make lowercase
    text = re.sub('\.\.+', '. ',text) #remove repeat fullstops
    text = re.sub(' +',' ', text) # remove extra whitespace
    text = TextBlob(text)
    text = text.correct() # Correct spellings
    return text

for func in [expandContractions,clean,consec_dup,lemma]:
    df.review_text = df.review_text.map(func)

In [373]:
df1 = df.iloc[0:2]
df1.review_text.map(clean)

3    (t, h, e,  , m, a, i, n,  , t, h, i, n, g,  , ...
4    (t, h, i, s,  , m, a, y,  , i, s,  , a,  , j, ...
Name: review_text, dtype: object

In [278]:
# split text into sentences and flatten
sentences = [x.split('.') for x in df.review_text]
sentences = [item for sublist in sentences for item in sublist]

In [281]:
# Extract aspects and descriptors
aspects = []
for sentence in sentences:
  doc = nlp(sentence)
  descriptive_term = ''
  target = ''
  for token in doc:
    if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
      target = token.text
    if token.pos_ == 'ADJ':
      prepend = ''
      for child in token.children:
        if child.pos_ != 'ADV':
          continue
        prepend += child.text + ' '
      descriptive_term = prepend + token.text
  aspects.append({'aspect': target,
    'description': descriptive_term})

# remove entries with blank aspect or descriptor
aspects = [x for x in aspects if x['aspect']!='' and x['description']!='']

# Add sentiment polarity scores
for aspect in aspects:
  aspect['sentiment'] = TextBlob(aspect['description']).sentiment.polarity

sent_df = pd.DataFrame(aspects)
sent_df



Unnamed: 0,aspect,description,sentiment
0,joke,old,0.100000
1,color,many,0.500000
2,crash,toxic,0.000000
3,content,alive,0.100000
4,player,near good,0.400000
...,...,...,...
11031,camogrinding,new,0.136364
11032,day,buggy,0.000000
11033,crash,entire,0.000000
11034,campaign,exciting,0.300000


In [290]:
sent_df.sort_values(by='sentiment',ascending = False).head(50)

Unnamed: 0,aspect,description,sentiment
8957,game,perfect,1.0
1816,part,perfect,1.0
7060,team,wonderful,1.0
2378,thing,perfect,1.0
2215,mode,awesome,1.0
4617,people,awesome,1.0
9832,idea,perfect,1.0
1991,fix,soon else perfect,1.0
463,card,perfect,1.0
4718,character,perfect,1.0


In [353]:
neutral = sent_df[sent_df['sentiment']==0]

neg = pd.read_csv("C:/Users/rob_t/OneDrive/Documents/Data Science/rMarkDown/SA_steam_reviews/data/negList.csv")
neg = list(neg['Negative'])
neg = list(neutral.loc[neutral['description'].isin(neg)].description+' '+neutral.loc[neutral['description'].isin(neg)].aspect)

pos = pd.read_csv("C:/Users/rob_t/OneDrive/Documents/Data Science/rMarkDown/SA_steam_reviews/data/posList.csv")
pos = list(pos['Positive'])
pos = list(neutral.loc[neutral['description'].isin(pos)].description+' '+neutral.loc[neutral['description'].isin(pos)].aspect)