In [1]:
import pandas as pd
#import numpy as np
from datetime import datetime
from textblob import TextBlob
import spacy

# Run spaCy on the CPU and load the en_core_web_trf English pipeline.
spacy.require_cpu()
nlp = spacy.load("en_core_web_trf")

# Load the pre-processed reviews; the 'date' column is parsed into datetimes.
df = pd.read_csv('data/processed_review_data.csv', parse_dates=['date'])

# Keep only reviews whose 'review_length' is at least MIN_REVIEW_LENGTH
# (presumably a word count -- TODO confirm against the preprocessing step).
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignment done in the cleaning cell does not trigger pandas'
# SettingWithCopyWarning.
MIN_REVIEW_LENGTH = 10
df = df[df['review_length'] >= MIN_REVIEW_LENGTH].copy()

In [2]:
# clean review text
from functions import lower_case,expandContractions,alpha_num,consec_dup,lemma
import re
def clean(text):
    """Normalise punctuation and spacing in a review, then spell-correct it.

    Steps, in order: turn '?', '!' and ':' into full stops; delete decimal
    numbers; drop every character other than ASCII letters, digits, '.' and
    space; collapse runs of full stops; collapse runs of spaces; finally run
    TextBlob's spelling correction.

    Parameters
    ----------
    text : str
        Review text to clean.

    Returns
    -------
    The object returned by ``TextBlob(text).correct()`` (not a plain ``str``).
    """
    # All patterns are raw strings: backslash-d and backslash-dot are invalid
    # escape sequences in ordinary string literals and emit a warning on
    # current Python versions.
    text = re.sub(r'[?!:]', '.', text)          # every sentence ends with '.'
    # NOTE: ':' has already become '.', so a time such as "10:30" is removed
    # here together with genuine decimals.
    text = re.sub(r'\d*\.\d+', '', text)        # remove all floats
    text = re.sub(r'[^a-zA-Z0-9. ]', '', text)  # remove all chars not listed
    text = re.sub(r'\.\.+', '. ', text)         # remove repeated full stops
    text = re.sub(r' +', ' ', text)             # remove extra whitespace
    return TextBlob(text).correct()             # correct spellings

# Run every review through each text-normalisation stage, in pipeline order;
# the output of one stage is the input of the next.
for stage in (expandContractions, clean, consec_dup, lemma):
    df['review_text'] = df['review_text'].map(stage)

In [3]:
# Split every cleaned review into fragments on '.' and flatten them into a
# single list. Splitting leaves empty / whitespace-only fragments around
# trailing or repeated full stops; those contain no noun subject or adjective,
# so they are dropped here rather than being sent through the spaCy pipeline
# in the next cell. Fragments that are kept are left unmodified.
sentences = [
    fragment
    for review in df.review_text
    for fragment in review.split('.')
    if fragment.strip()
]

In [4]:
# Extract aspects and descriptors
def _extract_aspect(doc):
    """Return the aspect/description pair found in one parsed sentence.

    The aspect is the text of the last token that is a noun in nominal-subject
    position (dep_ == 'nsubj' and pos_ == 'NOUN'). The description is the last
    adjective in the sentence, prefixed by any of its adverb children
    (e.g. "very good"). Either value is '' when no matching token exists.
    """
    target = ''
    descriptive_term = ''
    for token in doc:
        if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
            target = token.text
        if token.pos_ == 'ADJ':
            adverbs = [child.text for child in token.children if child.pos_ == 'ADV']
            descriptive_term = ' '.join(adverbs + [token.text])
    return {'aspect': target, 'description': descriptive_term}


# nlp.pipe streams the sentences through the pipeline in batches, which is the
# spaCy-recommended way to process many texts instead of one nlp() call each.
aspects = [_extract_aspect(doc) for doc in nlp.pipe(sentences)]

# remove entries with blank aspect or descriptor
aspects = [entry for entry in aspects if entry['aspect'] and entry['description']]

# Add sentiment polarity scores (computed on the description text only)
for entry in aspects:
    entry['sentiment'] = TextBlob(entry['description']).sentiment.polarity

# Explicit columns keep the frame's schema stable even when no aspect was found,
# so later cells can still select 'sentiment'.
sent_df = pd.DataFrame(aspects, columns=['aspect', 'description', 'sentiment'])
sent_df



Unnamed: 0,aspect,description,sentiment
0,joke,old,0.100000
1,paint,sweetest,0.000000
2,color,many,0.500000
3,crash,toxic,0.000000
4,content,alive,0.100000
...,...,...,...
11147,camogrinding,new,0.136364
11148,day,buggy,0.000000
11149,crash,entire,0.000000
11150,campaign,exciting,0.300000


In [5]:
# Show the aspect rows whose description scored a polarity of exactly 0.0.
is_neutral = sent_df['sentiment'].eq(0.0)
sent_df.loc[is_neutral]

Unnamed: 0,aspect,description,sentiment
1,paint,sweetest,0.0
3,crash,toxic,0.0
18,patch,last,0.0
19,shit,enough,0.0
20,good,same,0.0
...,...,...,...
11134,pitch,UNGODLY,0.0
11138,driver,additional,0.0
11139,player,permanent,0.0
11148,day,buggy,0.0
