In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import spacy
from tqdm import tqdm
# Load spaCy's small English pipeline once; every parsing helper in this notebook reuses `nlp`.
# The former `parse=True, tag=True, entity=True` keywords are spaCy 1.x-era options that are
# not parameters of `spacy.load` in later releases, so they are dropped here.
nlp = spacy.load('en_core_web_sm')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Environment setup: download the spaCy English models and install vaderSentiment.
# NOTE(review): the import cell loads 'en_core_web_sm' and imports vaderSentiment, so on a
# fresh kernel this cell has to be run before that one.
!python -m spacy download en
!python -m spacy download en_core_web_sm
!pip install vaderSentiment

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 2.7MB/s 
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.2.1


In [0]:
# Read the scraped Toyota reviews into a DataFrame.
toy_rev = pd.read_csv(
    'Scrapped_Car_Reviews_Toyota.csv',
    index_col=False,
    engine='python',
)
# Build one free-text field per row: the title immediately followed by the body.
toy_rev['review'] = toy_rev['Review_Title'].add(toy_rev['Review'])


In [6]:
# Quick size check of the loaded table: (number of rows, number of columns).
toy_rev.shape

(22702, 8)

In [0]:
# Parse one short sample sentence and draw its dependency tree inline in the notebook,
# to see which dependency labels the extraction helpers can key on.
txt = 'Great car and has long range'
doc = nlp(txt)
spacy.displacy.render(doc,style='dep',jupyter=True)

In [0]:
# NOTE(review): leftover exploration. The parsed `doc` has no `dtype` attribute, so this
# cell raises the AttributeError recorded in its output; `type(doc)` would report the
# object's class instead.
doc.dtype

AttributeError: ignored

In [0]:
# Competitor brand names used to flag reviews that mention another make.
# They are written in lower case because they are matched against the output of
# clean_text, which lower-cases the review text.
competitors = ['chevy', 'ford','nissan','honda','chevrolet','volkswagen','benz','mercedes','subaru','vw']

# Planned processing steps:
## identify competitors 
## Remove stopwords 
## classify pairs / parts of speech 
## Identify key words

In [0]:
# Smoke test: run clean_text on a sample news paragraph and print the normalised result.
# NOTE(review): clean_text is defined in the following cell, so on a fresh kernel that
# cell has to be executed before this one.
text = "*Now In March, see when cherry blossoms are beginning to bloom, the Japanese leg of the Olympic Torch relay will kick off at a soccer stadium 12 miles from the Fukushima Daiichi Nuclear Power Plant. A nearby sports complex will also host the Games’ baseball and softball matches. That’s bound to surprise and maybe worry some attendees, whose memory of the power plant’s catastrophic meltdown nine years ago is still fresh. But in fact, the government has been aggressively decontaminating and rehabilitating Fukushima prefecture, and life is slowly returning to the exclusion zone."
text = clean_text(text)
print (text)

now in march see when cherry blossoms are beginning to bloom the japanese leg of the olympic torch relay will kick off at a soccer stadium  miles from the fukushima daiichi nuclear power plant a nearby sports complex will also host the games baseball and softball matches thats bound to surprise and maybe worry some attendees whose memory of the power plants catastrophic meltdown nine years ago is still fresh but in fact the government has been aggressively decontaminating and rehabilitating fukushima prefecture and life is slowly returning to the exclusion zone


In [0]:
def clean_text(text):
    """Normalise a raw review string into lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case the text and map typographic apostrophes to ``'``;
      2. drop a leading-word ``re:`` marker, HTML tags and ``[bracketed]`` spans;
      3. expand common English contractions (``can't`` -> ``cannot``, ``they've`` ->
         ``they have`` ...);
      4. remove underscores, punctuation and digits;
      5. collapse runs of whitespace (including newlines) and strip the ends.

    Contractions are expanded *before* punctuation is removed; once the apostrophe
    is stripped the contraction patterns can no longer match.

    Parameters
    ----------
    text : object
        The raw text. Anything that is not ``None`` is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or the literal ``'No Subject'`` when ``text`` is ``None``.
    """
    if text is None:
        return 'No Subject'

    # (pattern, replacement) pairs for lower-cased text. Order matters: the specific
    # "won't"/"can't" forms must be rewritten before the generic "n't" rule. The
    # look-behind on the suffix rules keeps opening quotes ('reliable') from being
    # mistaken for contractions.
    contractions = (
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "cannot"),
        (r"n't\b", " not"),
        (r"\bi'm\b", "i am"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
    )

    data = str(text).lower()
    # Curly apostrophes would otherwise hide contractions from the rules below.
    data = data.replace('\u2019', "'").replace('\u2018', "'")
    # Only strip "re:" when it is its own word, so "care:" is not cut down to "ca".
    data = re.sub(r'\bre:', '', data)
    # Strip HTML tags; requiring a letter after "<" leaves comparisons like "< 20" alone.
    data = re.sub(r'</?[a-z][^<>]*>', ' ', data)
    # Remove data between square brackets
    data = re.sub(r'\[[^]]*\]', '', data)

    for pattern, replacement in contractions:
        data = re.sub(pattern, replacement, data)

    # "_" counts as a word character, so it needs its own rule; every other
    # punctuation mark (hyphens included) is removed by the character class.
    data = re.sub('_', '', data)
    data = re.sub(r'[^\w\s]', '', data)
    data = re.sub(r'[0-9]+', '', data)
    # A free-standing "m" is read as "am" (e.g. "i m happy").
    data = re.sub(r" m ", " am ", data)
    # Newlines and the gaps left by the removals above become single spaces.
    data = re.sub(r'\s+', ' ', data).strip()
    return data

In [0]:
def pos_logic (string):
    """Extract opinion/aspect word pairs from a review via spaCy's dependency parse.

    The text is parsed with the module-level ``nlp`` pipeline and every token is
    inspected once:

    * NOUN - a left child with dependency ``compound`` yields a
      ``(compound phrase, head noun)`` tuple; a left ``amod`` adjective yields
      ``"adjective noun"``. When that adjective is itself modified by an adverb
      (``advmod``), only the longer ``"adverb adjective noun"`` phrase is kept.
    * VERB - ``advmod`` adverbs on either side yield ``"adverb verb"`` /
      ``"verb adverb"``; a left ``neg`` adverb yields ``"neg verb"``.
    * ADJ  - an ``xcomp`` child on the right together with its ``aux`` yields
      ``"adjective aux complement"``. If the adjective has a ``neg`` child on its
      left, the phrase is prefixed with the negation and stored with the negations.

    Parameters
    ----------
    string : str
        Text to analyse (the notebook passes the output of ``clean_text``).

    Returns
    -------
    tuple of lists
        ``(amod_pairs, advmod_pairs, compound_pairs, xcomp_pairs, neg_pairs)``.
        ``compound_pairs`` holds 2-tuples; the other lists hold plain strings.
    """
    amod_pairs = []
    advmod_pairs = []
    compound_pairs = []
    xcomp_pairs = []
    neg_pairs = []
    doc = nlp(string) # apply spacy nlp, doc defined

    for token in doc:
        # Labels are compared with ==, not `is`: identity of equal strings is an
        # interpreter detail and must not decide which branch runs.
        if token.pos_ == 'NOUN':
            for child in token.lefts:
                if child.dep_ == 'compound':
                    compound_pairs.append((child.text + ' ' + token.text, token.text))
                if child.dep_ == 'amod' and child.pos_ == 'ADJ':
                    # Prefer "very good car" over "good car" when an adverb qualifies
                    # the adjective; fall back to the bare pair otherwise.
                    qualified = [adv.text + ' ' + child.text + ' ' + token.text
                                 for adv in child.lefts if adv.dep_ == 'advmod']
                    amod_pairs.extend(qualified or [child.text + ' ' + token.text])
        elif token.pos_ == 'VERB':
            for child in token.lefts:
                if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                    advmod_pairs.append(child.text + ' ' + token.text)
                if child.dep_ == 'neg' and child.pos_ == 'ADV':
                    neg_pairs.append(child.text + ' ' + token.text)
            for child in token.rights:
                if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                    advmod_pairs.append(token.text + ' ' + child.text)
        elif token.pos_ == 'ADJ':
            # Negation is a property of the adjective as a whole, so look it up once
            # instead of pairing left and right children positionally.
            negations = [left for left in token.lefts if left.dep_ == 'neg']
            for complement in token.rights:
                if complement.dep_ != 'xcomp':
                    continue
                for aux in complement.lefts:
                    if aux.dep_ != 'aux':
                        continue
                    phrase = token.text + ' ' + aux.text + ' ' + complement.text
                    if negations:
                        neg_pairs.append(negations[0].text + ' ' + phrase)
                    else:
                        xcomp_pairs.append(phrase)
    return  amod_pairs, advmod_pairs, compound_pairs, xcomp_pairs, neg_pairs

In [0]:
def pos_logic_comp (string):
    """Extract opinion/aspect word pairs from a review that mentions a competitor.

    Applies the same dependency rules as ``pos_logic`` through the module-level
    ``nlp`` pipeline, but returns the lists in a different order (see Returns):

    * NOUN - left ``compound`` children give ``(compound phrase, head noun)``
      tuples; a left ``amod`` adjective gives ``"adjective noun"``, or only the
      longer ``"adverb adjective noun"`` when the adjective has an ``advmod`` child.
    * VERB - ``advmod`` adverbs on either side and a left ``neg`` adverb.
    * ADJ  - a right ``xcomp`` child with its ``aux`` gives
      ``"adjective aux complement"``; when the adjective has a left ``neg`` child the
      negation-prefixed phrase goes to the negation list instead, so that "not easy
      to drive" is not recorded as "easy to drive".

    Parameters
    ----------
    string : str
        Text to analyse (the notebook passes the output of ``clean_text``).

    Returns
    -------
    tuple of lists
        ``(eamod_pairs, eadvmod_pairs, ecompound_pairs, eneg_pairs, excomp_pairs)``.
        Note that the negation list comes *before* the xcomp list here.
    """
    eamod_pairs = []
    eadvmod_pairs = []
    ecompound_pairs = []
    eneg_pairs = []
    excomp_pairs = []
    doc = nlp(string)

    for token in doc:
        # Labels are compared with ==, not `is`: identity of equal strings is an
        # interpreter detail and must not decide which branch runs.
        if token.pos_ == 'NOUN':
            for modifier in token.lefts:
                if modifier.dep_ == 'compound':
                    ecompound_pairs.append((modifier.text + ' ' + token.text, token.text))
                if modifier.dep_ == 'amod' and modifier.pos_ == 'ADJ':
                    # Decide per adjective whether an adverb-qualified form exists,
                    # so a phrase found for an earlier noun cannot influence this one.
                    qualified = [adv.text + ' ' + modifier.text + ' ' + token.text
                                 for adv in modifier.lefts if adv.dep_ == 'advmod']
                    eamod_pairs.extend(qualified or [modifier.text + ' ' + token.text])
        elif token.pos_ == 'VERB':
            for modifier in token.lefts:
                if modifier.dep_ == 'advmod' and modifier.pos_ == 'ADV':
                    eadvmod_pairs.append(modifier.text + ' ' + token.text)
                if modifier.dep_ == 'neg' and modifier.pos_ == 'ADV':
                    eneg_pairs.append(modifier.text + ' ' + token.text)
            for modifier in token.rights:
                if modifier.dep_ == 'advmod' and modifier.pos_ == 'ADV':
                    eadvmod_pairs.append(token.text + ' ' + modifier.text)
        elif token.pos_ == 'ADJ':
            negations = [left for left in token.lefts if left.dep_ == 'neg']
            for complement in token.rights:
                if complement.dep_ != 'xcomp':
                    continue
                for aux in complement.lefts:
                    if aux.dep_ != 'aux':
                        continue
                    phrase = token.text + ' ' + aux.text + ' ' + complement.text
                    if negations:
                        eneg_pairs.append(negations[0].text + ' ' + phrase)
                    else:
                        excomp_pairs.append(phrase)
    return eamod_pairs, eadvmod_pairs, ecompound_pairs, eneg_pairs, excomp_pairs

In [15]:
# Per-review results; each list gets exactly one entry per row of toy_rev so that
# they can be attached to the frame as columns afterwards.
easpect_terms = []
ecomp_terms = []
aspect_terms = []
comp_terms= []
# Flat log of every competitor hit (one entry per competitor per review).
competitors_mentioned = []

# Match a competitor name only at the start of a word: "fords" and a fused
# "mercedesbenz" still count, but "affordable" no longer registers as "ford".
competitor_patterns = [(name, re.compile(r'\b' + re.escape(name))) for name in competitors]


def _attach_compounds(pairs, compound_pairs):
    """Return `pairs` with each head noun replaced by its compound phrase.

    `compound_pairs` holds (compound phrase, head noun) tuples. The head must match
    a whole word of the pair, so the head "mi" does not rewrite "minor quirks".
    Repeated identical tuples are applied once.
    """
    unique_compounds = list(dict.fromkeys(compound_pairs))
    expanded = []
    for phrase in pairs:
        for compound, head in unique_compounds:
            # A function replacement keeps backslashes in `compound` literal.
            phrase = re.sub(r'\b' + re.escape(head) + r'\b',
                            lambda _match, value=compound: value,
                            phrase)
        expanded.append(phrase)
    return expanded


for text in tqdm(toy_rev['review']):
    # A missing title or body makes the concatenated review NaN; treat it as empty
    # text so the row still receives (empty) entries in every result list.
    cleaned_lines = '' if pd.isna(text) else clean_text(text)

    counter = 0
    for comp, pattern in competitor_patterns:
        if pattern.search(cleaned_lines):
            counter = counter + 1
            competitors_mentioned.append(comp)

    if counter > 0:
        # Review mentions a competitor: its terms go to the competition columns.
        eamod_pairs, eadvmod_pairs, ecompound_pairs, eneg_pairs, excomp_pairs = pos_logic_comp(cleaned_lines)
        # dict.fromkeys de-duplicates while keeping first-seen order (reproducible output).
        epairs = _attach_compounds(list(dict.fromkeys(eamod_pairs + eadvmod_pairs + eneg_pairs)),
                                   ecompound_pairs)
        easpect_terms.append(epairs)
        ecomp_terms.append(ecompound_pairs)
        #ecomp_terms.append (excomp_pairs)
        aspect_terms.append([])
        comp_terms.append([])
    else:
        amod_pairs, advmod_pairs, compound_pairs, xcomp_pairs, neg_pairs = pos_logic(cleaned_lines)
        pairs = _attach_compounds(list(dict.fromkeys(amod_pairs + advmod_pairs + neg_pairs)),
                                  compound_pairs)
        aspect_terms.append(pairs)
        comp_terms.append(compound_pairs)
        #comp_terms.append(xcomp_pairs)
        easpect_terms.append([])
        ecomp_terms.append([])

toy_rev['compound_nouns'] = comp_terms
toy_rev['aspect_keywords'] = aspect_terms
toy_rev['competition_comp_nouns'] = ecomp_terms
toy_rev['competition_aspects'] = easpect_terms
toy_rev.head()

100%|██████████| 22702/22702 [07:43<00:00, 48.99it/s]


Unnamed: 0.1,Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating,review,compound_nouns,aspect_keywords,competition_comp_nouns,competition_aspects
0,0,on 02/02/17 19:53 PM (PST),Ricardo,1997 Toyota Previa Minivan LE 3dr Minivan,"great vehicle, Toyota best design ever. thank you","there is no way back, enjoy what you have .",5.0,"great vehicle, Toyota best design ever. thank ...","[(vehicle toyota, toyota)]","[great vehicle toyota, best design]",[],[]
1,1,on 12/17/16 16:40 PM (PST),matt,1997 Toyota Previa Minivan LE All-Trac 3dr Min...,"my 4th previa, best van ever made!",1st 95 went over 300k before being totalled b...,5.0,"my 4th previa, best van ever made! 1st 95 went...","[(captain chairs, chairs), (craigslist talk, t...","[minor quirks, mini rv loads, middle bench, ro...",[],[]
2,2,on 04/14/10 07:43 AM (PDT),Joel G,1997 Toyota Previa Minivan LE 3dr Minivan,Mom's Taxi Babies Ride,Sold 86 Toyota Van 285K miles to be replaced ...,5.0,Mom's Taxi Babies Ride Sold 86 Toyota Van 285K...,"[(moms babies, babies), (taxi babies, babies),...","[remote bat, younger brothers, middle seat, ap...",[],[]
3,3,on 11/12/08 17:31 PM (PST),Dennis,1997 Toyota Previa Minivan LE All-Trac 3dr Min...,My Favorite Van Ever,"I have owned lots of vans, and the Previa is ...",4.875,My Favorite Van Ever I have owned lots of vans...,"[(fuel mileage, mileage), (toyota salesman, sa...","[toyota handling, mid engine, stupid handling,...",[],[]
4,4,on 04/14/08 22:47 PM (PDT),Alf Skrastins,1997 Toyota Previa Minivan LE All-Trac 3dr Min...,Best Minivan ever,My 1997 AWD Previa is the third one that I ha...,5.0,Best Minivan ever My 1997 AWD Previa is the th...,"[(k mi, mi), (gas mileage, mileage)]","[reasonable replacement, awd previa, third one...",[],[]


In [16]:
# Sanity check: every result list should hold one entry per review.
for terms in (comp_terms, aspect_terms, ecomp_terms, easpect_terms):
    print (len(terms))

22702
22702
22702
22702


In [0]:
# Persist the enriched review table (original columns plus the four extracted-term columns).
# NOTE(review): the file name has no extension, and to_csv is called with its defaults;
# presumably the 'Unnamed: 0' columns in the table above come from saving the index on
# earlier round trips - confirm, and pass index=False if that is not wanted.
toy_rev.to_csv('toy_rev_procsed')

In [0]:
# Reviewing the logic of substituting compound pairs into the pairs list to add context.
# Just keeping notes here.
# NOTE(review): the review is passed to pos_logic as-is here, whereas the main loop
# runs clean_text on it first.

amod_pairs, advmod_pairs, compound_pairs, xcomp_pairs, neg_pairs = pos_logic(toy_rev['review'][100])
pairs = list(set(amod_pairs+ advmod_pairs+ neg_pairs))

# Logic:
# - each compound pair is (compound phrase, head word), e.g. ('gas mileage', 'mileage')
# - for each compound pair, there is a match when its head word equals any whole word
#   of an entry in the pairs list
# - on a match, that word in the pairs entry is replaced with the compound phrase
#
# xcomp pairs and compound pairs themselves had to be left out of the pairs list.
#
# When compound_pairs is empty the inner loop simply does not run, so no separate
# emptiness check is needed.
for i in range(len(pairs)):
    for word in compound_pairs:
        # Use `word` (the current compound pair) for the replacement, and a function
        # as the replacement so backslashes in the phrase stay literal.
        pairs[i] = re.sub(r'\b' + re.escape(word[1]) + r'\b',
                          lambda _match, value=word[0]: value,
                          pairs[i])
  

For the sentiment analysis portion: the original notebook runs VADER on just the aspect terms. Measured against the table's own ratings, that classifier reaches 78% accuracy.


I will attempt to train a sentiment analysis model in a separate script.

# See the Multinomial NB analysis model in the separate file
- pickle can be used to save the model and import it here
- however, the NB model works on vectors, so the preprocessing done here isn't useful for it
- stick with VADER as the option here, because the accuracy is similar (75-78%)