**J060 CW ASSIGNMENT 3: TOKENISATION, LEMMAS, POS_TAGS USING WORDNET**

In [0]:
import gzip 
import pandas as pd
data=pd.read_json("http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Office_Products_5.json.gz",lines=True)

In [0]:
# word tokenizer
# lemmatiser
# pos_tag is a classifier

import nltk
import re
from nltk.corpus import wordnet

In [0]:
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [4]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [0]:
tokeniser=MWETokenizer()
mwe_tokens=tokeniser.tokenize(data.iloc[0,4].split())

In [6]:
print(mwe_tokens)

['I', 'bought', 'my', 'first', 'HP12C', 'in', 'about', '1984', 'or', 'so,', 'and', 'it', 'served', 'me', 'faithfully', 'until', '2002', 'when', 'I', 'lost', 'it', 'while', 'travelling.', 'I', 'searched', 'for', 'another', 'one', 'to', 'replace', 'it,', 'but', 'found', 'one', 'difficult', 'to', 'come', 'by', 'in', 'my', 'area.', 'So,', 'I', 'decided', 'to', 'buy', 'up', 'and', 'purchased', 'an', 'HP', '49G.', 'What', 'a', 'mistake!', 'I', 'know', 'that', 'many', 'people', 'view', 'the', 'HP', '49G', '(now', '49G+)', 'as', 'the', 'flagship', 'of', 'the', 'HP', 'line,', 'but', 'for', 'me', 'that', 'was', 'a', 'disaster.The', '49G', 'may', 'be', 'powerful,', 'but', 'using', 'it', 'is', 'extremely', 'counterintuitive...and', 'the', 'manual', 'was', 'sketchy', 'at', 'best.', 'The', '12C,', 'on', 'the', 'other', 'hand,', 'does', 'what', 'I', 'need', 'in', 'a', 'way', 'that', 'makes', 'good', 'sense', 'to', 'me.If', 'you', 'are', 'looking', 'for', 'a', 'solid,', 'reliable', 'calculator,', 'the

In [0]:
tokens=word_tokenize(data.iloc[0,4])

In [0]:
# The whole point of MWEtokeniser is to combine multiple words into the same bunch using a special character like + or _.
# MWEtokeniser has no role here.
# word_tokenize tokenises words more properly than MWETokeniser.

Cleaning function makes it easier to tokenize words.


In [0]:
def clean_string(x):
   return re.sub(r'[\[\],.)(\!;:]',' ',x)

def get_wordnet_pos(word):
  if word.startswith('V'):
    return wordnet.VERB
  elif word.startswith('J'):
    return wordnet.ADJ
  elif word.startswith('N'):
    return wordnet.NOUN
  elif word.startswith('R'):
    return wordnet.ADV      
  else: 
    return wordnet.NOUN   

In [0]:
def tok_lem_adj_pos_dict(x):
  #Do not use set because it changes the pos_tags
  tokens=word_tokenize(x)
  lemmatizer=WordNetLemmatizer()
  lem_list=[]
  pos_tags=nltk.pos_tag(tokens)
  adj=[]

  for i in range(0,len(pos_tags)):
    lem_list.append([pos_tags[i][0],lemmatizer.lemmatize(pos_tags[i][0],pos=get_wordnet_pos(pos_tags[i][1]))])

  for i in range(0,len(pos_tags)):
    if pos_tags[i][1].startswith('J'):
      adj.append(pos_tags[i][0])

  adj_count=len(adj)
  return {'word_tokens':tokens,'lemmas':lem_list, 'part_of_speech_tags':pos_tags, 'adjectives':adj, 'adj_count':adj_count }

The whole point of lemmatizer is to get to the root of a given word in a particular context. If we do not pass part-of-speech tags to the lemmatize function, then the root word will not be found in any context and it wont be efficient. Although finding out root words with such a large amount of data is time consuming because the averaged perceptron tagger uses different notations for the pos_tags whereas wordnet tags are limited and different.

In [60]:
tokens=word_tokenize(clean_string(data.iloc[0,4]))
lemmatizer=WordNetLemmatizer()
lem_list=[]
pos_tags=nltk.pos_tag(tokens)
  
for i in range(0,len(pos_tags)):
    lem_list.append([pos_tags[i][0],lemmatizer.lemmatize(pos_tags[i][0],pos=get_wordnet_pos(pos_tags[i][1]))])

    #lem_list contains a list whose first element shows the original words and the second element shows the lemma..

print(lem_list) 
#  return {'word_tokens':tokens,'lemmas':lem_list, 'part_of_speech_tags':pos_tags }

[['I', 'I'], ['bought', 'buy'], ['my', 'my'], ['first', 'first'], ['HP12C', 'HP12C'], ['in', 'in'], ['about', 'about'], ['1984', '1984'], ['or', 'or'], ['so', 'so'], ['and', 'and'], ['it', 'it'], ['served', 'serve'], ['me', 'me'], ['faithfully', 'faithfully'], ['until', 'until'], ['2002', '2002'], ['when', 'when'], ['I', 'I'], ['lost', 'lose'], ['it', 'it'], ['while', 'while'], ['travelling', 'travel'], ['I', 'I'], ['searched', 'search'], ['for', 'for'], ['another', 'another'], ['one', 'one'], ['to', 'to'], ['replace', 'replace'], ['it', 'it'], ['but', 'but'], ['found', 'find'], ['one', 'one'], ['difficult', 'difficult'], ['to', 'to'], ['come', 'come'], ['by', 'by'], ['in', 'in'], ['my', 'my'], ['area', 'area'], ['So', 'So'], ['I', 'I'], ['decided', 'decide'], ['to', 'to'], ['buy', 'buy'], ['up', 'up'], ['and', 'and'], ['purchased', 'purchase'], ['an', 'an'], ['HP', 'HP'], ['49G', '49G'], ['What', 'What'], ['a', 'a'], ['mistake', 'mistake'], ['I', 'I'], ['know', 'know'], ['that', 'that

In [12]:
#faithfully is an adverb according to pos_tags
#To check if our get_wordnet_pos function is working perfectly
print(lemmatizer.lemmatize('faithfully',pos='r'))
print(lemmatizer.lemmatize('travelling',pos='v'))
print(lemmatizer.lemmatize('travelling'))
# We can see that if proper pos_tags are not supplied to the lemmatizer, it is of no use.

faithfully
travel
travelling


In [37]:
temp=clean_string(data.iloc[0,4])
print(tok_lem_adj_pos_dict(temp)['adjectives'])

['searched', 'many', 'powerful', 'other', 'solid', 'programmable', 'reliable', 'lost', 'old', 'first', 'know', 'difficult', 'ready', 'best', 'good', 'manual', 'short']


In [0]:
data['clean_rev_text']=data['reviewText'].apply(clean_string)

In [0]:
data['nltk_rev_text']=data['clean_rev_text'].apply(tok_lem_adj_pos_dict)

In [0]:
data=data[['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText','clean_rev_text','nltk_rev_text',
       'overall', 'summary', 'unixReviewTime', 'reviewTime']]

In [68]:
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,clean_rev_text,nltk_rev_text,overall,summary,unixReviewTime,reviewTime
0,A32T2H8150OJLU,B00000JBLH,ARH,"[3, 4]","I bought my first HP12C in about 1984 or so, a...",I bought my first HP12C in about 1984 or so a...,"{'word_tokens': ['I', 'bought', 'my', 'first',...",5,"A solid performer, and long time friend",1094169600,"09 3, 2004"
1,A3MAFS04ZABRGO,B00000JBLH,"Let it Be ""Alan""","[7, 9]",WHY THIS BELATED REVIEW? I feel very obliged t...,WHY THIS BELATED REVIEW? I feel very obliged t...,"{'word_tokens': ['WHY', 'THIS', 'BELATED', 'RE...",5,"Price of GOLD is up, so don't bury the golden ...",1197676800,"12 15, 2007"
2,A1F1A0QQP2XVH5,B00000JBLH,Mark B,"[3, 3]",I have an HP 48GX that has been kicking for mo...,I have an HP 48GX that has been kicking for mo...,"{'word_tokens': ['I', 'have', 'an', 'HP', '48G...",2,"Good functionality, but not durable like old HPs",1293840000,"01 1, 2011"
3,A49R5DBXXQDE5,B00000JBLH,R. D Johnson,"[7, 8]",I've started doing more finance stuff recently...,I've started doing more finance stuff recently...,"{'word_tokens': ['I', ''ve', 'started', 'doing...",5,One of the last of an almost extinct species,1145404800,"04 19, 2006"
4,A2XRMQA6PJ5ZJ8,B00000JBLH,Roger J. Buffington,"[0, 0]",For simple calculations and discounted cash fl...,For simple calculations and discounted cash fl...,"{'word_tokens': ['For', 'simple', 'calculation...",5,Still the best,1375574400,"08 4, 2013"


In [69]:
data.nltk_rev_text[0:-1]

0        {'word_tokens': ['I', 'bought', 'my', 'first',...
1        {'word_tokens': ['WHY', 'THIS', 'BELATED', 'RE...
2        {'word_tokens': ['I', 'have', 'an', 'HP', '48G...
3        {'word_tokens': ['I', ''ve', 'started', 'doing...
4        {'word_tokens': ['For', 'simple', 'calculation...
                               ...                        
53252    {'word_tokens': ['I', 'like', 'theAccuteck', '...
53253    {'word_tokens': ['What', 'I', 'like', 'about',...
53254    {'word_tokens': ['This', 'Accuteck', 'ShipPro'...
53255    {'word_tokens': ['I', 'ship', 'a', 'lot', 'of'...
53256    {'word_tokens': ['This', 'is', 'a', 'great', '...
Name: nltk_rev_text, Length: 53257, dtype: object

In [70]:
data.nltk_rev_text[0]['adj_count']

16

In [71]:
data.nltk_rev_text[0]['adjectives']

['first',
 'difficult',
 'many',
 'powerful',
 'counterintuitive',
 'other',
 'good',
 'solid',
 'reliable',
 'programmable',
 'basic',
 'many',
 'manual',
 'short',
 'ready',
 'old']

In [72]:
data.nltk_rev_text[0]['word_tokens']

['I',
 'bought',
 'my',
 'first',
 'HP12C',
 'in',
 'about',
 '1984',
 'or',
 'so',
 'and',
 'it',
 'served',
 'me',
 'faithfully',
 'until',
 '2002',
 'when',
 'I',
 'lost',
 'it',
 'while',
 'travelling',
 'I',
 'searched',
 'for',
 'another',
 'one',
 'to',
 'replace',
 'it',
 'but',
 'found',
 'one',
 'difficult',
 'to',
 'come',
 'by',
 'in',
 'my',
 'area',
 'So',
 'I',
 'decided',
 'to',
 'buy',
 'up',
 'and',
 'purchased',
 'an',
 'HP',
 '49G',
 'What',
 'a',
 'mistake',
 'I',
 'know',
 'that',
 'many',
 'people',
 'view',
 'the',
 'HP',
 '49G',
 'now',
 '49G+',
 'as',
 'the',
 'flagship',
 'of',
 'the',
 'HP',
 'line',
 'but',
 'for',
 'me',
 'that',
 'was',
 'a',
 'disaster',
 'The',
 '49G',
 'may',
 'be',
 'powerful',
 'but',
 'using',
 'it',
 'is',
 'extremely',
 'counterintuitive',
 'and',
 'the',
 'manual',
 'was',
 'sketchy',
 'at',
 'best',
 'The',
 '12C',
 'on',
 'the',
 'other',
 'hand',
 'does',
 'what',
 'I',
 'need',
 'in',
 'a',
 'way',
 'that',
 'makes',
 'good',

In [73]:
data.nltk_rev_text[0]['lemmas']

[['I', 'I'],
 ['bought', 'buy'],
 ['my', 'my'],
 ['first', 'first'],
 ['HP12C', 'HP12C'],
 ['in', 'in'],
 ['about', 'about'],
 ['1984', '1984'],
 ['or', 'or'],
 ['so', 'so'],
 ['and', 'and'],
 ['it', 'it'],
 ['served', 'serve'],
 ['me', 'me'],
 ['faithfully', 'faithfully'],
 ['until', 'until'],
 ['2002', '2002'],
 ['when', 'when'],
 ['I', 'I'],
 ['lost', 'lose'],
 ['it', 'it'],
 ['while', 'while'],
 ['travelling', 'travel'],
 ['I', 'I'],
 ['searched', 'search'],
 ['for', 'for'],
 ['another', 'another'],
 ['one', 'one'],
 ['to', 'to'],
 ['replace', 'replace'],
 ['it', 'it'],
 ['but', 'but'],
 ['found', 'find'],
 ['one', 'one'],
 ['difficult', 'difficult'],
 ['to', 'to'],
 ['come', 'come'],
 ['by', 'by'],
 ['in', 'in'],
 ['my', 'my'],
 ['area', 'area'],
 ['So', 'So'],
 ['I', 'I'],
 ['decided', 'decide'],
 ['to', 'to'],
 ['buy', 'buy'],
 ['up', 'up'],
 ['and', 'and'],
 ['purchased', 'purchase'],
 ['an', 'an'],
 ['HP', 'HP'],
 ['49G', '49G'],
 ['What', 'What'],
 ['a', 'a'],
 ['mistake', 'm

In [74]:
data.nltk_rev_text[0]['part_of_speech_tags']

[('I', 'PRP'),
 ('bought', 'VBD'),
 ('my', 'PRP$'),
 ('first', 'JJ'),
 ('HP12C', 'NNP'),
 ('in', 'IN'),
 ('about', 'RB'),
 ('1984', 'CD'),
 ('or', 'CC'),
 ('so', 'RB'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('served', 'VBD'),
 ('me', 'PRP'),
 ('faithfully', 'RB'),
 ('until', 'IN'),
 ('2002', 'CD'),
 ('when', 'WRB'),
 ('I', 'PRP'),
 ('lost', 'VBD'),
 ('it', 'PRP'),
 ('while', 'IN'),
 ('travelling', 'VBG'),
 ('I', 'PRP'),
 ('searched', 'VBD'),
 ('for', 'IN'),
 ('another', 'DT'),
 ('one', 'CD'),
 ('to', 'TO'),
 ('replace', 'VB'),
 ('it', 'PRP'),
 ('but', 'CC'),
 ('found', 'VBD'),
 ('one', 'CD'),
 ('difficult', 'JJ'),
 ('to', 'TO'),
 ('come', 'VB'),
 ('by', 'IN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('area', 'NN'),
 ('So', 'IN'),
 ('I', 'PRP'),
 ('decided', 'VBD'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('up', 'RP'),
 ('and', 'CC'),
 ('purchased', 'VBD'),
 ('an', 'DT'),
 ('HP', 'NNP'),
 ('49G', 'CD'),
 ('What', 'WP'),
 ('a', 'DT'),
 ('mistake', 'NN'),
 ('I', 'PRP'),
 ('know', 'VBP'),
 ('that', 'IN'),
 ('