## Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

### How do we identify product features?
<img src = "./images/keywords.png">

In [4]:
# Three toy review sentences, each mentioning a different product feature.
sent1, sent2, sent3 = (
    "I loved the screen on this phone.",
    "The battery life on this phone is great.",
    "The speakers are pathetic.",
)

### Let's do a POS parse and see if we can figure out some patterns.

In [2]:
# Import spaCy and load the English pipeline whose output supplies the
# PoS tags and lemmas used in the cells below.
import spacy

SPACY_MODEL_NAME = "en_core_web_sm"
model = spacy.load(SPACY_MODEL_NAME)



In [5]:
# Parse sent1 and show, per token: text, coarse PoS tag, fine-grained tag.
tokens = model(sent1)

for tok in tokens:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")


I PRON PRP
loved VERB VBD
the DET DT
screen NOUN NN
on ADP IN
this DET DT
phone NOUN NN
. PUNCT .


In [6]:
# Parse sent2 and show, per token: text, coarse PoS tag, fine-grained tag.
tokens = model(sent2)

for tok in tokens:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")

The DET DT
battery NOUN NN
life NOUN NN
on ADP IN
this DET DT
phone NOUN NN
is AUX VBZ
great ADJ JJ
. PUNCT .


In [7]:
# Parse sent3 and show, per token: text, coarse PoS tag, fine-grained tag.
tokens = model(sent3)

for tok in tokens:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")


The DET DT
speakers NOUN NNS
are AUX VBP
pathetic ADJ JJ
. PUNCT .


#### **Product features such as `screen`, `battery`, `speaker` have a POS tag of NOUN**

## Summary
- Product features such as `screen`, `battery` and `speaker` have a POS tag of Noun
- If we can find the frequency count of all the nouns in our data, then by looking at top-n nouns we can find out what product features people are talking about
- Next, we check this hypothesis on a real-world dataset

In [8]:
# Location of the reviews text file that the next cell opens and reads.
# NOTE(review): absolute path — looks environment-specific; adjust it if the file lives elsewhere.
samsung_file = "/content/Samsung.txt"

In [9]:
# Read the whole reviews file into a single string. The context manager
# closes the handle even if the read raises, which the manual
# open()/close() pair did not guarantee.
with open(samsung_file, 'r', encoding='utf-8') as con:
    samsung_reviews = con.read()

In [12]:
# Number of newline-separated lines in the file (separator count plus one).
samsung_reviews.count('\n') + 1

46355

In [13]:
# Take everything before the first newline, i.e. the first line of the file.
review1 = samsung_reviews.partition('\n')[0]
review1

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [14]:
# Parse the first review and show, per token: text, coarse PoS tag, fine-grained tag.
tokens = model(review1)

for tok in tokens:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")

I PRON PRP
feel VERB VBP
so ADV RB
LUCKY ADJ JJ
to PART TO
have AUX VB
found VERB VBN
this PRON DT
used ADJ JJ
( PUNCT -LRB-
phone NOUN NN
to ADP IN
us PRON PRP
& CCONJ CC
not PART RB
used VERB VBN
hard ADV RB
at ADV RB
all ADV RB
) PUNCT -RRB-
, PUNCT ,
phone NOUN NN
on ADP IN
line NOUN NN
from ADP IN
someone PRON NN
who PRON WP
upgraded VERB VBD
and CCONJ CC
sold VERB VBD
this DET DT
one NOUN NN
. PUNCT .
My PRON PRP$
Son PROPN NNP
liked VERB VBD
his PRON PRP$
old ADJ JJ
one NOUN NN
that PRON WDT
finally ADV RB
fell VERB VBD
apart ADV RB
after ADP IN
2.5 NUM CD
+ NUM CD
years NOUN NNS
and CCONJ CC
did AUX VBD
n't PART RB
want VERB VB
an DET DT
upgrade NOUN NN
! PUNCT .
! PUNCT .
Thank VERB VBP
you PRON PRP
Seller PROPN NNP
, PUNCT ,
we PRON PRP
really ADV RB
appreciate VERB VBP
it PRON PRP
& CCONJ CC
your PRON PRP$
honesty NOUN NN
re NOUN NN
: PUNCT :
said VERB VBD
used VERB VBN
phone NOUN NN
. PUNCT .
I PRON PRP
recommend VERB VBP
this DET DT
seller NOUN NN
very ADV RB
highly ADV RB

In [16]:
# Parse the first review once and collect, per token, its surface text,
# its lemma and its coarse PoS tag as three parallel lists.
review1_doc = model(review1)
text = [token.text for token in review1_doc]
lemma = [token.lemma_ for token in review1_doc]
pos = [token.pos_ for token in review1_doc]

# Assemble the parallel lists into a table with one row per token.
nlp_table = pd.DataFrame({'text': text, 'lemma': lemma, 'pos': pos})
nlp_table.head()

Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,lucky,ADJ
4,to,to,PART


In [19]:
# Most frequent raw tokens in the table (first five rows of the sorted counts).
(
    nlp_table.loc[:, 'text']
    .value_counts()
    .sort_values(ascending=False)
    .head()
)

Unnamed: 0_level_0,count
text,Unnamed: 1_level_1
!,4
used,3
this,3
phone,3
&,3


In [26]:
# Lemma frequencies restricted to rows tagged NOUN; show the first five.
(
    nlp_table.loc[nlp_table['pos'] == 'NOUN', 'lemma']
    .value_counts()
    .sort_values(ascending=False)
    .head()
)

Unnamed: 0_level_0,count
lemma,Unnamed: 1_level_1
phone,3
one,2
line,1
year,1
upgrade,1


In [30]:
# Full lemma frequency table for rows tagged NOUN, most frequent first.
(
    nlp_table.loc[nlp_table['pos'] == 'NOUN', 'lemma']
    .value_counts()
    .sort_values(ascending=False)
)

Unnamed: 0_level_0,count
lemma,Unnamed: 1_level_1
phone,3
one,2
line,1
year,1
upgrade,1
honesty,1
re,1
seller,1


In [31]:
# Dimensions of nlp_table as (rows, columns); it has one row per token.
nlp_table.shape

(86, 3)

In [44]:
from tqdm import tqdm


# Only the first N_REVIEWS lines of the file are parsed in this pass.
N_REVIEWS = 1000

# Per-token columns for the table rebuilt in the following cells, plus the
# lower-cased lemmas of every NOUN token for the frequency count.
# Previously pos/lemma/text were reset here but never filled, so the table
# built from them afterwards was empty on a fresh top-to-bottom run.
pos = []
lemma = []
text = []
nouns = []

for review in tqdm(samsung_reviews.split("\n")[:N_REVIEWS]):
    for tok in model(review):
        # Record every token so the token table covers all parsed reviews.
        pos.append(tok.pos_)
        lemma.append(tok.lemma_)
        text.append(tok.text)
        if tok.pos_ == "NOUN":
            # Lower-case so differently capitalised lemmas are counted together.
            nouns.append(tok.lemma_.lower())

100%|██████████| 1000/1000 [00:12<00:00, 77.19it/s]


In [45]:
# Five most frequent noun lemmas (value_counts returns them in descending order).
pd.Series(nouns).value_counts().iloc[:5]


Unnamed: 0,count
phone,1214
time,92
battery,89
price,87
screen,86


In [34]:
# Build the token table from the three parallel lists, one row per token.
nlp_table = pd.DataFrame(dict(text=text, lemma=lemma, pos=pos))
nlp_table.head()

Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,lucky,ADJ
4,to,to,PART


In [35]:
# Dimensions of nlp_table as (rows, columns); it has one row per token.
nlp_table.shape

(41187, 3)

In [36]:
# Lemma frequencies over all rows tagged NOUN, most frequent first.
(
    nlp_table.loc[nlp_table['pos'] == 'NOUN', 'lemma']
    .value_counts()
    .sort_values(ascending=False)
)

Unnamed: 0_level_0,count
lemma,Unnamed: 1_level_1
phone,1214
time,90
battery,89
price,87
screen,86
...,...
touchscreen,1
4.4.2,1
behaviour,1
taping,1
