## Importing libraries

In [15]:
import pandas as pd
import numpy as np
import os
import spacy 
from tqdm import tqdm

### Read reviews data

In [16]:
# Read the whole reviews file into one string.
# The context manager closes the handle even if read() raises.
with open("../Dataset/Samsung.txt", 'r', encoding="utf-8") as con:
    samsung_reviews = con.read()

### Can we reduce the time taken?
[Pipelines (Spacy)](https://spacy.io/usage/processing-pipelines)


<img src='./images/spacy_pipeline.png'>

In [17]:
# Shorten the pipeline: load the small English model with the
# parser and NER components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [18]:
# Collect the lower-cased lemma of every NOUN token in the first 1000 reviews.
sample_reviews = samsung_reviews.split("\n")[:1000]
nouns = [
    token.lemma_.lower()
    for text in tqdm(sample_reviews)
    for token in nlp(text)
    if token.pos_ == "NOUN"
]

100%|██████████████████████████████████████| 1000/1000 [00:03<00:00, 272.98it/s]


In [19]:
# Number of newline-separated reviews in the loaded text.
len(samsung_reviews.split("\n"))

46355

In [20]:
# Rough estimate (in seconds) for all 46355 reviews, scaled from an assumed 6 s per 1000 reviews.
# NOTE(review): the progress bar above reports ~3 s for 1000 reviews — confirm where the 6 s figure comes from.
(46355/1000)*6

278.13

In [21]:
# The estimate above (rounded to 278 s) expressed in minutes.
278/60

4.633333333333334

### Let's process all the reviews now and see if the time taken is less!

In [22]:
# Lemmatised, lower-cased NOUN tokens from every review.
# nlp.pipe streams the texts through the pipeline in batches instead of
# making one nlp() call per review. It returns a generator, so tqdm is
# given an explicit total to keep the progress bar meaningful.
all_reviews = samsung_reviews.split("\n")
nouns = []
for doc in tqdm(nlp.pipe(all_reviews), total=len(all_reviews)):
    nouns.extend(tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN")

100%|████████████████████████████████████| 46355/46355 [02:17<00:00, 337.43it/s]


In [25]:
# Preview only the first 100 nouns; echoing the whole list floods the cell output.
nouns[:100]

['lucky',
 'phone',
 'phone',
 'line',
 'one',
 'one',
 'year',
 'upgrade',
 'honesty',
 're',
 'phone',
 'seller',
 'phone',
 'grade',
 'pantach',
 'revue',
 'phone',
 'size',
 'surfing',
 'medium',
 'phone',
 'phone',
 'phone',
 'phone',
 'thing',
 'volume',
 'button',
 'setting',
 'job',
 'phone',
 'thaank',
 'g',
 'keyboard',
 'size',
 'phone',
 'function',
 'phone',
 'e',
 '-',
 'mail',
 'direction',
 'text',
 'messaging',
 'phone',
 'cell',
 'phone',
 'application',
 'phone',
 'run',
 'time',
 'time',
 'camera',
 'video',
 'web',
 'browsing',
 'battery',
 'life',
 'bit',
 'phone',
 'product',
 'day',
 'blemish',
 'side',
 'phone',
 'product',
 'guy',
 'situation',
 'item',
 'issue',
 'phon',
 'phone',
 'replacement',
 'model',
 'contract',
 'one',
 'seller',
 'confirmation',
 'delivery',
 'day',
 'seller',
 'phone',
 'hour',
 'order',
 'phone',
 'day',
 'phone',
 'datum',
 'one',
 'problem',
 'seller',
 'phone',
 'keyboard',
 'touch',
 'screen',
 'phone',
 'device',
 'service',
 

### Does the hypothesis of nouns capturing `product features` hold?

In [12]:
# Wrap the noun list in a Series so the lemmas can be counted,
# then show the five most frequent ones.
nouns = pd.Series(nouns)
noun_counts = nouns.value_counts()
noun_counts.head(5)

phone      1216
time         90
battery      90
screen       87
price        87
dtype: int64

In [13]:
# Ten most frequent noun lemmas.
nouns.value_counts().iloc[:10]

phone      1216
time         90
battery      90
screen       87
price        87
card         83
problem      72
product      71
one          69
seller       64
dtype: int64

### We now know that people mention `battery`, `product`, `screen`, etc., but we still don't know in what context they mention these keywords.

### Summary:
 - The most frequently used lemmatised nouns tell us which product features people are talking about in product reviews.
 - To process the review data faster, spaCy lets us switch off parts of the model inference pipeline via the `disable` parameter of `spacy.load()`.

In [26]:
# Load the tagged-words table (one row per word/tag pair).
# NOTE(review): default NA parsing may turn tokens such as "null" or "nan" into NaN — confirm whether that matters here.
df = pd.read_csv("../Dataset/tagged_words.csv")

In [47]:
# Tag frequencies for the word 'he' (matched case-insensitively).
df.loc[df['word'].str.lower() == 'he', 'tag'].value_counts()

PRON    9546
X          2
Name: tag, dtype: int64

In [46]:
# Tag frequencies for the word 'wished' (matched case-insensitively).
df.loc[df['word'].str.lower() == 'wished', 'tag'].value_counts()

VERB    55
Name: tag, dtype: int64

In [49]:
# Tag frequencies for the word 'he' (matched case-insensitively).
df.loc[df['word'].str.lower() == 'he', 'tag'].value_counts()

PRON    9546
X          2
Name: tag, dtype: int64

In [50]:
# Tag frequencies for the word 'was' (matched case-insensitively).
df.loc[df['word'].str.lower() == 'was', 'tag'].value_counts()

VERB    9815
Name: tag, dtype: int64

In [51]:
# Tag frequencies for the word 'rich' (matched case-insensitively).
df.loc[df['word'].str.lower() == 'rich', 'tag'].value_counts()

ADJ     70
NOUN     4
Name: tag, dtype: int64

In [29]:
# Display the tagged-words frame (word and tag columns).
df

Unnamed: 0,word,tag
0,the,DET
1,fulton,NOUN
2,county,NOUN
3,grand,ADJ
4,jury,NOUN
...,...,...
1161187,boucle,NOUN
1161188,dress,NOUN
1161189,was,VERB
1161190,stupefying,VERB


In [43]:
# Tagged-words table and the example sentence to tag word by word.
data = pd.read_csv('../Dataset/tagged_words.csv')
sent = 'I saw him running away'

def get_common_tag(data, word):
    """Return the tag most often assigned to ``word`` in ``data``.

    ``word`` is lower-cased before it is compared with the ``word`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``word`` column and a ``tag`` column.
    word : str
        Token to look up.

    Returns
    -------
    tuple or str
        ``(word, tag)`` with ``word`` in its original casing and its most
        frequent tag, or the string ``"<word> not in data"`` when no row
        matches.
    """
    # A boolean mask replaces the f-string query: a token containing a quote
    # (e.g. "don't") would build an invalid query expression. It also avoids
    # a separate unique() scan before the actual lookup.
    tags = data.loc[data['word'] == word.lower(), 'tag']
    if tags.empty:
        return f"{word} not in data"
    return word, tags.value_counts().index[0]

# Tag each space-separated word of the sentence with its most frequent tag.
for token in sent.split(" "):
    tagged = get_common_tag(data, token)
    print(tagged)

# Tag distribution for the word 'saw'.
data[data['word'] == 'saw']['tag'].value_counts()

('I', 'PRON')
('saw', 'VERB')
('him', 'PRON')
('running', 'VERB')
('away', 'ADV')


VERB    347
NOUN      5
Name: tag, dtype: int64

In [53]:
# Tagged-words table and the second example sentence.
data = pd.read_csv('../Dataset/tagged_words.csv')
s = 'He wished he was rich'
 
def get_common_tag(data, word):
    """Return the tag most often assigned to ``word`` in ``data``.

    ``word`` is lower-cased before it is compared with the ``word`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``word`` column and a ``tag`` column.
    word : str
        Token to look up.

    Returns
    -------
    tuple or str
        ``(word, tag)`` with ``word`` in its original casing and its most
        frequent tag, or the string ``"<word> not in data"`` when no row
        matches.
    """
    # A boolean mask replaces the f-string query: a token containing a quote
    # (e.g. "don't") would build an invalid query expression. It also avoids
    # a separate unique() scan before the actual lookup.
    tags = data.loc[data['word'] == word.lower(), 'tag']
    if tags.empty:
        return f"{word} not in data"
    return word, tags.value_counts().index[0]
 
# Tag each space-separated word of the sentence with its most frequent tag.
for token in s.split(" "):
    tagged = get_common_tag(data, token)
    print(tagged)

('He', 'PRON')
('wished', 'VERB')
('he', 'PRON')
('was', 'VERB')
('rich', 'ADJ')


In [56]:
# Word-by-tag table normalised per column, so each tag column sums to 1.
pd.crosstab(index=df['word'], columns=df['tag'], normalize='columns')

tag,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!,0.638047,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
$.027,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
$.03,0.000000,0.0,0.0,0.0,0.0,0.000015,0.0,0.0,0.0,0.0,0.000000
$.054/mbf,0.000000,0.0,0.0,0.0,0.0,0.000004,0.0,0.0,0.0,0.0,0.000000
$.07,0.000000,0.0,0.0,0.0,0.0,0.000011,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
zurcher,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
zurich,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
zwei,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000722
zworykin,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000


In [58]:
# Word-by-tag table normalised per column: each entry is the share of a tag's
# tokens that are this word.
# Built from `data` (the raw tagged-words frame loaded earlier) rather than from
# `df`, so the cell does not consume its own input and can be re-run safely.
word_tag_share = pd.crosstab(data['word'], data['tag'], normalize='columns')
df = word_tag_share  # later cells look the table up through `df`

In [59]:
# Display the column-normalised word-by-tag table now bound to `df`.
df

tag,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!,0.638047,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
$.027,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
$.03,0.000000,0.0,0.0,0.0,0.0,0.000015,0.0,0.0,0.0,0.0,0.000000
$.054/mbf,0.000000,0.0,0.0,0.0,0.0,0.000004,0.0,0.0,0.0,0.0,0.000000
$.07,0.000000,0.0,0.0,0.0,0.0,0.000011,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
zurcher,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
zurich,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000
zwei,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000722
zworykin,0.000000,0.0,0.0,0.0,0.0,0.000007,0.0,0.0,0.0,0.0,0.000000


In [63]:
# Row of the column-normalised table for the word 'his' (one value per tag).
df.loc['his']

tag
ADJ     0.000000
ADP     0.000000
ADV     0.000000
CONJ    0.000000
DET     0.050774
NOUN    0.000000
NUM     0.000000
PRON    0.000750
PRT     0.000000
VERB    0.000000
X       0.001443
Name: his, dtype: float64

In [64]:
# The PRON value for 'his' shown above (0.000750), rounded to three decimals.
round( 0.000750,3)

0.001