## Importing libraries

In [86]:
import pandas as pd
import numpy as np
from tqdm import tqdm

### Read reviews data

In [87]:
con=open("../data/Samsung.txt",'r', encoding="utf-8")
samsung_reviews=con.read()
con.close()

<img src = "./images/results.png">

<img src = "./images/keywords.png">

### We can use a simple hueristic
 - Find out what were the most common words that appeared before and after each mention of `product feature`
 - Use regex pattern to extract this information

The `battery` was ===> Prefix `keyword` Suffix

![image.png](attachment:9b4e9e8f-7d79-4d31-b370-44726e017a96.png)<img src="./images/regex.png">

In [88]:
df = pd.read_csv('../data/tagged_words.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161192 entries, 0 to 1161191
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   word    1161177 non-null  object
 1   tag     1161192 non-null  object
dtypes: object(2)
memory usage: 17.7+ MB


In [89]:
df[df['word']=='saw'].groupby(by=['tag']).agg({'tag':'count'})

Unnamed: 0_level_0,tag
tag,Unnamed: 1_level_1
NOUN,5
VERB,347


In [90]:
sent = "I saw him running away"

def get_common_tag(data,word):
    if word.lower() in data['word'].unique():
        q = f"word=='{word.lower()}'"
        return word , data.query(q)['tag'].value_counts().head(1).index.tolist()[0]
    else:
        return f"{word} not in data"

for word in sent.split(" "):
    print(get_common_tag(df, word.lower()))
 
df.query("word=='saw'")['tag'].value_counts()

('i', 'PRON')
('saw', 'VERB')
('him', 'PRON')
('running', 'VERB')
('away', 'ADV')


VERB    347
NOUN      5
Name: tag, dtype: int64

In [91]:
sent= "He wished he was rich"
for word in sent.split():
    print(get_common_tag(df, word.lower()))

('he', 'PRON')
('wished', 'VERB')
('he', 'PRON')
('was', 'VERB')
('rich', 'ADJ')


In [92]:
df.head()

Unnamed: 0,word,tag
0,the,DET
1,fulton,NOUN
2,county,NOUN
3,grand,ADJ
4,jury,NOUN


In [93]:
df['tag'].value_counts().index.tolist()

['NOUN', 'ADJ', 'VERB', 'ADP', 'DET', 'ADV', 'PRON', 'CONJ', 'PRT', 'NUM', 'X']

In [94]:
df_summary = df.groupby(by=['word','tag']).agg({'tag':'count'}).rename(columns={'tag':'count'}).reset_index()
df_summary.head() # sort_values(by='count', ascending=False)

Unnamed: 0,word,tag,count
0,!,ADJ,147565
1,$.027,NOUN,2
2,$.03,NOUN,4
3,$.054/mbf,NOUN,1
4,$.07,NOUN,3


In [95]:
df_word_tag = pd.pivot_table(data=df_summary, index='word', columns='tag', values='count', aggfunc='count').fillna(0)
df_word_tag

tag,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$.027,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
$.03,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
$.054/mbf,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
$.07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
zurcher,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
zurich,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
zwei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
zworykin,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [96]:
for col in df_word_tag.columns:
    df_word_tag[col] = (df_word_tag[col]/df_word_tag[col].sum())

df_word_tag

tag,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
!,0.000124,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
$.027,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
$.03,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
$.054/mbf,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
$.07,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
zurcher,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
zurich,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000
zwei,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.001192
zworykin,0.000000,0.0,0.0,0.0,0.0,0.000033,0.0,0.0,0.0,0.0,0.000000


In [97]:
for col in df_word_tag.columns:
    print(col, df_word_tag[col].sum())

ADJ 0.9999999999999999
ADP 1.0
ADV 1.0
CONJ 1.0
DET 0.9999999999999999
NOUN 0.9999999999999999
NUM 1.0
PRON 1.0
PRT 0.9999999999999999
VERB 1.0
X 1.0


In [98]:
df_word_tag.loc["his", :] #df_word_tag['word']=='his']

tag
ADJ     0.000000
ADP     0.000000
ADV     0.000000
CONJ    0.000000
DET     0.017544
NOUN    0.000000
NUM     0.000000
PRON    0.015385
PRT     0.000000
VERB    0.000000
X       0.001192
Name: his, dtype: float64

In [99]:
df[df['word']=='his']

Unnamed: 0,word,tag
776,his,DET
789,his,DET
850,his,DET
857,his,DET
932,his,DET
...,...,...
1160771,his,DET
1160804,his,DET
1160837,his,DET
1160896,his,DET


In [112]:
df.shape

(1161192, 2)

In [116]:
df[df['tag']=='PRON']['tag'].count()

49334

In [114]:
df[(df['word']=='his') & (df['tag']=='PRON')]['tag'].count()

37

In [123]:
37/49334, round(37/49334, 3)

(0.0007499898650018244, 0.001)

In [124]:
df_ = pd.crosstab(df['word'], df['tag'], normalize=True)
'''for col in df_.columns:
    df_[col] = (df_[col]/df_[col].sum())'''
print(df_.loc['his', :])
type(df_), df_.shape, df_.index, df_


tag
ADJ     0.000000
ADP     0.000000
ADV     0.000000
CONJ    0.000000
DET     0.005991
NOUN    0.000000
NUM     0.000000
PRON    0.000032
PRT     0.000000
VERB    0.000000
X       0.000002
Name: his, dtype: float64


(pandas.core.frame.DataFrame,
 (49806, 11),
 Index(['!', '$.027', '$.03', '$.054/mbf', '$.07', '$.07/cwt', '$.076', '$.09',
        '$.10-a-minute', '$.105',
        ...
        'zorrillas', 'zounds', 'zu', 'zubkovskaya', 'zur', 'zurcher', 'zurich',
        'zwei', 'zworykin', '{0,t}'],
       dtype='object', name='word', length=49806),
 tag             ADJ  ADP  ADV  CONJ  DET          NOUN  NUM  PRON  PRT  VERB  \
 word                                                                           
 !          0.127082  0.0  0.0   0.0  0.0  0.000000e+00  0.0   0.0  0.0   0.0   
 $.027      0.000000  0.0  0.0   0.0  0.0  1.722390e-06  0.0   0.0  0.0   0.0   
 $.03       0.000000  0.0  0.0   0.0  0.0  3.444781e-06  0.0   0.0  0.0   0.0   
 $.054/mbf  0.000000  0.0  0.0   0.0  0.0  8.611951e-07  0.0   0.0  0.0   0.0   
 $.07       0.000000  0.0  0.0   0.0  0.0  2.583585e-06  0.0   0.0  0.0   0.0   
 ...             ...  ...  ...   ...  ...           ...  ...   ...  ...   ...   
 zurcher    0

In [125]:
df_ = pd.crosstab(df['word'], df['tag']) 
for col in df_.columns:
    df_[col] = (df_[col]/df_[col].sum())
print(df_.loc['his', :])

tag
ADJ     0.000000
ADP     0.000000
ADV     0.000000
CONJ    0.000000
DET     0.050774
NOUN    0.000000
NUM     0.000000
PRON    0.000750
PRT     0.000000
VERB    0.000000
X       0.001443
Name: his, dtype: float64


In [129]:
round(0.000750,3)

0.001

In [130]:
emmission_matrix = pd.crosstab(df['word'],df['tag'],normalize='columns')

word = 'his'
emmission_matrix.loc[word][emmission_matrix.loc[word]>0].round(3)
emmission_matrix['PRON'].loc['his'].round(3)

0.001

In [128]:
emmission_matrix.loc[word][emmission_matrix.loc[word]>0]

tag
DET     0.050774
PRON    0.000750
X       0.001443
Name: his, dtype: float64

In [132]:
df[100:110]

Unnamed: 0,word,tag
100,allen,NOUN
101,jr.,NOUN
102,!,ADJ
103,!,ADJ
104,only,ADV
105,a,DET
106,relative,ADJ
107,handful,NOUN
108,of,ADP
109,such,ADJ


#### Extract all the prefixes and suffixes of `battery`

### This doesn't make much sense as these are commonly used words. Let's remove `stopwords` and see what we get

<a href = "https://gist.github.com/sebleier/554280">Get Stop Words</a>

### Lets pretty print

### Lets put all this logic in a function

## Summary:
    - Simple hueristics sometime are very usefull
    - Regex can be life saviours
    - Don't forget to use simple text processing while trying to solve a non-trival problem