In [12]:
import numpy as np
import pandas as pd
import os, re
from nltk.corpus import stopwords

In [4]:
import spacy
model = spacy.load("en_core_web_sm", disable=['parser','ner']) # loading pre-trained model

In [5]:
tokens = model("upGrad is teaching Data Science courses to working professionals.")
for token in tokens:
    print([token.text, token.lemma_, token.pos_, token.tag_, token.is_stop, token.dep_], sep="\t") # lemma - lemmatize, is_stop - stop word

['upGrad', 'upgrad', 'NOUN', 'NN', False, '']
['is', 'be', 'AUX', 'VBZ', True, '']
['teaching', 'teach', 'VERB', 'VBG', False, '']
['Data', 'Data', 'PROPN', 'NNP', False, '']
['Science', 'Science', 'PROPN', 'NNP', False, '']
['courses', 'course', 'NOUN', 'NNS', False, '']
['to', 'to', 'ADP', 'IN', True, '']
['working', 'work', 'VERB', 'VBG', False, '']
['professionals', 'professional', 'NOUN', 'NNS', False, '']
['.', '.', 'PUNCT', '.', False, '']


---
<h3><b> Product Feature Extraction from Customer Reviews</b><br></h3>

In [8]:
pos, lemma, text = [], [], []

with open("F:\programming\python-development\python-questions\Database\Samsung_reviews.txt", 'rb') as file:
    data = file.read().decode('utf-8').split("\n")
    file.close()

trial_review = data[0]
for tokens in model(trial_review):
    text.append(tokens.text)
    lemma.append(tokens.lemma_)
    pos.append(tokens.pos_)
    
model_table = pd.DataFrame({'text': text, 'lemma': lemma, 'pos': pos})
model_table.head(5)

Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,LUCKY,NOUN
4,to,to,PART


In [26]:
# most frequently occuring num in review 1
model_table[model_table['pos'] == 'NOUN']['lemma'].value_counts().head(5)

lemma
phone    3
one      2
LUCKY    1
line     1
year     1
Name: count, dtype: int64

In [27]:
# track progress while applying the same hypothesis on larger set of reviews
from tqdm import tqdm
nouns = []
for review in tqdm(data[0:1000]):
    doc = model(review)
    for token in doc:
        if token.pos_ == 'NOUN':
            nouns.append(token.lemma_.lower())
pd.Series(nouns).value_counts().head(5)

100%|██████████| 1000/1000 [00:07<00:00, 129.57it/s]


phone      1216
time         90
battery      90
screen       87
price        87
Name: count, dtype: int64

In [11]:
# understanding cotext of the noun tags
pattern = re.compile("\w+\sbattery\s\w+")
file_samsung = open("F:\programming\python-development\python-questions\Database\Samsung_reviews.txt", 'rb').read().decode('utf-8')
pattern_matched = re.findall(pattern, file_samsung)
type(pattern_matched)
pattern_matched[0:5]

['that battery life',
 'The battery was',
 'great battery life',
 'removable battery or',
 'the battery in']

In [29]:
prefixes, suffixes = [], []
for i in pattern_matched:
    l = i.split(" ")
    prefixes.append(l[0].lower())
    suffixes.append(l[-1].lower())

stop = stopwords.words('english')
prefixes, suffixes = [p for p in prefixes if p not in stop], [s for s in suffixes if s not in stop]
prefixes, suffixes = pd.Series(prefixes).value_counts().head(5).index, pd.Series(suffixes).value_counts().head(5).index

pre_suf = pd.DataFrame({'prefixes': prefixes,'keywords':['battery']*len(prefixes), 'suffixes': suffixes})
pre_suf.head()

Unnamed: 0,prefixes,keywords,suffixes
0,good,battery,life
1,great,battery,lasts
2,long,battery,last
3,new,battery,runs
4,removable,battery,drains


In [15]:
data = pd.read_csv("./Database/tagged_words.csv")
sent = "He wished he was rich"

def get_common_tag(data,word):
    if word.lower() in data['word'].unique():
        q = f"word=='{word.lower()}'"
        return word , data.query(q)['tag'].value_counts().head().index.tolist()[0]
    else:
        return f"{word} not in data"

for word in sent.split(" "):
    print(get_common_tag(data,word))
 
data.query("word=='saw'")['tag'].value_counts().head().index.tolist()[0]

('He', 'PRON')
('wished', 'VERB')
('he', 'PRON')
('was', 'VERB')
('rich', 'ADJ')


'VERB'

In [18]:
emission_matrix = pd.crosstab(data['word'], data['tag'], normalize='columns')
emission_matrix.loc['his'][emission_matrix.loc['his']>0].round(3)

tag
DET     0.051
PRON    0.001
X       0.001
Name: his, dtype: float64

---
<h3><b>Parsing in Python

In [19]:
from spacy import displacy
parser = spacy.load("en_core_web_sm", disable=['ner']) # loading pre-trained model

active = ['Hens lay eggs.',
'Birds build nests.',
'The batter hit the ball.',
'The computer transmitted a copy of the manual']

passive = ['Eggs are laid by hens',
'Nests are built by birds',
'The ball was hit by the batter',
'A copy of the manual was transmitted by the computer.']

In [20]:
# trial_line = parser(active[0])
trial_line = parser("SHE IS A FOOL")
for token in trial_line:
    print(token.text," ", token.dep_)

SHE   nsubj
IS   ROOT
A   det
FOOL   attr


In [37]:
# nltk library to print the constituent parse tree
from nltk import Tree
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children]) # node.orth_ refers to the word
    else:
        return node.orth_

[to_nltk_tree(sent.root).pretty_print() for sent in parser("The computer transmitted a copy of the manual").sents]

         transmitted            
    __________|_______           
   |                 copy       
   |           _______|_____     
   |          |             of  
   |          |             |    
computer      |           manual
   |          |             |    
  The         a            the  



[None]

In [24]:
displacy.render(trial_line, style='dep', options={'distance': 150, 'compact': True})

In [30]:
# for sentence in active:
#     displacy.render(parser(sentence), style='dep', options={'distance': 90, 'compact':True})

# for sentence in passive:
#     displacy.render(parser(sentence), style='dep', options={'distance': 90, 'compact':True})

# to identify the passive sentences
from spacy.matcher import Matcher

In [31]:
# matching dependency nsubjpass
passive_rule = [{'DEP':{'IN':['nsubjpass', 'auxpass', 'csubjpass', 'agent']}}]
matcher = Matcher(parser.vocab)
matcher.add('Rule', [passive_rule])

def is_passive(sentence):
    doc = parser(sentence)
    matches = matcher(doc)
    if matches:
        return True
    else:
        return False

for sentence in passive:
    print(is_passive(sentence))

True
True
True
True


---
<h3><b>Name Entity Recognition & CRF

In [33]:
ner = spacy.load("en_core_web_sm")
doc = "Dr.Sumit is an adjunct faculty at UpGrad."
processed_doc = ner(doc)
for token in processed_doc:
    print(token.text, token.pos_, token.ent_iob_, token.ent_type_)
print("--------------------------")
for ents in processed_doc.ents:
    print(ents.text, ents.start_char, ents.end_char, ents.label_)   

Dr. PROPN O 
Sumit PROPN B PERSON
is AUX O 
an DET O 
adjunct ADJ O 
faculty NOUN O 
at ADP O 
UpGrad PROPN B ORG
. PUNCT O 
--------------------------
Sumit 3 8 PERSON
UpGrad 34 40 ORG


In [58]:
# Anonymizing of the data using spacy

email = ('Dear Family, Jose Luis and I have changed our dates, we are '
         'going to come to Aspen on the 23rd of December and leave on the '
         '30th of December. We would like to stay in the front bedroom of '
         'the Aspen Cottage so that Mark, Natalie and Zachary can stay in '
         'the guest cottage. Please let me know if there are any problems '
         'with this. If I do not hear anything, I will assume this is all '
         'o.k. with you.'
         'Love, Liz')

processed_email = ner(email) 

In [62]:
anonymized_email = list(email)
for ent in processed_email.ents:
  if(ent.label_ == 'PERSON') or ent.label_ == 'DATE':
    for char_pos in range(ent.start_char, ent.end_char):
      anonymized_email[char_pos] = '*'

"".join(anonymized_email)

'Dear Family, ********* and I have changed our dates, we are going to come to Aspen on ******************** and leave on ********************. We would like to stay in the front bedroom of the Aspen Cottage so that ****, ******* and ******* can stay in the guest cottage. Please let me know if there are any problems with this. If I do not hear anything, I will assume this is all o.k. with you.Love, ***'

In [64]:
# Custom Random Fields CRF
label_test = open("F:\programming\python-development\python-questions\Database\\nlp_data\label_test.txt", 'rb').read().decode('utf-8').split("\n")
label_train = open("F:\programming\python-development\python-questions\Database\\nlp_data\label_train.txt", 'rb').read().decode('utf-8').split("\n")
sent_test = open("F:\programming\python-development\python-questions\Database\\nlp_data\sent_test.txt", 'rb').read().decode('utf-8').split("\n")
sent_train = open("F:\programming\python-development\python-questions\Database\\nlp_data\sent_train.txt", 'rb').read().decode('utf-8').split("\n")

In [68]:
# !pip install pycrf
# !pip install sklearn-crfsuite

- f1 = input word is in lower case;
- f2 = last 3 characters of word;
- f3 = last 2 characers of word;
- f4 = 1; if the word is in uppercase, 0 otherwise;
- f5 = 1; if word is a number; otherwise, 0
- f6= 1; if the word starts with a capital letter; otherwise, 0

In [2]:
def get_features_for_word(sentence, pos):
    word = sentence[pos]
    features = [
    'word.lower = ' + word.lower(), # serves as word id
    'word[-3:] = ' + word[-3:],     # last three characters
    'word[-2:] = ' + word[-2:],     # last two characters
    'word.isupper = %s' % word.isupper(),  # is the word in all uppercase
    'word.isdigit = %s' % word.isdigit(),  # is the word a number
    'words.startsWithCapital = %s' % word[0].isupper() # is the word starting with a capital letter
    ] 

    if(pos > 0):
        prev_word = sentence[pos-1]
        features.extend([
            'prev_word.lower = ' + prev_word.lower(),
            'prev_word.isupper = %s' % prev_word.isupper(),
            'prev_word.isdigit = %s' % prev_word.isdigit(),
            'prev_word.startsWithCapital = %s' % prev_word[0].isupper()
        ])
    else:
        features.append('BEG') # feature to track begin of sentence 

    if(pos == len(sentence)-1):
        features.append('END') # feature to track end of sentence

    return features

def get_features_for_sentence(sentence):
    sentence_list = sentence.split()
    return [get_features_for_word(sentence, i) for i in range(len(sentence_list))]

def get_labels_for_sentence(labels):
    return labels.split()