## Entity Recognition

In [7]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [8]:
import re
from bs4 import BeautifulSoup

def clean_raw_text(raw_text):
    soup = BeautifulSoup(raw_text, 'html.parser')
    clean_soup = repr(soup.get_text().replace('\n','.'))
    clean_soup = re.sub(r'\\x[0-9A-Fa-f]{2}', ' ', clean_soup)
    clean_soup = re.sub(r'\\u[0-9A-Fa-f]{4}', ' ', clean_soup)
    clean_soup = re.sub(r'[\.] +', '.', clean_soup)
    clean_soup = re.sub(r'\.+', '. ', clean_soup)
    clean_soup = re.sub(' +', ' ', clean_soup)
    return clean_soup

def remove_entity_words(raw_text):
    input_text = clean_raw_text(raw_text)
    input_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '_url_page_', input_text)
    input_text = re.sub(r'[\w\.-]+@[\w\.-]+', '_email_replaced_', input_text)
    tags = nlp(input_text)
    for entity in tags.ents:
        tag = entity.label_
        text = entity.text
        if tag in ['PERSON', 'ORG', 'EVENT', 'GPE', 'NORP', 'PRODUCT', 'MONEY']:
            input_text = input_text.replace(text, '_'+tag+'_')
    return input_text

## Model Utils

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier  
import numpy as np

def train_mlp(X,y):
    mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=1000)  
    mlp.fit(X, y) 
    return mlp
    
def train_svm(X, y):
    svm = SVC(C=1000000.0, gamma="auto", kernel='rbf')
    svm.fit(X, y)
    return svm

def load_docs(csv_file,delimiter=','):
    data = pd.read_csv(csv_file, delimiter=delimiter) 
    tags = np.array(data.is_order_email)
    labels = np.array(data.label)
    docs = []
    for i in range(len(tags)):
        docs.append((tags[i],remove_entity_words(labels[i])))
#         docs.append((tags[i],clean_raw_text(labels[i])))
    return docs

def create_tfidf_training_data(docs,min_df=5):
    # Create the training data class labels
    y = [d[0] for d in docs]

    # Create the document corpus list
    corpus = [d[1] for d in docs]

    # Create the TF-IDF vectoriser and transform the corpus
    vectorizer = TfidfVectorizer(min_df=min_df, token_pattern=r'(?u)\b\w*[a-zA-Z]\w*\b')
    X = vectorizer.fit_transform(corpus)
    return X, y, vectorizer

def tfidf_transform_data(docs, vectorizer):
        # Create the training data class labels
    y = [d[0] for d in docs]

    # Create the document corpus list
    corpus = [d[1] for d in docs]

    # Create the TF-IDF vectoriser and transform the corpus
    X = vectorizer.transform(corpus)
    return X, y


## Load Model

In [144]:
import pickle
vectorizer = pickle.load(open("model/tfidf.pkl", "rb"))
model = pickle.load(open("model/svm_model.pkl", "rb"))

## Test Model

In [42]:
def split(list_text, mark):
    res = []
    for text in list_text:
        res.extend(text.split(mark))
    return res

In [153]:

def is_text_order_notif(raw_text):
    raw_text = remove_entity_words(raw_text)
    raws = [raw_text]
#     raws = split(raws, '-')
#     raws = split(raws, ',')
#     raws = split(raws, '!')
#     raws = split(raws, '?')
#     raws = split(raws, '.')
    print(raws)
    test_words = []
    for r in raws:
        test_words.append((0,r))
    x, y = tfidf_transform_data(test_words, vectorizer)
    pred = model.predict(x)
    print(pred)
    for res in pred:
        if res == 1:
            return True
    return False

In [150]:
raw_text = "Google Security alert for your linked Google Account Sun 4/12 Your account ray-cap@hotmail.com is listed as the recovery email for raymond.djajalaksana@gmail.com. Don't recognize this account? Click here New sign-in to your linked account raymond.djajalaksana@gmail.com Your Google Account was just signed in to fr"
raw_text = "Domino's Pizza Singapore - No Reply Domino's Pizza Online Ordering - Email Confirmation Wed 6:01 PM Domino's Pizza Singapore dominos.com.sg Order Confirmation Dear MR RAYMOND DJAJALAKSANA Thank you for your order! To practise social distancing, we are now implementing Zero Contact Delivery and Takeaway. You have ordered the followi"

is_text_order_notif(raw_text)

['"Domino\'s _ORG_ ', " No Reply Domino's Pizza Online Ordering ", ' Email Confirmation Wed 6:01 _ORG_ _ORG_ dominos', ' com', ' sg Order Confirmation Dear MR RAYMOND DJAJALAKSANA Thank you for your order', ' To practise social distancing', ' we are now implementing Zero Contact Delivery and _GPE_', ' You have ordered the followi"']
[0 0 0 0 1 0 0 0]


True

## Building NN / SVM Model

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
base_path = 'raw_data/{}'
docs = load_docs(base_path.format("order_search.csv"))
for additional_doc in load_docs(base_path.format("all_email_1.csv"), delimiter='|'):
    docs.append(additional_doc)
for additional_doc in load_docs(base_path.format("all_email_2.csv"), delimiter='|'):
    docs.append(additional_doc)
for additional_doc in load_docs(base_path.format("all_email_3.csv"), delimiter='|'):
    docs.append(additional_doc)

In [135]:
x, y, vector = create_tfidf_training_data(docs, min_df=7)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=12)

In [141]:
model = train_mlp(X_train, y_train)
pred = model.predict(X_test)
print(model.score(X_test, y_test))
print(confusion_matrix(pred, y_test))

model = train_svm(X_train, y_train)
pred = model.predict(X_test)
print(model.score(X_test, y_test))
print(confusion_matrix(pred, y_test))


0.9572953736654805
[[244  10]
 [  2  25]]
0.9644128113879004
[[244   8]
 [  2  27]]


In [140]:
print(len(vector.get_feature_names()))
print(vector.get_feature_names())

558
['00am', '00pm', '30am', '__org_', '_email_replaced_', '_event_', '_gpe_', '_money_', '_money__money_', '_norp_', '_org_', '_person_', '_product_', '_url_page_', 'a', 'about', 'above', 'access', 'account', 'add', 'added', 'address', 'after', 'alert', 'all', 'also', 'alumni', 'am', 'amazon', 'amount', 'an', 'and', 'announcement', 'another', 'any', 'anyone', 'app', 'apply', 'april', 'are', 'around', 'as', 'at', 'attachments', 'aug', 'august', 'automated', 'available', 'away', 'awesome', 'b', 'back', 'based', 'be', 'because', 'been', 'before', 'being', 'below', 'best', 'better', 'beverages', 'big_person_', 'bill', 'book', 'booking', 'break', 'browse', 'browser', 'bundle', 'by', 'call', 'can', 'cancelled', 'cap', 'card', 'career', 'catalog', 'cdjapan', 'celebrate', 'centre', 'change', 'changes', 'check', 'chicken', 'choosing', 'click', 'code', 'coffee', 'collection', 'com', 'combed', 'come', 'coming', 'complete', 'conditions', 'confirmation', 'confirmed', 'contact', 'contains', 'contra

## Save Model

In [143]:
import pickle

pickle.dump(model, open('model/svm_model.pkl', 'wb'))
pickle.dump(vector, open("model/tfidf.pkl", "wb"))

## RPA-Bot for email scrapping

In [154]:
email_account = ''
email_pwd = ''

In [1]:
import tagui as t
import numpy as np

In [32]:
def type_into(xpath, type_cmd):
    wait_element(xpath)
    t.type(xpath, type_cmd)

def click(xpath):
    wait_element(xpath)
    t.click(xpath)
    
def present(xpath):
    return wait_element(xpath)

def read(xpath):
    wait_element(xpath)
    return t.read(xpath)

def wait_element(xpath):
    for i in range(10):
        if t.present(xpath):
            return True
        t.wait(1)
    return False
    
def login_outlook(account, password):
    t.url('https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&ct=1586073207&rver=7.0.6737.0&wp=MBI_SSL&wreply=https%3a%2f%2foutlook.live.com%2fowa%2f%3fnlp%3d1%26RpsCsrfState%3d6590c65e-2e3f-b1ed-bda9-2c5e901a9000&id=292841&aadredir=1&whr=outlook.sg&CBCXT=out&lw=1&fl=dob%2cflname%2cwld&cobrandid=90015')
    type_into('//*[@type="email"]', account + '[enter]')
    type_into('//*[@name="passwd"]', password + '[enter]')
    
def login_gmail(account,password):
    login_stackoverflow(account,password)
    t.url('https://www.gmail.com')
    
def login_stackoverflow(account, password):
    t.url('https://stackoverflow.com/users/login')
    click('//button[@data-provider="google"]')
    if wait_element('//div[@data-identifier="{}"]'.format(account)):
        click('//div[@data-identifier="{}"]'.format(account))
    else:
        c = t.count('//div[@jsslot=""]//li')
        click('(//div[@jsslot=""]//li)[{}]'.format(c))
        type_into('//*[@type="email"]', account + '[enter]')
    type_into('//*[@name="passwd"]', password + '[enter]')
    return

def search_keyword(keyword):
    clear_button  = '//button[@aria-label="Exit Search"]'
    search_button = '//button[@aria-label="Search"]'
    if t.present(clear_button):
        click(clear_button)
    type_into('//input[contains(@aria-label, "Search")]', keyword )
    click(search_button)

def extract_outlook_email_headline(limit=100):
    list_item = []
    listbox_xpath = '//div[@role="listbox"]'
    if limit == 0 :
        limit = 1000
    for i in range(1, limit+1):
        item_xpath = '(' + listbox_xpath + '//div[@role="option"])[{}]'.format(i)
        if not present(item_xpath):
            print('email {} is not present'.format(i))
            break
        email_id = t.read(item_xpath + '/@data-convid')
        email_headline = t.read(item_xpath + '/@aria-label')
        list_item.append((email_id, email_headline))
    return np.array(list_item)
    
def extract_google_email_headline(limit=10):
    list_item = []
    listbox_xpath = '(//table[@class="F cf zt"])[2]'
    if limit == 0 :
        limit = 1000
    i = 1
    while len(list_item) < limit:
        item_xpath = '(' + listbox_xpath + '//tr)[{}]//td[@class="xY a4W"]'.format(i)
        if not present(item_xpath):
            older_button = '//div[@data-tooltip="Older"]'
            click(older_button)
            i = 1
            continue
        t.hover(item_xpath)
        email_id = item_xpath
        email_headline = clean_raw_text(read(item_xpath))
        header_text = read('(' + listbox_xpath + '//tr)[{}]//div[@class="afn"]'.format(i))
        list_item.append((email_id, email_headline, is_gmail_unread(header_text)))
        i += 1
    return list_item


def get_email_content(i):
    listbox_xpath = '//div[@role="listbox"]'
    item_xpath = '(' + listbox_xpath + '//div[@role="option"])[{}]'.format(i)
    click(item_xpath)
    clean_html_text = clean_raw_text(read('//div[@class="wide-content-host"]'))
    click('//button[@aria-label="More mail actions"]')
    click('//button[@name="Mark as unread"]')
    t.wait(1)
    click('//button[@aria-label="Close"]')
    return clean_html_text

def get_gmail_content(item_xpath, is_unread):
    click(item_xpath)
    content = t.read('//div[@role="listitem"]')
    t.hover('//div[contains(@title,"Mark as unread")]')
    click('(//div[contains(@data-tooltip,"Mark as unread")])[2]')
    return content
    
def is_gmail_unread(header_text):
    r = header_text.split('unread,')
    return len(r) == 2 and r[0] == ''

In [2]:
t.init(visual_automation = False, chrome_browser = True)

True

In [155]:
login_outlook(email_account, email_pwd)
t.wait(3)

[RPA][ERROR] - cannot find //*[@type="email"]
[RPA][ERROR] - cannot find //*[@name="passwd"]


True

In [33]:
extract_google_email_headline(limit=10)

[('((//table[@class="F cf zt"])[2]//tr)[1]//td[@class="xY a4W"]',
  "'FYI: Insider Purchases and Sales in Securities You Hold - The following list summarizes recently published information regarding corporate insider purchases or sales for securities you hold in your account. This data is presented for informational purposes'",
  False),
 ('((//table[@class="F cf zt"])[2]//tr)[2]//td[@class="xY a4W"]',
  "'Raymond Adalah Kabar Baik - Kiranya warta ini menolong kita untuk semakin kuat dan berpengharapan didalam Yesus Kristus Tuhan kita Web Version Sabtu, 25 April 2020 Sapaan Lembaga Alkitab Indonesia : Raymond, Kamu Adalah Kabar'",
  True),
 ('((//table[@class="F cf zt"])[2]//tr)[3]//td[@class="xY a4W"]',
  "'Here is your One-Time Password (OTP) - Here is your One-Time Password (OTP) Dear Valued Customer Please use this One-Time Password (OTP) - RAU-234783 – to log in to your account. This OTP, requested on 24 April 2020 at 02:13:56 PM, will'",
  False),
 ('((//table[@class="F cf zt"])[

In [31]:
get_gmail_content('((//table[@class="F cf zt"])[2]//tr)[2]//td[@class="xY a4W"]', False)

'Lembaga Alkitab Indonesia info@alkitab.or.id via\xa0bounce.s7.exacttarget.com\xa0Sat, Apr 25, 11:20 AM (1 day ago)to me \n\n  \n\n    \n    \n    \n    \n    \n  \n  \nKiranya warta ini menolong kita untuk semakin kuat dan berpengharapan didalam Yesus Kristus Tuhan kita\n    \n    \n      \n        \n          \n \n    \n    \n        \n        \n            \n                \n            Web Version\n                \n            \n        \n        \n      \n\n        \n      \n      \n        \n          \n            \n              \n                \n                  \n                    \n                      \n                        \n                          \n                          \n                            \n                            \n                               \n                                \n                                  \n                                    \n                                        \n                                          \n

In [15]:
clean_raw_text(t.read('//div[@role="listitem"]'))

'\'Lembaga Alkitab Indonesia info@alkitab. or. id via bounce. s7. exacttarget. com Apr 25, 2020, 11:20 AM (1 day ago)to me . Kiranya warta ini menolong kita untuk semakin kuat dan berpengharapan didalam Yesus Kristus Tuhan kita. Web Version. \\t. \\tSabtu, 25 April 2020. \\t . \\tSapaan Lembaga Alkitab Indonesia : . \\tRaymond, Kamu Adalah Kabar Baik. \\t . \\t"Le (Nak), jangan lupa sering-sering menengok Mbah Puteri (Nenek) yang sudah hidup sendiri di Klaten," pesan Ibu saya sesudah kakek saya wafat di tahun 1985-an. Tahun 1982-1989 saya kuliah di Yogyakarta sekitar 30 km dari Klaten dan harus indekos di dekat kampus. Pesan Ibu saya . Selengkapnya. God is Our Refuge. \\t. \\tKesaksian Daud dalam Alkitab, “Biarlah aku berlindung di bawah naungan sayap-Mu. " (Mazmur 61:5). Daud mengungkapkan kekuatirannya. , dan hal ini sebenarnya bertujuan untuk menguatkan dirinya dari rasa khawatir, takut, dan putus asa. Hanya kepada Tuhan sajalah perlindungan itu didapatkannya. \\t . Tonton Video. \\

In [135]:
get_email_content(1)

"' Getting too much email? Unsubscribe | Manage subscriptionsJSJobStreet. com Singapore Sat 4/18/2020 10:58 AM You . View in web browser. Dear raymond,. We understand that it can be overwhelming for candidates to deal with so much uncertainty in the world right now. That is why we have put together a COVID-19 resource hub to connect you with. Jobs. Immediate job vacancies to supplement income. Featured Employers. Companies hiring amid the COVID-19. Resources. Insights and tips to help you navigate through this challenging period. Remember, “Tough times don’t last, but tough people do”. Together, we can overcome!. #SGUnitedJobs. Take me there. Sincerely,. JobStreet Singapore. Connect with JobStreet. Private Policy . Terms & Conditions. JobStreet Singapore. 10 Anson Road, #05-20 International Plaza, Singapore 079903. * This is not an unsolicited message. To . unsubscribe please . click here'"

In [50]:
search_keyword('order')

In [156]:
outlook_emails = extract_outlook_email_headline(limit=100)

In [22]:
gmails = extract_google_email_headline(limit=150)

[RPA][ERROR] - cannot find ((//table[@class="F cf zt"])[2]//tr)[51]//td[@class="xY a4W"]
[RPA][ERROR] - cannot find ((//table[@class="F cf zt"])[2]//tr)[51]//td[@class="xY a4W"]


In [34]:
t.close()

True

## Dump dataset

In [30]:
import csv

emails = outlook_emails

with open('all_email_3.csv', mode='w' ,newline='', encoding='utf-8') as csv_file:
    fieldnames = ['id', 'label']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter='|')

    writer.writeheader()
    for email in emails:
        writer.writerow({'id': email[0], 'label': email[1]})

In [152]:
t.close()

True

### Try with NLTK

In [13]:
import nltk
from nltk.tag.stanford import StanfordNERTagger
import os
java_path = "C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"
os.environ['JAVAHOME'] = java_path

In [170]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [14]:
# https://nlp.stanford.edu/software/CRF-NER.html#Download
# st = StanfordNERTagger('stanford-ner/classifiers/english.muc.7class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')
st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', 'stanford-ner/stanford-ner.jar')

In [None]:
input_text = clean_raw_text(t.read('//div[@class="wide-content-host"]'))
for sent in nltk.sent_tokenize(input_text):
    tokens = nltk.tokenize.word_tokenize(sent)
    tags = st.tag(tokens)
    for tag in tags:
        print(tag)

### Template extraction based on text similarity

In [127]:
order_status_words = [
    'your order is being processed.',
    'your order has been confirmed.',
    'your order has been shipped.',
    '_ORG_ has received your order.',
    'your order has been ackowledged by _ORG_',
    'your order has been ackowledged by _PERSON_',
    'we have received your order.',
    'we are glad to inform you that your order _GPE_ has been fully delivered',
    'Your Order _GPE_ has been placed on _DATE_.',
    'Item from your order _GPE_ has been Shipped',
    'Item(s) from your order _GPE_ has been Shipped',
    'Your order will be shipped in _NUMBER_ business days',
    'your package is on the way',
    'your package will arrive soon',
    'your package has shipped out'
]



In [157]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def get_arg_max(sim_metrics):
    return np.argmax(np.amax(sim_metrics, axis=1))

def get_order_number_sentence(raw_doc):
    doc = clean_raw_text(raw_doc)
    raw_texts_0 = nltk.sent_tokenize(doc)
    raw_texts = []
    for raw_text in raw_texts_0:
        if len(raw_text.split(' ')) < 3:
            continue
        raw_texts.append(raw_text)
    masked_raw_texts = []
    for raw_text in (raw_texts):
        masked_raw_texts.append(remove_entity_words(raw_text))
    sim_metrics = np.zeros((len(masked_raw_texts),len(order_status_words)))
    for i in range(len(masked_raw_texts)):
        for j in range(len(order_status_words)):
            text = masked_raw_texts[i]
            sentence = order_status_words[j]
            sim_metrics[i,j] = get_cosine_sim(text, sentence)[0,1]
    return raw_texts[get_arg_max(sim_metrics)]

In [158]:
get_order_number_sentence(t.read('//div[@class="wide-content-host"]'))

"Kim's Catering Pte Ltd has acknowledged your order -."

In [136]:
import json
json.load("model/order_status_sample.json")

AttributeError: 'str' object has no attribute 'read'

In [137]:
import json
with open('model/order_status_sample.json') as json_file:
    data = json.load(json_file)
    print(data)

['hh', 'bb']


In [148]:
class_str = read('(//div[contains(@class,"showHoverActionsOnHover")])[1]/@class')

In [150]:
class_str = read('(//div[contains(@class,"showHoverActionsOnHover")])[2]/@class')

In [142]:
for i in range(50):
    t.hover('(//div[contains(@class,"showHoverActionsOnHover")])[{}]'.format(i+1))
    t.wait(1)

In [143]:
get_email_content(1)

"'JSJobStreet. com Singapore Sat 4/18/2020 10:58 AM You . View in web browser. Dear raymond,. We understand that it can be overwhelming for candidates to deal with so much uncertainty in the world right now. That is why we have put together a COVID-19 resource hub to connect you with. Jobs. Immediate job vacancies to supplement income. Featured Employers. Companies hiring amid the COVID-19. Resources. Insights and tips to help you navigate through this challenging period. Remember, “Tough times don’t last, but tough people do”. Together, we can overcome!. #SGUnitedJobs. Take me there. Sincerely,. JobStreet Singapore. Connect with JobStreet. Private Policy . Terms & Conditions. JobStreet Singapore. 10 Anson Road, #05-20 International Plaza, Singapore 079903. * This is not an unsolicited message. To . unsubscribe please . click here'"

In [151]:
def is_email_unread(class_str):
    class_n = class_str.split('showHoverActionsOnHover')
    if len(class_n) <2:
        return False
    if class_n[1] != '' :
        return True
    return False
is_email_unread(class_str)

False