## This notebook reads the corpus and automatically populates annotation triples : ##
## location "hasActivity" activity ##

In [1]:
#load main corpus
import numpy as np
import pandas as pd

# Read only the free-text columns of the corpus and turn missing cells into
# empty strings, so every cell of the resulting frame is a plain string.
text_columns = ['overview', 'review_1', 'review_2', 'review_3', 'review_4', 'review_5']
file = pd.read_csv("../../data/corpus original.csv", usecols=text_columns).replace(np.nan, '', regex=True)
file.head()

Unnamed: 0,overview,review_1,review_2,review_3,review_4,review_5
0,Get a personalized Blue Hole experience on a p...,My wife and I are the type of travelers that l...,My husband and I booked the private Blue Hole ...,We were in a cruise ship that was delayed for ...,We booked a trip to Blue Hole from Sandals Roy...,My group of friends and I booked Delton's serv...
1,,We really had an awesome time chartering with ...,My favorite part of the camp was snorkeling w...,My 8 year old daughter spent 5 days snorkeling...,Jason and Alex were great! They modified our t...,I have been out on many tours during cruise st...
2,Jet extreme offers one of the finest way to ex...,"About 6 months ago, I booked fly boarding for ...",,,,
3,"Little Tobago, an offshore island which is vis...",,,,,
4,,"Our guides were brilliant on this experience, ...",With our wonderful guides Taley and Rhaden we ...,This was fantastic! Jesse and Lea were the bes...,Great guides - Sancha & Jeffrey - took us thro...,I was reluctant to take this zip wire tour but...


In [2]:
# Create a list of verbs ending with "ing" from the corpus.
# Once it is created, manually remove irrelevant words to produce the final travel activity lexicon.

import nltk
from nltk import word_tokenize,sent_tokenize

from nltk.corpus import stopwords
# Collect every distinct "-ing" verb that occurs anywhere in the corpus and
# write the candidates to verbing.txt (one per line) for manual curation.
stop_words = set(stopwords.words('english'))
text_list = file.values.tolist()
verb_ing = set()
for text in text_list:
    # Joining a row of empty columns yields only spaces, never "", so the
    # text must be stripped before it can be recognised as an empty document.
    all_text_per_doc = " ".join(text).strip()
    if not all_text_per_doc:
        continue

    for sent in sent_tokenize(all_text_per_doc):
        tagged_sent = nltk.pos_tag(word_tokenize(sent))
        for word, pos in tagged_sent:
            # Lower-case first so that the stop-word test (the NLTK list is
            # lower-case) and the suffix test also apply to capitalised
            # tokens such as a sentence-initial "Being".
            candidate = word.lower()
            if (
                pos.startswith("VB")
                and candidate.endswith("ing")
                and candidate not in stop_words
            ):
                verb_ing.add(candidate)

# Sorted output keeps the file stable between runs (set order is arbitrary);
# the context manager closes the file even if the write fails.
with open("../../data/verbing.txt", "w") as f:
    f.write("\n".join(sorted(verb_ing)))

In [3]:
# Load the travel activity lexicon.
# travel_activity_lexicon was created by manually selecting travel-related "-ing" verbs and discarding the rest.

# Read the curated lexicon into activity_list, one activity per line.
activity_list = []

with open("../../data/travel_activity_lexicon.txt", "r") as f:
    for item in f:
        # Iterating a file keeps the trailing newline on every line; strip it
        # so that length-based comparisons against an entry see only the
        # activity itself, and drop blank lines entirely.
        entry = item.strip()
        if entry:
            activity_list.append(entry)


In [4]:
# function to get a set of locations for a document.

import spacy
nlp = spacy.load('en_core_web_sm') 
def get_locations(document):
    """Return the distinct location-like entity strings found in ``document``.

    The text is run through the module-level spaCy pipeline ``nlp``; the text
    of every entity labelled ``LOC``, ``NORP`` or ``GPE`` is collected.

    Parameters
    ----------
    document : str
        Raw text to analyse.

    Returns
    -------
    set of str
        Entity texts exactly as spaCy reports them; empty if none matched.
    """
    wanted_labels = ("LOC", "NORP", "GPE")
    return {
        entity.text
        for entity in nlp(document).ents
        if entity.label_ in wanted_labels
    }

In [5]:
# function to get a set of activities for a sentence.

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

def get_activity(tagged_sent, lexicon=None, ignored_words=None):
    """Return the activity words found in one POS-tagged sentence.

    A token counts as an activity when all of the following hold:

    * its POS tag starts with ``"VB"``;
    * it is longer than three characters and is not an ignored (stop) word;
    * it is a substring of some lexicon entry and covers at least 40% of that
      entry's length.

    Tokens and lexicon entries are compared lower-cased and with surrounding
    whitespace removed from the entries, so a capitalised token still matches
    and a trailing newline on an entry does not distort the length ratio.

    Parameters
    ----------
    tagged_sent : iterable of (str, str)
        ``(word, pos_tag)`` pairs, e.g. the output of ``nltk.pos_tag``.
    lexicon : iterable of str, optional
        Activity phrases to match against. Defaults to the module-level
        ``activity_list``.
    ignored_words : collection of str, optional
        Lower-case words that are never reported. Defaults to the module-level
        ``stop_words``.

    Returns
    -------
    set of str
        Lower-cased matching tokens; empty if nothing matched.
    """
    if lexicon is None:
        lexicon = activity_list
    if ignored_words is None:
        ignored_words = stop_words

    # Normalise the lexicon once per call; blank entries are dropped so the
    # ratio below never divides by zero.
    entries = [entry.strip().lower() for entry in lexicon]
    entries = [entry for entry in entries if entry]

    extracted_activity = set()
    for word, pos in tagged_sent:
        token = word.lower()
        # These tests do not depend on the lexicon entry, so run them once
        # per token instead of once per (token, entry) pair.
        if not pos.startswith("VB") or len(token) <= 3 or token in ignored_words:
            continue
        if any(
            token in entry and len(token) / len(entry) >= 0.4
            for entry in entries
        ):
            extracted_activity.add(token)

    return extracted_activity

In [6]:
# Function that takes corpus original.csv and automatically annotates entire corpus 
# Generates new file annotation_automated.csv.
# If subject is found or object is found, then entry is made.
# Run this cell to start. It can take up to 5-10 minutes.
# Once the file is generated, insert these three columns into a copy of original_corpus.csv and rename it to corpus_annotated.csv.
# Some manual corrections need to be made to ensure the correct annotations map to the correct documents,
# as there will be some mismatch.

# Annotate every corpus row with a (locations, "hasActivity", activities)
# triple and write the triples to annotation_automated.csv, one output row per
# corpus row so the result lines up with the original file.
import csv

text_list = file.values.tolist()
predicate = "hasActivity"
cannot_annotate = 0  # rows for which neither a location nor an activity was found

# Open with "w", not "a": in append mode every re-run of this cell stacked a
# further copy of all rows onto the previous output.
with open("../../data/annotation_automated.csv", "w", newline="") as f:
    # csv.writer quotes fields that contain commas, quotes or newlines (an
    # entity such as "Kingston, Jamaica" would otherwise split into extra
    # columns). lineterminator="\n" keeps the original line endings.
    writer = csv.writer(f, lineterminator="\n")

    for text in text_list:
        # Joining a row of empty columns yields only spaces, so strip before
        # testing for an empty document.
        all_text_per_doc = " ".join(text).strip()

        locations = set()
        doc_activity_set = set()
        if all_text_per_doc:
            locations = get_locations(all_text_per_doc)
            for sent in sent_tokenize(all_text_per_doc):
                tagged_sent = nltk.pos_tag(word_tokenize(sent))
                doc_activity_set.update(get_activity(tagged_sent))

        # The trailing "" reproduces the trailing comma of the previous
        # hand-written format, so the column layout of the file is unchanged.
        if locations or doc_activity_set:
            # Sort so the joined strings do not depend on set iteration order.
            location_subject = "_".join(sorted(locations))
            activity_object = "_".join(sorted(doc_activity_set))
            writer.writerow([location_subject, predicate, activity_object, ""])
        else:
            # Still emit a (blank) row: skipping it would shift every later
            # annotation onto the wrong document.
            cannot_annotate += 1
            writer.writerow(["", "", "", ""])

print(cannot_annotate)

515
