In [2]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [3]:
batch_df = pd.read_csv("../../data/Combined_Batch.csv", usecols=['WorkerId','Answer.“Activity”','RejectionTime'])
batch_df["RejectionTime"].fillna(0, inplace = True)

batch_df = batch_df.query('RejectionTime == 0')
batch_df = batch_df.reset_index()
batch_df["ID"] = batch_df.index + 1
batch_df

Unnamed: 0,index,WorkerId,RejectionTime,Answer.“Activity”,ID
0,0,A24JKHC4HTY6CD,0,swim,1
1,1,A3OVE49GVEJT8B,0,swimming,2
2,2,A4D99Y82KOLC8,0,rope swing,3
3,3,A1BYS45FAD0HWD,0,"Jet Skiing, riding",4
4,4,A24JKHC4HTY6CD,0,"/RIDING, swimming",5
...,...,...,...,...,...
796,901,AKSLU0C30G3JT,0,/kiteboarding,797
797,902,A5TRIFYKKB8KG,0,kiteboarding,798
798,904,A2F36YYI5U6AF4,0,guided tour,799
799,905,A2MTOSH583K270,0,/RIDE,800


In [4]:
input_df = pd.read_csv("../../data/input_file_MTurk.csv", usecols=['0','Object','Overview'])
input_df

Unnamed: 0,0,Object,Overview
0,1,,Get a personalized Blue Hole experience on a p...
1,3,,Jet extreme offers one of the finest way to ex...
2,6,,For anyone looking to party in Punta Cana—for ...
3,8,,Experience the natural beauty of Jamaica on a ...
4,14,,This combination tour includes both Dunn's Riv...
...,...,...,...
262,705,,Spend the day snorkeling on this speedboat tou...
263,716,,This tour starts with a 15 minutes tour of the...
264,735,,Cruise along the coast of Nassau aboard your o...
265,741,,Learn to kiteboard with a friend while on vaca...


In [5]:
def maximum(len1, len2, len3): 
    '''getting the row which has the maximum length
    '''
  
    if (len1[0] >= len2[0]) and (len1[0] >= len3[0]): 
        largest = len1
  
    elif (len2[0] >= len1[0]) and (len2[0] >= len3[0]): 
        largest = len2
    else: 
        largest = len3
          
    return largest 

def get_top_annotation(annotation_list):
    ''' get the best annotation out of the 3 worker annotation by
        checking the common words and returning the annotation which
        has maximum words and with maximum common words.
    '''
    set_1 = set(annotation_list[0][0])
    set_2 = set(annotation_list[1][0])
    set_3 = set(annotation_list[2][0])

    if len(set_1) > len(set_2):
        length_1_2 = (len(set_1.intersection(set_2)), 1, annotation_list[0][1])
    else:
        length_1_2 = (len(set_2.intersection(set_1)), 2,annotation_list[1][1])

    if len(set_2) > len(set_3):
        length_2_3 = (len(set_2.intersection(set_3)), 2, annotation_list[1][1])
    else:
        length_2_3 = (len(set_3.intersection(set_2)), 3, annotation_list[2][1])

    if len(set_1) > len(set_3):
        length_1_3 = (len(set_1.intersection(set_3)), 1, annotation_list[0][1])
    else:
        length_1_3 = (len(set_3.intersection(set_1)), 3, annotation_list[2][1])
        
    top_annotation = maximum(length_1_2,length_2_3,length_1_3)
    
    return top_annotation

def find_row_id(row_id):
    ''' find the row number assuming each row number
        cooresponds to 3 rows
    '''
    if row_id <=3:
        return 1
    else:
        mod = row_id % 3
        
        if mod == 0:
            return int(row_id/3)
        else:
            num = row_id - mod
            
            return int(num/3 + 1)

In [6]:
# preparing annotation row, after lowercase and lemmatization

lemmatizer= WordNetLemmatizer()

annotations_dict = batch_df.to_dict()
worker_id_list = list(annotations_dict['WorkerId'].values())
activity_list = list(annotations_dict['Answer.“Activity”'].values())
ID_list = list(annotations_dict['ID'].values())

activity_list = [ i.strip('/').lower() for i in activity_list ]

worker_annotation = list(zip(ID_list, worker_id_list,activity_list))
worker_annotation
annotation_row_list = []
count = 0
id =0
item_list = []
lemme_list = []

for index,item in enumerate(worker_annotation):    
    count +=1
    if index % 3 == 0:
        count = 0
    if index % 3 == 0:
        id +=1
    
    lemme_list = []
    item_list = word_tokenize(item[2])
    for each in range(len(item_list)):
        if item_list[each] != ",":
            word = lemmatizer.lemmatize(item_list[each].strip(), 'v')
            lemme_list.append(word)
            
    annotation_row_list.append((item[0],count,id,lemme_list))

In [7]:
# creating list of the top annotations of each document
top_row_ids = []

docid_list = []
annotation_list = []
for row in annotation_row_list:
    row_id = row[0]
    doc_id = row[2]
    annontaion = row[3]
    
    if doc_id not in docid_list:
        annotation_list = []
        docid_list.append(doc_id)
        annotation_list.append((annontaion,row_id))
    else:
        annotation_list.append((annontaion, row_id))
        
        if len(annotation_list) == 3:
            top_annotation = get_top_annotation(annotation_list)
            top_row_ids.append(top_annotation)

In [8]:
# prepare the final dataframe, which has the top annotation for each row in our corpus

final_df = pd.DataFrame(columns = ['ID', 'Object']) 

for each_top_annotaion in top_row_ids:
    batch_row_id = each_top_annotaion[2]
    input_row_id = find_row_id(batch_row_id)
    object_val = batch_df['Answer.“Activity”'][batch_row_id-1]
    row_val = input_df["0"][input_row_id-1]
    
    row_df = {'ID': row_val, 'Object': object_val} 
    final_df = final_df.append(row_df, ignore_index=True) 

In [9]:
final_df

Unnamed: 0,ID,Object
0,1,swimming
1,3,"/ swimming, riding, snorkeling"
2,6,/SNORKELING
3,8,"/Horseriding, Shopping"
4,14,"/hiking, cliff jumping, rope swinging"
...,...,...
262,705,/snorkeling
263,716,"/drive, hike, bath"
264,735,"/cruise, deep sea fishing, in-shore/light tack..."
265,741,/kiteboarding


In [10]:
annotations_dict = dict(zip(final_df.ID,final_df.Object))
annotations_list = []
f = open("../../data/final_annotations_one_token.csv", "w")
f.write("Document_ID")
f.write(",")
f.write("Annotation_Token")
f.write("\n")

for id,annotation in annotations_dict.items():
    annotation = annotation.strip("/").strip(" ").rstrip(",")
    #print(id,annotation)
    annotation_list = annotation.split(",")
    for index,i in enumerate(annotation_list):
        f.write(str(id))
        f.write(",")
        f.write(i.lower().strip(" "))
        f.write("\n")

f.close()

In [11]:
annotations = pd.read_csv("../../data/final_annotations_one_token.csv")

In [12]:
annotations["Annotation_Token"].unique()

array(['swimming', 'riding', 'snorkeling', 'horseriding', 'shopping',
       'hiking', 'cliff jumping', 'rope swinging', 'walking tour',
       'hike up', 'diving', 'cruise', 'travel', 'tours', 'snorkel',
       'getaway', 'trip', 'boat ride', 'drive', 'tasting', 'swim',
       'explore', 'tour', 'horse back ridding', 'hopping',
       'bird training', 'snorkel outing', 'sailing', 'bird-watching',
       'relaxing', 'fishing', 'flyboarding', 'kayak', 'paddling',
       'horseback riding', 'ride', 'paddle', 'dive', 'driving',
       'horseback ride', 'historic sites', 'adventure', 'farming',
       'riders', 'scuba diving', 'trekking', 'hiring / relax', 'learning',
       'boat tour', 'hike', 'cliff jump', 'walk', 'trot', 'canter',
       'ride / thrilling', 'watching', 'excursion', 'dip', 'kiteboarding',
       'scuba', 'water dive', 'chocolate tour', 'pair', 'snorkel.swim',
       'cooking', 'sailing tour', 'visit', 'clib', 'play', 'sightseeing',
       'excursions', 'jumping', 'ridin

In [13]:
lst = ['swimming', 'riding', 'snorkeling', 'horseriding', 'shopping','hiking', 'cliff jumping', 'rope swinging', 'walking tour','hike up', 'diving', 'cruise', 'travel', 'tours', 'snorkel','getaway', 'trip', 'boat ride', 'drive', 'tasting', 'swim','explore', 'tour', 'horse back ridding', 'hopping','bird training', 'snorkel outing', 'sailing', 'bird-watching','relaxing', 'fishing', 'flyboarding', 'kayak', 'paddling','horseback riding', 'ride', 'paddle', 'dive', 'driving','horseback ride', 'historic sites', 'adventure', 'farming','riders', 'scuba diving', 'trekking', 'hiring / relax', 'learning','boat tour', 'hike', 'cliff jump', 'walk', 'trot', 'canter','ride / thrilling', 'watching', 'excursion', 'dip', 'kiteboarding','scuba', 'water dive', 'chocolate tour', 'pair', 'snorkel.swim','cooking', 'sailing tour', 'visit', 'clib', 'play', 'sightseeing','excursions', 'jumping', 'riding tour', 'sail', 'snorkeling site','full-day tour', 'guided sightseeing tour', 'sip', 'climb','float', 'jetskiing', 'sightseeing tour', 'guide', 'zipline','travelling', 'traveling', 'divers', 'exploring', 'scooter','boating', 'sailing and snorkeling', 'cliff diving', 'navigate','walking', 'pumping', 'trail', 'atv tour', 'learn','full day tour', 'bike riding', 'guided tour','shopping excursion', 'water sports', 'lounging', 'speedboat','swims', 'sunbathe', 'island hop', 'explore marine life', 'relax','driving tour', 'splash', 'guided day trip', 'ziplining', 'feast','eat', 'waterfall rappelling', 'observe', 'coach tour', 'lunch','brunch', 'flying', 'snorkell', 'relaxation', 'art','sunset cruise listen', 'watch', 'pedal', 'cycling', 'shop','view', 'follow', 'sample food and drink','beach hopping off the coast', 'see colorful corals', 'discover','kayaking', 'listen', 'parasailing', 'petting zoo','kayaking and snorkeling', 'reflect', 'food tour', 'hikes','viewing', 'tour.', 'glide', 'admire jungle scenery','private tour', 'kitesurfing', 'get up', 'buy','private guided tour', 'small-group tour', 'charter', 'vist','barbecue', 'frolicking', 'stand-up paddleboarding', 'climbing','multi-stop tour', 'cultural tour', 'playing', 'surfing','practicing', 'atv off-road adventure', 'eating', 'lounge','full-day land and sea tour', 'boarding a catamaran', 'sing','dance', 'visit two different snorkel locations','observing the abundant marine life in each','cruise explore visit watersports relax', 'powerboat', 'views','dining', 'taste', 'jet skiing', 'sunset sailing cruise', 'jump','slide', 'bath', 'deep sea fishing','in-shore/light tackle fishing', 'sample']

In [14]:
select_str = "<select name='annot_drop_down' id='annot_drop_down'>"

In [15]:
for item in lst:
    select_str += "<option value="+item+">"+item+"</option>"

In [16]:
select_str += "</select>"
select_str

"<select name='annot_drop_down' id='annot_drop_down'><option value=swimming>swimming</option><option value=riding>riding</option><option value=snorkeling>snorkeling</option><option value=horseriding>horseriding</option><option value=shopping>shopping</option><option value=hiking>hiking</option><option value=cliff jumping>cliff jumping</option><option value=rope swinging>rope swinging</option><option value=walking tour>walking tour</option><option value=hike up>hike up</option><option value=diving>diving</option><option value=cruise>cruise</option><option value=travel>travel</option><option value=tours>tours</option><option value=snorkel>snorkel</option><option value=getaway>getaway</option><option value=trip>trip</option><option value=boat ride>boat ride</option><option value=drive>drive</option><option value=tasting>tasting</option><option value=swim>swim</option><option value=explore>explore</option><option value=tour>tour</option><option value=horse back ridding>horse back ridding</

In [17]:
result = annotations.query("Annotation_Token == 'tours'")
doc_id_list = list(result['Document_ID'])
doc_id_list

[35, 182, 217]

In [18]:
data = pd.read_csv("../../data/final_corpus.csv")
data[data['Document ID'].isin(doc_id_list)]

Unnamed: 0,Document ID,Subject,Predicate,Object,Subject.1,Predicate.1,Object.1,Overview,Review1,Review2,Review3,Review4,Review5
34,35,Caicos_Turks,hasActivity,,,,,Cruise around the islands of Turks and Caicos ...,,,,,
181,182,Barbados,hasActivity,,,,,Skip the hassle of crowded group tours and enj...,Morning fishing in Jan 2020. Missed a hooked w...,Our family took a four-hour fishing charter on...,Went fishing for a morning 4 hour charter on I...,Have been out with IOU Fishing Charters three ...,I went on a morning charter with my wife and 2...
216,217,Forte Charlotte_Caribbean,hasActivity,trekking,Forte Charlotte_Caribbean,hasActivity,trekking,Feeling safe and comfortable is one of my prio...,We spent the day with Samuel and had the best ...,Great Day with a really funny guide Rum Punch ...,I would highly recommend going on a tour with ...,,
