In [45]:
import spacy
import json
import pandas as pd

In [46]:
# Location of the JSON dump for the article analysed in this notebook.
data_location = '../data/json_dumps/11481.json'
#article_location = '../data/article.json'

# errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
with open(data_location, mode='r', encoding='utf8', errors='ignore') as json_file:
    data = json.load(json_file)

# Snippet records of the article; the next cells iterate over this list.
snippets_array = data['article']['snippets']

In [47]:
# Load the spaCy pipeline "en_core_web_md"; `nlp` is reused by the later cells
# for entity recognition (doc.ents) and for the POS-based Matcher.
nlp = spacy.load("en_core_web_md")

In [48]:
# Collect the short snippets of the article together with their ids.
# Only snippets whose text is shorter than MAX_SENTENCE_CHARS are kept.
MAX_SENTENCE_CHARS = 50

sentence_array = []
ids = []
for item in snippets_array:
    # A dedicated local name is used here on purpose: assigning to `data`
    # would overwrite the parsed JSON dict created by the loading cell.
    text = item.get('text')
    if not isinstance(text, str):
        # Snippet has no text (missing key or null value): nothing to keep.
        continue
    if len(text) < MAX_SENTENCE_CHARS:
        sentence_array.append(text)
        # The id is only looked up for snippets that are actually kept.
        ids.append(item['id'])

# One row per kept snippet: its id and its text.
dataframe = pd.DataFrame({'id': ids, 'Sentence': sentence_array})

In [49]:
# Show the table of short snippets (columns: id, Sentence).
dataframe

Unnamed: 0,id,Sentence
0,187637,1)\tCatch the indie scene at Route 196
1,187640,2)\tGo clubbing at Black Market
2,187643,3)\tDine and drink at The Yard
3,187646,4)\tGo shopping at a night market
4,187649,5)\tLaugh out loud at Zirkoh
5,187652,6)\tSing your heart out at Red Box
6,187655,7)\tFeast at Mercato
7,187657,8)\tWatch an indie film at Teatrino
8,187660,9)\tCheer on your favorite team at Skinny Mike’s
9,187663,10)\tTry your luck at City of Dreams


In [50]:
# Extract place-like named entities from every short snippet.
# Each distinct entity text is recorded once, attributed to the first snippet
# in which it appears.

# Entity labels to keep; spacy.explain() gives their human-readable meaning.
only_places = ['FAC', 'GPE', 'ORG']
sample = list(dataframe['Sentence'])
sample_ids = list(dataframe['id'])

entities = []
type_entity = []
sentences = []
sentence_id = []
seen_entities = set()  # constant-time "already recorded?" check

# Iterate ids and sentences together. Looking the id up again with
# sample.index(sent) would return the id of the FIRST identical sentence,
# which is wrong when two snippets share the same text.
for sent_id, sent in zip(sample_ids, sample):
    parsed_sentence = nlp(sent)
    for ent in parsed_sentence.ents:
        if ent.text in seen_entities or ent.label_ not in only_places:
            continue
        seen_entities.add(ent.text)
        entities.append(ent.text)
        sentences.append(sent)
        type_entity.append(spacy.explain(ent.label_))
        sentence_id.append(sent_id)

# 'Activity Description' holds the explanation of the entity label.
Entities = pd.DataFrame({'Id': sentence_id, 'Activity At': entities, 'Activity Description': type_entity})
print('The total number of entities detected were:{}'.format(len(Entities)))
Entities

The total number of entities detected were:6


Unnamed: 0,Id,Activity At,Activity Description
0,187637,Route 196,"Buildings, airports, highways, bridges, etc."
1,187640,Black Market,"Buildings, airports, highways, bridges, etc."
2,187649,Zirkoh,"Companies, agencies, institutions, etc."
3,187655,Mercato,"Companies, agencies, institutions, etc."
4,187657,Teatrino,"Countries, cities, states"
5,187666,Manila,"Countries, cities, states"


In [51]:
# Look up spaCy's description of the 'FAC' label (one of the labels in only_places).
spacy.explain('FAC')

'Buildings, airports, highways, bridges, etc.'

In [52]:
# Look up spaCy's description of the 'NORP' label (not among the labels in only_places).
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [53]:
# Preview the entity table as a JSON string with one object per row;
# the string is only displayed here, not stored.
Entities.to_json(orient='records')

'[{"Id":187637,"Activity At":"Route 196","Activity Description":"Buildings, airports, highways, bridges, etc."},{"Id":187640,"Activity At":"Black Market","Activity Description":"Buildings, airports, highways, bridges, etc."},{"Id":187649,"Activity At":"Zirkoh","Activity Description":"Companies, agencies, institutions, etc."},{"Id":187655,"Activity At":"Mercato","Activity Description":"Companies, agencies, institutions, etc."},{"Id":187657,"Activity At":"Teatrino","Activity Description":"Countries, cities, states"},{"Id":187666,"Activity At":"Manila","Activity Description":"Countries, cities, states"}]'

In [54]:
# Extract verb phrases ("activities") from every short snippet with a
# token-level Matcher and remember which snippet each phrase came from.
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
# One or more verbs, optionally followed by a conjunction and one more verb.
# NOTE(review): the Universal POS tag for coordinating conjunctions is
# 'CCONJ'; confirm that the loaded model actually emits 'CONJ'.
pattern = [{'POS': 'VERB', 'OP': '+'}, {'POS': 'CONJ', 'OP': '?'}, {'POS': 'VERB', 'OP': '?'}]
# spaCy v2 call signature (key, on_match, *patterns). spaCy v3 expects
# matcher.add("Matching", [pattern]) instead.
matcher.add("Matching", None, pattern)

activity = []
activity_ids = []
# Pair each sentence with its id directly. sample.index(item) would map
# duplicated sentences to the id of their first occurrence.
for item_id, item in zip(sample_ids, sample):
    doc = nlp(item)
    # Every match is kept, including overlapping ones for the same tokens.
    for _match_id, start, end in matcher(doc):
        span = doc[start:end]  # The matched span
        print(span.text)
        activity.append(span.text)
        activity_ids.append(item_id)

Catch
Go clubbing
clubbing
Go shopping
shopping
Laugh
Sing
Watch
Try


In [55]:
# One row per matched phrase, paired with the id of the snippet it came from.
activity_rows = list(zip(activity_ids, activity))
activity_df = pd.DataFrame(activity_rows, columns=['Id', 'Activity'])

In [56]:
# Show the matched activities (columns: Id, Activity).
activity_df

Unnamed: 0,Id,Activity
0,187637,Catch
1,187640,Go clubbing
2,187640,clubbing
3,187646,Go shopping
4,187646,shopping
5,187649,Laugh
6,187652,Sing
7,187657,Watch
8,187663,Try


In [57]:
# Join places and activities on the snippet id. This is an inner join, so an
# Id present in only one of the two tables is dropped, and a snippet with
# several matched activities yields several rows.
# Despite its name, final_json is a DataFrame; it is serialised in a later cell.
final_json = pd.merge(Entities, activity_df, how='inner', on='Id')

In [58]:
# Preview the merged table as a JSON string with one object per row;
# the string is only displayed here, not stored.
final_json.to_json(orient='records')

'[{"Id":187637,"Activity At":"Route 196","Activity Description":"Buildings, airports, highways, bridges, etc.","Activity":"Catch"},{"Id":187640,"Activity At":"Black Market","Activity Description":"Buildings, airports, highways, bridges, etc.","Activity":"Go clubbing"},{"Id":187640,"Activity At":"Black Market","Activity Description":"Buildings, airports, highways, bridges, etc.","Activity":"clubbing"},{"Id":187649,"Activity At":"Zirkoh","Activity Description":"Companies, agencies, institutions, etc.","Activity":"Laugh"},{"Id":187657,"Activity At":"Teatrino","Activity Description":"Countries, cities, states","Activity":"Watch"}]'

In [59]:
# Show the merged table (columns: Id, Activity At, Activity Description, Activity).
final_json

Unnamed: 0,Id,Activity At,Activity Description,Activity
0,187637,Route 196,"Buildings, airports, highways, bridges, etc.",Catch
1,187640,Black Market,"Buildings, airports, highways, bridges, etc.",Go clubbing
2,187640,Black Market,"Buildings, airports, highways, bridges, etc.",clubbing
3,187649,Zirkoh,"Companies, agencies, institutions, etc.",Laugh
4,187657,Teatrino,"Countries, cities, states",Watch
