In [47]:
!pip install spacy



In [239]:
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_sm')

In [240]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [160]:
# colors for additional part of speech tags we want to visualize
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [161]:
pd.set_option('display.max_rows', 1000) # edit how jupyter will render our pandas dataframes
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a dataframe

In [162]:
df = pd.read_csv("rashi_data.csv")

mini_df = df[:100]
mini_df.index = pd.RangeIndex(len(mini_df.index))

# comment this out to run on full dataset
df = mini_df

In [163]:
df

Unnamed: 0,Id,Description
0,0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and...."
1,1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,...."
2,2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...
3,3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ...."
4,4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The...."
5,5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....
6,6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...
7,7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...
8,8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites."
9,9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......"


In [237]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Overload the spaCy built-in rendering to allow custom part-of-speech (POS) tags.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of of a column of interest in the dataframe
    options -- various options to feed into the spaCy renderer, including colors
    page -- rendering markup as full HTML page (default False)
    minify -- for compact HTML (default False)
    idx -- index for specific query or doc in dataframe (default 0)
    
    """
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    text = df['text'][idx]
    custom_render(nlp(text), df=df, column=column, options=options, idx=idx)

In [238]:
df.fillna('')

Unnamed: 0,Id,text,named_ents
0,0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and....","[(Murray McLauchlan, 0, 17, PERSON), (one, 21, 24, CARDINAL), (Canada, 28, 34, GPE), (48-year, 105, 112, CARDINAL), (19, 137, 139, CARDINAL)]"
1,1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,....","[(the fourth week of FebFest, 4, 30, DATE), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Canadian, 167, 175, NORP)]"
2,2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...,"[(Bundle, 0, 6, GPE), (Boucher Park, 43, 55, LOC), (Clarence Street, 59, 74, FAC), (Springer Market Square, 103, 125, FAC), (Boucher Park, 184, 196, FAC), (the whole month of February, 214, 241, DATE), (Kingston, 288, 296, GPE)]"
3,3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ....","[(Kingston, 102, 110, GPE), (8 am to 10 pm, 217, 230, TIME), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]"
4,4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The....","[(Greg Ball, 71, 80, PERSON), (Kingston, 89, 97, GPE), (Ching Music, 143, 154, ORG)]"
5,5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....,"[(Kingston Glass Studio & Gallery, 108, 139, ORG)]"
6,6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...,"[(Chris Jackson, 110, 123, PERSON)]"
7,7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...,"[(NHL, 27, 30, ORG), (The Centre 70 Arena, 50, 69, FAC), (the Skate Patrol, 132, 148, WORK_OF_ART)]"
8,8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites.","[(its third year, 41, 55, DATE)]"
9,9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......","[(Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (7pm, 74, 77, TIME)]"


In [166]:
# df = pd.DataFrame(df['Description'])
# df.columns = ['text']
df = df.rename({'Description': 'text'}, axis=1)
df.head(10)

Unnamed: 0,Id,text
0,0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and...."
1,1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,...."
2,2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...
3,3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ...."
4,4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The...."
5,5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....
6,6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...
7,7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...
8,8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites."
9,9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......"


In [170]:
def extract_named_ents(text):
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]

def add_named_ents(df):
    df['named_ents'] = df['text'].apply(extract_named_ents)    

In [173]:
add_named_ents(df)
df.head(90)

Unnamed: 0,Id,text,named_ents
0,0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and....","[(Murray McLauchlan, 0, 17, PERSON), (one, 21, 24, CARDINAL), (Canada, 28, 34, GPE), (48-year, 105, 112, CARDINAL), (19, 137, 139, CARDINAL)]"
1,1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,....","[(the fourth week of FebFest, 4, 30, DATE), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Canadian, 167, 175, NORP)]"
2,2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...,"[(Bundle, 0, 6, GPE), (Boucher Park, 43, 55, LOC), (Clarence Street, 59, 74, FAC), (Springer Market Square, 103, 125, FAC), (Boucher Park, 184, 196, FAC), (the whole month of February, 214, 241, DATE), (Kingston, 288, 296, GPE)]"
3,3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ....","[(Kingston, 102, 110, GPE), (8 am to 10 pm, 217, 230, TIME), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]"
4,4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The....","[(Greg Ball, 71, 80, PERSON), (Kingston, 89, 97, GPE), (Ching Music, 143, 154, ORG)]"
5,5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....,"[(Kingston Glass Studio & Gallery, 108, 139, ORG)]"
6,6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...,"[(Chris Jackson, 110, 123, PERSON)]"
7,7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...,"[(NHL, 27, 30, ORG), (The Centre 70 Arena, 50, 69, FAC), (the Skate Patrol, 132, 148, WORK_OF_ART)]"
8,8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites.","[(its third year, 41, 55, DATE)]"
9,9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......","[(Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (7pm, 74, 77, TIME)]"


In [172]:
column = 'named_ents'
render_entities(80, df, options=options, column=column) # take a look at one of the abstracts

In [107]:
#function for Extracting Nouns
def extract_nouns(text):
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(text) if tok.pos_ in keep_pos]

def add_nouns(df):
    df['nouns'] = df['text'].apply(extract_nouns)

In [108]:
add_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns
0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and....","[(Murray McLauchlan, 0, 17, PERSON), (one, 21, 24, CARDINAL), (Canada, 28, 34, GPE), (48-year, 105, 112, CARDINAL), (19, 137, 139, CARDINAL)]","[(Murray, 0, 6, PROPN), (McLauchlan, 7, 17, PROPN), (Canada, 28, 34, PROPN), (singer, 58, 64, NOUN), (songwriters, 65, 76, NOUN), (cause, 91, 96, NOUN), (career, 113, 119, NOUN), (albums, 140, 146, NOUN)]"
1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,....","[(the fourth week of FebFest, 4, 30, DATE), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Canadian, 167, 175, NORP)]","[(week, 15, 19, NOUN), (FebFest, 23, 30, PROPN), (lots, 45, 49, NOUN), (activities, 61, 71, NOUN), (ice, 80, 83, NOUN), (stock, 84, 89, NOUN), (demo, 90, 94, NOUN), (week, 119, 123, NOUN), (Toast, 126, 131, PROPN), (Tapas, 135, 140, PROPN), (theme, 141, 146, NOUN), (time, 162, 166, NOUN), (favourite, 176, 185, NOUN), (bacon, 187, 192, NOUN), (bonus, 206, 211, NOUN)]"
2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...,"[(Bundle, 0, 6, GPE), (Boucher Park, 43, 55, LOC), (Clarence Street, 59, 74, FAC), (Springer Market Square, 103, 125, FAC), (Boucher Park, 184, 196, FAC), (the whole month of February, 214, 241, DATE), (Kingston, 288, 296, GPE)]","[(kids, 21, 25, NOUN), (Boucher, 43, 50, PROPN), (Park, 51, 55, PROPN), (Clarence, 59, 67, PROPN), (Street, 68, 74, PROPN), (corner, 91, 97, NOUN), (Springer, 103, 111, PROPN), (Market, 112, 118, PROPN), (Square, 119, 125, PROPN), (kids, 138, 142, NOUN), (ice, 153, 156, NOUN), (slides, 157, 163, NOUN), (ice, 168, 171, NOUN), (sculptures, 172, 182, NOUN), (Boucher, 184, 191, PROPN), (Park, 192, 196, PROPN), (month, 224, 229, NOUN), (February, 233, 241, PROPN), (weather, 243, 250, NOUN), (heart, 270, 275, NOUN), (downtown, 279, 287, NOUN), (Kingston, 288, 296, PROPN)]"
3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ....","[(Kingston, 102, 110, GPE), (8 am to 10 pm, 217, 230, TIME), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]","[(Spring, 0, 6, PROPN), (Market, 7, 13, PROPN), (Square, 14, 20, PROPN), (music, 66, 71, NOUN), (downtown, 90, 98, NOUN), (Kingston, 102, 110, PROPN), (skate, 124, 129, NOUN), (public, 145, 151, NOUN), (family, 174, 180, NOUN), (outing, 181, 187, NOUN), (Hours, 189, 194, NOUN), (operation, 198, 207, NOUN), (am, 219, 221, NOUN), (pm, 228, 230, NOUN), (Monday, 231, 237, PROPN), (Sunday, 241, 247, PROPN)]"
4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The....","[(Greg Ball, 71, 80, PERSON), (Kingston, 89, 97, GPE), (Ching Music, 143, 154, ORG)]","[(pint, 30, 34, NOUN), (city, 42, 46, NOUN), (chances, 48, 55, NOUN), (Greg, 71, 75, PROPN), (Ball, 76, 80, PROPN), (Kingston, 89, 97, PROPN), (bartender, 98, 107, NOUN), (who, 108, 111, NOUN), (Ching, 143, 148, PROPN), (Music, 149, 154, PROPN)]"
5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....,"[(Kingston Glass Studio & Gallery, 108, 139, ORG)]","[(Experiential, 0, 12, PROPN), (Tourism, 13, 20, PROPN), (popularity, 35, 45, NOUN), (part, 56, 60, NOUN), (cool, 66, 70, NOUN), (experience, 96, 106, NOUN), (Kingston, 108, 116, PROPN), (Glass, 117, 122, PROPN), (Studio, 123, 129, PROPN), (Gallery, 132, 139, PROPN)]"
6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...,"[(Chris Jackson, 110, 123, PERSON)]","[(seat, 26, 30, NOUN), (favourite, 49, 58, NOUN), (lounge, 73, 79, NOUN), (who, 96, 99, NOUN), (Chris, 110, 115, PROPN), (Jackson, 116, 123, PROPN), (stage, 127, 132, NOUN)]"
7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...,"[(NHL, 27, 30, ORG), (The Centre 70 Arena, 50, 69, FAC), (the Skate Patrol, 132, 148, WORK_OF_ART)]","[(Skate, 0, 5, PROPN), (indoors, 6, 13, NOUN), (NHL, 27, 30, PROPN), (ice, 39, 42, NOUN), (pad, 43, 46, NOUN), (Centre, 54, 60, PROPN), (Arena, 64, 69, PROPN), (skate, 86, 91, NOUN), (ages, 107, 111, NOUN), (Skate, 136, 141, PROPN), (Patrol, 142, 148, PROPN)]"
8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites.","[(its third year, 41, 55, DATE)]","[(Ice, 4, 7, PROPN), (Cold, 8, 12, PROPN), (Comedy, 13, 19, PROPN), (Festival, 20, 28, PROPN), (year, 51, 55, NOUN), (brand, 66, 71, NOUN), (venues, 76, 82, NOUN), (faces, 94, 99, NOUN), (favourites, 121, 131, NOUN)]"
9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......","[(Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (7pm, 74, 77, TIME)]","[(legend, 16, 22, NOUN), (veteran, 35, 42, NOUN), (Scotty, 44, 50, PROPN), (Open, 52, 56, PROPN), (Mic, 57, 60, PROPN), (pm, 75, 77, NOUN), (vocals-, 112, 119, NOUN), (run, 120, 123, NOUN)]"


In [109]:
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [110]:
def extract_named_nouns(row_series):
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]: 
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)

In [111]:
add_named_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns
0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and....","[(Murray McLauchlan, 0, 17, PERSON), (one, 21, 24, CARDINAL), (Canada, 28, 34, GPE), (48-year, 105, 112, CARDINAL), (19, 137, 139, CARDINAL)]","[(Murray, 0, 6, PROPN), (McLauchlan, 7, 17, PROPN), (Canada, 28, 34, PROPN), (singer, 58, 64, NOUN), (songwriters, 65, 76, NOUN), (cause, 91, 96, NOUN), (career, 113, 119, NOUN), (albums, 140, 146, NOUN)]","[(Murray McLauchlan, 0, 17, PERSON), (McLauchlan, 7, 17, PROPN), (Canada, 28, 34, GPE), (singer, 58, 64, NOUN), (songwriters, 65, 76, NOUN), (cause, 91, 96, NOUN), (career, 113, 119, NOUN), (albums, 140, 146, NOUN)]"
1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,....","[(the fourth week of FebFest, 4, 30, DATE), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Canadian, 167, 175, NORP)]","[(week, 15, 19, NOUN), (FebFest, 23, 30, PROPN), (lots, 45, 49, NOUN), (activities, 61, 71, NOUN), (ice, 80, 83, NOUN), (stock, 84, 89, NOUN), (demo, 90, 94, NOUN), (week, 119, 123, NOUN), (Toast, 126, 131, PROPN), (Tapas, 135, 140, PROPN), (theme, 141, 146, NOUN), (time, 162, 166, NOUN), (favourite, 176, 185, NOUN), (bacon, 187, 192, NOUN), (bonus, 206, 211, NOUN)]","[(week, 15, 19, NOUN), (FebFest, 23, 30, PROPN), (lots, 45, 49, NOUN), (activities, 61, 71, NOUN), (ice, 80, 83, NOUN), (stock, 84, 89, NOUN), (demo, 90, 94, NOUN), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Tapas, 135, 140, PROPN), (theme, 141, 146, NOUN), (time, 162, 166, NOUN), (favourite, 176, 185, NOUN), (bacon, 187, 192, NOUN), (bonus, 206, 211, NOUN)]"
2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...,"[(Bundle, 0, 6, GPE), (Boucher Park, 43, 55, LOC), (Clarence Street, 59, 74, FAC), (Springer Market Square, 103, 125, FAC), (Boucher Park, 184, 196, FAC), (the whole month of February, 214, 241, DATE), (Kingston, 288, 296, GPE)]","[(kids, 21, 25, NOUN), (Boucher, 43, 50, PROPN), (Park, 51, 55, PROPN), (Clarence, 59, 67, PROPN), (Street, 68, 74, PROPN), (corner, 91, 97, NOUN), (Springer, 103, 111, PROPN), (Market, 112, 118, PROPN), (Square, 119, 125, PROPN), (kids, 138, 142, NOUN), (ice, 153, 156, NOUN), (slides, 157, 163, NOUN), (ice, 168, 171, NOUN), (sculptures, 172, 182, NOUN), (Boucher, 184, 191, PROPN), (Park, 192, 196, PROPN), (month, 224, 229, NOUN), (February, 233, 241, PROPN), (weather, 243, 250, NOUN), (heart, 270, 275, NOUN), (downtown, 279, 287, NOUN), (Kingston, 288, 296, PROPN)]","[(kids, 21, 25, NOUN), (Boucher Park, 43, 55, LOC), (Park, 51, 55, PROPN), (Clarence Street, 59, 74, FAC), (Street, 68, 74, PROPN), (corner, 91, 97, NOUN), (Springer Market Square, 103, 125, FAC), (Market, 112, 118, PROPN), (Square, 119, 125, PROPN), (kids, 138, 142, NOUN), (ice, 153, 156, NOUN), (slides, 157, 163, NOUN), (ice, 168, 171, NOUN), (sculptures, 172, 182, NOUN), (Boucher Park, 184, 196, FAC), (Park, 192, 196, PROPN), (month, 224, 229, NOUN), (February, 233, 241, PROPN), (weather, 243, 250, NOUN), (heart, 270, 275, NOUN), (downtown, 279, 287, NOUN), (Kingston, 288, 296, GPE)]"
3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ....","[(Kingston, 102, 110, GPE), (8 am to 10 pm, 217, 230, TIME), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]","[(Spring, 0, 6, PROPN), (Market, 7, 13, PROPN), (Square, 14, 20, PROPN), (music, 66, 71, NOUN), (downtown, 90, 98, NOUN), (Kingston, 102, 110, PROPN), (skate, 124, 129, NOUN), (public, 145, 151, NOUN), (family, 174, 180, NOUN), (outing, 181, 187, NOUN), (Hours, 189, 194, NOUN), (operation, 198, 207, NOUN), (am, 219, 221, NOUN), (pm, 228, 230, NOUN), (Monday, 231, 237, PROPN), (Sunday, 241, 247, PROPN)]","[(Spring, 0, 6, PROPN), (Market, 7, 13, PROPN), (Square, 14, 20, PROPN), (music, 66, 71, NOUN), (downtown, 90, 98, NOUN), (Kingston, 102, 110, GPE), (skate, 124, 129, NOUN), (public, 145, 151, NOUN), (family, 174, 180, NOUN), (outing, 181, 187, NOUN), (Hours, 189, 194, NOUN), (operation, 198, 207, NOUN), (am, 219, 221, NOUN), (pm, 228, 230, NOUN), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]"
4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The....","[(Greg Ball, 71, 80, PERSON), (Kingston, 89, 97, GPE), (Ching Music, 143, 154, ORG)]","[(pint, 30, 34, NOUN), (city, 42, 46, NOUN), (chances, 48, 55, NOUN), (Greg, 71, 75, PROPN), (Ball, 76, 80, PROPN), (Kingston, 89, 97, PROPN), (bartender, 98, 107, NOUN), (who, 108, 111, NOUN), (Ching, 143, 148, PROPN), (Music, 149, 154, PROPN)]","[(pint, 30, 34, NOUN), (city, 42, 46, NOUN), (chances, 48, 55, NOUN), (Greg Ball, 71, 80, PERSON), (Ball, 76, 80, PROPN), (Kingston, 89, 97, GPE), (bartender, 98, 107, NOUN), (who, 108, 111, NOUN), (Ching Music, 143, 154, ORG), (Music, 149, 154, PROPN)]"
5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....,"[(Kingston Glass Studio & Gallery, 108, 139, ORG)]","[(Experiential, 0, 12, PROPN), (Tourism, 13, 20, PROPN), (popularity, 35, 45, NOUN), (part, 56, 60, NOUN), (cool, 66, 70, NOUN), (experience, 96, 106, NOUN), (Kingston, 108, 116, PROPN), (Glass, 117, 122, PROPN), (Studio, 123, 129, PROPN), (Gallery, 132, 139, PROPN)]","[(Experiential, 0, 12, PROPN), (Tourism, 13, 20, PROPN), (popularity, 35, 45, NOUN), (part, 56, 60, NOUN), (cool, 66, 70, NOUN), (experience, 96, 106, NOUN), (Kingston Glass Studio & Gallery, 108, 139, ORG), (Glass, 117, 122, PROPN), (Studio, 123, 129, PROPN), (Gallery, 132, 139, PROPN)]"
6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...,"[(Chris Jackson, 110, 123, PERSON)]","[(seat, 26, 30, NOUN), (favourite, 49, 58, NOUN), (lounge, 73, 79, NOUN), (who, 96, 99, NOUN), (Chris, 110, 115, PROPN), (Jackson, 116, 123, PROPN), (stage, 127, 132, NOUN)]","[(seat, 26, 30, NOUN), (favourite, 49, 58, NOUN), (lounge, 73, 79, NOUN), (who, 96, 99, NOUN), (Chris Jackson, 110, 123, PERSON), (Jackson, 116, 123, PROPN), (stage, 127, 132, NOUN)]"
7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...,"[(NHL, 27, 30, ORG), (The Centre 70 Arena, 50, 69, FAC), (the Skate Patrol, 132, 148, WORK_OF_ART)]","[(Skate, 0, 5, PROPN), (indoors, 6, 13, NOUN), (NHL, 27, 30, PROPN), (ice, 39, 42, NOUN), (pad, 43, 46, NOUN), (Centre, 54, 60, PROPN), (Arena, 64, 69, PROPN), (skate, 86, 91, NOUN), (ages, 107, 111, NOUN), (Skate, 136, 141, PROPN), (Patrol, 142, 148, PROPN)]","[(Skate, 0, 5, PROPN), (indoors, 6, 13, NOUN), (NHL, 27, 30, ORG), (ice, 39, 42, NOUN), (pad, 43, 46, NOUN), (Centre, 54, 60, PROPN), (Arena, 64, 69, PROPN), (skate, 86, 91, NOUN), (ages, 107, 111, NOUN), (Skate, 136, 141, PROPN), (Patrol, 142, 148, PROPN)]"
8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites.","[(its third year, 41, 55, DATE)]","[(Ice, 4, 7, PROPN), (Cold, 8, 12, PROPN), (Comedy, 13, 19, PROPN), (Festival, 20, 28, PROPN), (year, 51, 55, NOUN), (brand, 66, 71, NOUN), (venues, 76, 82, NOUN), (faces, 94, 99, NOUN), (favourites, 121, 131, NOUN)]","[(Ice, 4, 7, PROPN), (Cold, 8, 12, PROPN), (Comedy, 13, 19, PROPN), (Festival, 20, 28, PROPN), (year, 51, 55, NOUN), (brand, 66, 71, NOUN), (venues, 76, 82, NOUN), (faces, 94, 99, NOUN), (favourites, 121, 131, NOUN)]"
9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......","[(Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (7pm, 74, 77, TIME)]","[(legend, 16, 22, NOUN), (veteran, 35, 42, NOUN), (Scotty, 44, 50, PROPN), (Open, 52, 56, PROPN), (Mic, 57, 60, PROPN), (pm, 75, 77, NOUN), (vocals-, 112, 119, NOUN), (run, 120, 123, NOUN)]","[(legend, 16, 22, NOUN), (veteran, 35, 42, NOUN), (Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (Mic, 57, 60, PROPN), (pm, 75, 77, NOUN), (vocals-, 112, 119, NOUN), (run, 120, 123, NOUN)]"


In [112]:
column = 'named_nouns'
render_entities(1, df, options=options, column=column)

In [113]:
#function to extract noun phrase

def extract_noun_phrases(text):
    return [(chunk.text: chunk.label_) for chunk in nlp(text).noun_chunks]

def add_noun_phrases(df):
    df['noun_phrases'] = df['text'].apply(extract_noun_phrases)

In [114]:
#Visualize noun phrases

def visualize_noun_phrases(text):
    df = pd.DataFrame([text]) 
    df.columns = ['text']
    add_noun_phrases(df)
    column = 'noun_phrases'
    render_entities(0, df, options=options, column=column)

In [157]:
df.to_csv(r'rashi_data_named_entities.csv', index = False)

In [197]:
df['named_ents'][2]

[('Bundle', 0, 6, 'GPE'),
 ('Boucher Park', 43, 55, 'LOC'),
 ('Clarence Street', 59, 74, 'FAC'),
 ('Springer Market Square', 103, 125, 'FAC'),
 ('Boucher Park', 184, 196, 'FAC'),
 ('the whole month of February', 214, 241, 'DATE'),
 ('Kingston', 288, 296, 'GPE')]

In [182]:
df

Unnamed: 0,Id,text,named_ents
0,0,"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and....","[(Murray McLauchlan, 0, 17, PERSON), (one, 21, 24, CARDINAL), (Canada, 28, 34, GPE), (48-year, 105, 112, CARDINAL), (19, 137, 139, CARDINAL)]"
1,1,"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,....","[(the fourth week of FebFest, 4, 30, DATE), (week, 119, 123, DATE), (Toast, 126, 131, ORG), (Canadian, 167, 175, NORP)]"
2,2,Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...,"[(Bundle, 0, 6, GPE), (Boucher Park, 43, 55, LOC), (Clarence Street, 59, 74, FAC), (Springer Market Square, 103, 125, FAC), (Boucher Park, 184, 196, FAC), (the whole month of February, 214, 241, DATE), (Kingston, 288, 296, GPE)]"
3,3,"Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family outing. Hours of operation are from 8 am to 10 pm Monday to Sunday ....","[(Kingston, 102, 110, GPE), (8 am to 10 pm, 217, 230, TIME), (Monday, 231, 237, DATE), (Sunday, 241, 247, DATE)]"
4,4,"If you’ve ever gone out for a pint in the city, chances are you’ve met Greg Ball. He’s a Kingston bartender who just happens to be signed with Ching Music. The....","[(Greg Ball, 71, 80, PERSON), (Kingston, 89, 97, GPE), (Ching Music, 143, 154, ORG)]"
5,5,Experiential Tourism is growing in popularity come take part in a cool ( not literally) artisan experience. Kingston Glass Studio & Gallery offers ....,"[(Kingston Glass Studio & Gallery, 108, 139, ORG)]"
6,6,You better come grab your seat early! This local favourite will pack our lounge! You never know who will join Chris Jackson on stage...,"[(Chris Jackson, 110, 123, PERSON)]"
7,7,Skate indoors on the large NHL - sized ice pad at The Centre 70 Arena. This leisurely skate is open to all ages and is monitored by the Skate Patrol...,"[(NHL, 27, 30, ORG), (The Centre 70 Arena, 50, 69, FAC), (the Skate Patrol, 132, 148, WORK_OF_ART)]"
8,8,"The Ice Cold Comedy Festival returns for its third year with some brand new venues, fresh new faces and some of our past favourites.","[(its third year, 41, 55, DATE)]"
9,9,"Hosted by local legend and musical veteran, Scotty, Open Mic kicks off at 7pm, and ends when our taps- and your vocals- run dry! No sign-up necessary and a......","[(Scotty, 44, 50, PERSON), (Open Mic, 52, 60, PERSON), (7pm, 74, 77, TIME)]"


In [296]:
named_ents_list= df['named_ents'].tolist()
event_desc= df['text'].tolist()

In [295]:
with open('tags.txt', 'w') as filehandle:
    for listitem in named_ents_list:
        filehandle.write('%s\n' % listitem)

In [319]:
import json
dict_ents = {}
for var in range(len(named_ents_list)):
    for i in range(len(named_ents_list[var])):
            key_obj= named_ents_list[var][i][3]
            if key_obj not in dict_ents:
                dict_obj={}
                dict_obj["key"]={named_ents_list[var][i][0]}
                dict_obj["document_id"] = {var}
                dict_ents['%s' %(key_obj)]= []
                dict_ents[key_obj].append(dict_obj)  
            else:
                dict_obj={}
                dict_obj["key"]={abc[var][i][0]}
                dict_obj["document_id"] = {var}
                dict_ents[key_obj].append(dict_obj)       

In [320]:
dict_ents

{'PERSON': [{'key': {'Murray McLauchlan'}, 'document_id': {0}},
  {'key': {'Greg Ball'}, 'document_id': {4}},
  {'key': {'Chris Jackson'}, 'document_id': {6}},
  {'key': {'Scotty'}, 'document_id': {9}},
  {'key': {'Open Mic'}, 'document_id': {9}},
  {'key': {"Enjoy Melos'"}, 'document_id': {10}},
  {'key': {'George'}, 'document_id': {10}},
  {'key': {'Greg Runions'}, 'document_id': {11}},
  {'key': {"Duke Ellington's"}, 'document_id': {11}},
  {'key': {'Tim Hortons Brier'}, 'document_id': {14}},
  {'key': {'The Tim Hortons Brier'}, 'document_id': {14}},
  {'key': {'Greg Ball'}, 'document_id': {15}},
  {'key': {'Artist Bob Young'}, 'document_id': {22}},
  {'key': {'Bob'}, 'document_id': {22}},
  {'key': {'Jordan Edward Benjamin'}, 'document_id': {25}},
  {'key': {'Greg Ball'}, 'document_id': {28}},
  {'key': {'The Skate Patrol'}, 'document_id': {29}},
  {'key': {'Artist Bob Young'}, 'document_id': {30}},
  {'key': {'Bob'}, 'document_id': {30}},
  {'key': {'Chris Jackson'}, 'document_id'

In [298]:
event_desc[0]

"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and...."

In [305]:
import json
dict_events = {}
dict_events['contents']= []
for var in range(len(event_desc)):
    dict_content = {}
    dict_content['id'] = {var}
    dict_content['content'] = {event_desc[var]}
    dict_events['contents'].append(dict_content)
    
                   

In [306]:
dict_events

{'contents': [{'id': {0},
   'content': {"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and...."}},
  {'id': {1},
   'content': {"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,...."}},
  {'id': {2},
   'content': {'Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...'}},
  {'id': {3},
   'content': {'Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family ou

In [316]:
import json
json.dumps(dict_ents)

TypeError: Object of type set is not JSON serializable

In [314]:
dict_events

{'contents': [{'id': {0},
   'content': {"Murray McLauchlan is one of Canada's most highly regarded singer/songwriters and with good cause. Over a 48-year career, he has released 19 albums and...."}},
  {'id': {1},
   'content': {"For the fourth week of FebFest there will be lots of outdoor activities with an ice stock demo, skating and more! This week's Toast to Tapas theme will be an all time Canadian favourite, bacon. As an added bonus,...."}},
  {'id': {2},
   'content': {'Bundle up the little kids and take them to Boucher Park on Clarence Street just around the corner from Springer Market Square and let the kids enjoy the ice slides and ice sculptures. Boucher Park will be open for the whole month of February (weather permitting) in the heart of downtown Kingston ...'}},
  {'id': {3},
   'content': {'Spring Market Square - come and enjoy a lovely, outdoor, skate to music in the historical downtown of Kingston. This public skate is free to the public and makes for a great family ou

In [315]:
f = open("content.txt","w")
f.write( str(dict_events) )
f.close()