In [None]:
# !pip install git+https://github.com/CODAIT/text-extensions-for-pandas

In [1]:
import urllib
import requests
import json
import sys
import pandas as pd
from bs4 import BeautifulSoup
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from watson_developer_cloud.natural_language_understanding_v1 \
import Features, EntitiesOptions, KeywordsOptions, SemanticRolesOptions, SentimentOptions, EmotionOptions, ConceptsOptions, CategoriesOptions

# IBM Watson libraries
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core


try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError as e:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        raise e
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp

In [2]:
query = "pizza"
query = query.replace(' ', '+')
res = requests.get('https://en.wikipedia.org/wiki/' + query )
res

<Response [200]>

In [3]:
res.raise_for_status()
wiki=BeautifulSoup(res.text,"html.parser")


In [4]:
wiki_text = ""
for i in wiki.select('p'):
    wiki_text+=i.getText().replace("\n","").replace("\xa0","")
    wiki_text

In [5]:
authenticator = IAMAuthenticator('')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)
natural_language_understanding.set_service_url('')

In [6]:
# Make the request
response = natural_language_understanding.analyze(
    text=wiki_text,
    # TODO: Use this URL once we've pushed the shortened document to Github
    #url="https://raw.githubusercontent.com/CODAIT/text-extensions-for-pandas/master/resources/holy_grail_short.txt",
    return_analyzed_text=True,
    features=nlu.Features(
        entities=nlu.EntitiesOptions(sentiment=True),
        keywords=nlu.KeywordsOptions(sentiment=True, emotion=True),
        relations=nlu.RelationsOptions(),
        semantic_roles=nlu.SemanticRolesOptions(),
        syntax=nlu.SyntaxOptions(sentences=True, 
                                 tokens=nlu.SyntaxOptionsTokens(lemma=True, part_of_speech=True))
    )).get_result()

In [7]:
response.keys()

dict_keys(['usage', 'syntax', 'semantic_roles', 'relations', 'language', 'keywords', 'entities', 'analyzed_text'])

In [8]:
response["syntax"]

{'tokens': [{'text': 'Pizza',
   'part_of_speech': 'NOUN',
   'location': [0, 5],
   'lemma': 'pizza'},
  {'text': '(', 'part_of_speech': 'PUNCT', 'location': [6, 7]},
  {'text': 'Italian',
   'part_of_speech': 'ADJ',
   'location': [7, 14],
   'lemma': 'Italian'},
  {'text': ':', 'part_of_speech': 'PUNCT', 'location': [14, 15], 'lemma': ':'},
  {'text': '[', 'part_of_speech': 'PUNCT', 'location': [15, 16]},
  {'text': 'ˈpittsa', 'part_of_speech': 'NOUN', 'location': [16, 23]},
  {'text': ']', 'part_of_speech': 'PUNCT', 'location': [23, 24]},
  {'text': ',', 'part_of_speech': 'PUNCT', 'location': [24, 25]},
  {'text': 'Neapolitan',
   'part_of_speech': 'ADJ',
   'location': [26, 36],
   'lemma': 'Neapolitan'},
  {'text': ':', 'part_of_speech': 'PUNCT', 'location': [36, 37], 'lemma': ':'},
  {'text': '[', 'part_of_speech': 'PUNCT', 'location': [37, 38]},
  {'text': 'ˈpittsə', 'part_of_speech': 'NOUN', 'location': [38, 45]},
  {'text': ']', 'part_of_speech': 'PUNCT', 'location': [45, 46]

In [9]:
dfs = tp.io.watson.nlu.parse_response(response)
dfs.keys()

SpanText: '[2][3] In casual settings, however, it is cut into wedges to be eaten while held in the hand.The term pizza was first recorded in the 10th century in a Latin manuscript from the Southern Italian town of Gaeta in Lazio, on the border with Campania.'
Sentence: 'The term pizza was first recorded in the 10th century in a Latin manuscript from the Southern Italian town of Gaeta in Lazio, on the border with Campania.'
SpanText: '[5][6] Many companies sell ready-baked frozen pizzas to be reheated in an ordinary home oven.The Associazione Verace Pizza Napoletana (lit.'
Sentence: '[5][6] Many companies sell ready-baked frozen pizzas to be reheated in an ordinary home oven.'
SpanText: 'After this, it is typically left undisturbed and allowed time to proof.Traditional pizza dough being tossedVarious toppings being placed on pan pizzasAn uncooked Neapolitan pizza on a metal peel, ready for the ovenIn restaurants, pizza can be baked in an oven with stone bricks above the heat source, an 

dict_keys(['syntax', 'entities', 'keywords', 'relations', 'semantic_roles'])

In [10]:
syntax_df = dfs["syntax"]
syntax_df

Unnamed: 0,span,part_of_speech,lemma,sentence
0,"[0, 5): 'Pizza'",NOUN,pizza,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
1,"[6, 7): '('",PUNCT,,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
2,"[7, 14): 'Italian'",ADJ,Italian,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
3,"[14, 15): ':'",PUNCT,:,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
4,"[15, 16): '['",PUNCT,,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
...,...,...,...,...
3173,"[15665, 15679): 'pizzaArgentine'",PROPN,,"[15432, 15710): '[72]Chicago-style pizza — dee..."
3174,"[15680, 15698): 'fugazzetta.Detroit'",PROPN,,"[15432, 15710): '[72]Chicago-style pizza — dee..."
3175,"[15698, 15699): '-'",PUNCT,,"[15432, 15710): '[72]Chicago-style pizza — dee..."
3176,"[15699, 15704): 'style'",NOUN,style,"[15432, 15710): '[72]Chicago-style pizza — dee..."


In [11]:
pronouns_by_sentence = syntax_df[syntax_df["part_of_speech"] == "PRON"][["sentence", "span"]]
pronouns_by_sentence

Unnamed: 0,sentence,span
64,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita...","[296, 301): 'which'"
95,"[424, 610): 'A person who makes pizza is known...","[433, 436): 'who'"
145,"[610, 857): '[2][3] In casual settings, howeve...","[646, 648): 'it'"
201,"[857, 972): '[4] Modern pizza was invented in ...","[915, 918): 'its'"
214,"[972, 1218): '[5] It has become one of the mos...","[976, 978): 'It'"
303,"[1358, 1516): 'True Neapolitan Pizza Associati...","[1466, 1470): 'that'"
349,"[1516, 1754): '[7] In 2009, upon Italy's reque...","[1681, 1684): 'its'"
466,"[2171, 2294): '[18] Records of people adding o...","[2236, 2238): 'it'"
505,"[2295, 2555): 'In the 6th century BC, the Pers...","[2449, 2454): 'their'"
519,"[2295, 2555): 'In the 6th century BC, the Pers...","[2514, 2519): 'their'"


In [12]:
syntax_df.head(3)

Unnamed: 0,span,part_of_speech,lemma,sentence
0,"[0, 5): 'Pizza'",NOUN,pizza,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
1,"[6, 7): '('",PUNCT,,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
2,"[7, 14): 'Italian'",ADJ,Italian,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."


In [13]:
syntax_df.dtypes

span                   SpanDtype
part_of_speech            object
lemma                     object
sentence          TokenSpanDtype
dtype: object

In [14]:
print(syntax_df["span"].array)

<SpanArray>
[                     [0, 5): 'Pizza',                          [6, 7): '(',
                   [7, 14): 'Italian',                        [14, 15): ':',
                        [15, 16): '[',                  [16, 23): 'ˈpittsa',
                        [23, 24): ']',                        [24, 25): ',',
               [26, 36): 'Neapolitan',                        [36, 37): ':',
 ...
                 [15647, 15649): 'of',                [15650, 15653): 'New',
               [15654, 15658): 'York',                  [15658, 15659): '-',
              [15659, 15664): 'style',     [15665, 15679): 'pizzaArgentine',
 [15680, 15698): 'fugazzetta.Detroit',                  [15698, 15699): '-',
              [15699, 15704): 'style',              [15705, 15710): 'pizza']
Length: 3178, dtype: SpanDtype


In [15]:
syntax_df["span"].array.begin[:10], syntax_df["span"].array.end[:10]

(array([ 0,  6,  7, 14, 15, 16, 23, 24, 26, 36]),
 array([ 5,  7, 14, 15, 16, 23, 24, 25, 36, 37]))

In [16]:
span_obj = syntax_df["span"].array[0]
print(f"\"{span_obj}\" is an object of type {type(span_obj)}")

"[0, 5): 'Pizza'" is an object of type <class 'text_extensions_for_pandas.array.span.Span'>


In [17]:
syntax_df["span"].iloc[:10].to_numpy()

array([[0, 5): 'Pizza', [6, 7): '(', [7, 14): 'Italian', [14, 15): ':',
       [15, 16): '[', [16, 23): 'ˈpittsa', [23, 24): ']', [24, 25): ',',
       [26, 36): 'Neapolitan', [36, 37): ':'], dtype=object)

In [18]:
# Show the first 10 tokens in context
syntax_df["span"].iloc[:10].array

Unnamed: 0,begin,end,covered_text
0,0,5,Pizza
1,6,7,(
2,7,14,Italian
3,14,15,:
4,15,16,[
5,16,23,ˈpittsa
6,23,24,]
7,24,25,","
8,26,36,Neapolitan
9,36,37,:


In [19]:
syntax_df.head(3)

Unnamed: 0,span,part_of_speech,lemma,sentence
0,"[0, 5): 'Pizza'",NOUN,pizza,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
1,"[6, 7): '('",PUNCT,,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
2,"[7, 14): 'Italian'",ADJ,Italian,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."


In [20]:
syntax_df["sentence"].unique()

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,0,374,0,81,"Pizza (Italian:[ˈpittsa], Neapolitan:[ˈpittsə]) is a savory dish of Italian origin consisting of a usually round, flattened base of leavened wheat-based dough topped with tomatoes, cheese, and often various other ingredients (such as anchovies, mushrooms, onions, olives, pineapple, meat, etc.), which is then baked at a high temperature, traditionally in a wood-fired oven."
1,374,423,81,93,[1] A small pizza is sometimes called a pizzetta.
2,424,610,93,133,"A person who makes pizza is known as a pizzaiolo.In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced, and is eaten with the use of a knife and fork."
3,610,857,133,187,"[2][3] In casual settings, however, it is cut into wedges to be eaten while held in the hand.The term pizza was first recorded in the 10th century in a Latin manuscript from the Southern Italian town of Gaeta in Lazio, on the border with Campania."
4,857,972,187,211,"[4] Modern pizza was invented in Naples, and the dish and its variants have since become popular in many countries."
5,972,1218,211,258,"[5] It has become one of the most popular foods in the world and a common fast food item in Europe and North America, available at pizzerias (restaurants specializing in pizza), restaurants offering Mediterranean cuisine, and via pizza delivery."
6,1218,1357,258,287,[5][6] Many companies sell ready-baked frozen pizzas to be reheated in an ordinary home oven.The Associazione Verace Pizza Napoletana (lit.
7,1358,1516,287,311,True Neapolitan Pizza Association) is a non-profit organization founded in 1984 with headquarters in Naples that aims to promote traditional Neapolitan pizza.
8,1516,1754,311,362,"[7] In 2009, upon Italy's request, Neapolitan pizza was registered with the European Union as a Traditional Speciality Guaranteed dish,[8][9] and in 2017 the art of its making was included on UNESCO's list of intangible cultural heritage."
9,1754,2072,362,431,"[10]The word ""pizza"" first appeared in a Latin text from the central Italian town of Gaeta, then still part of the Byzantine Empire, in 997 AD; the text states that a tenant of certain property is to give the bishop of Gaeta duodecim pizze (""twelve pizzas"") every Christmas Day, and another twelve every Easter Sunday."


In [21]:
token_span_array = syntax_df["sentence"].unique()
print(f"""
Offset information (stored in the TokenSpanArray):
`begin_token` property: {token_span_array.begin_token}
  `end_token` property: {token_span_array.end_token}
   
Token information (`tokens` property, shared among mulitple TokenSpanArrays):
{token_span_array.tokens}
""")



Offset information (stored in the TokenSpanArray):
`begin_token` property: [   0   81   93  133  187  211  258  287  311  362  431  453  476  529
  580  606  625  648  673  688  755  794  819  850  881  894  927  948
  979 1003 1022 1062 1143 1185 1208 1237 1256 1277 1308 1326 1368 1381
 1394 1414 1490 1510 1524 1546 1579 1602 1630 1667 1705 1742 1790 1806
 1873 1933 1957 2003 2030 2074 2108 2158 2216 2267 2296 2345 2374 2391
 2412 2442 2472 2499 2527 2564 2610 2677 2692 2735 2749 2778 2801 2821
 2840 2863 2876 2899 2910 2933 2984 3008 3046 3073 3106 3135]
  `end_token` property: [  81   93  133  187  211  258  287  311  362  431  453  476  529  580
  606  625  648  673  688  755  794  819  850  881  894  927  948  979
 1003 1022 1062 1143 1185 1208 1237 1256 1277 1308 1326 1368 1381 1394
 1414 1490 1510 1524 1546 1579 1602 1630 1667 1705 1742 1790 1806 1873
 1933 1957 2003 2030 2074 2108 2158 2216 2267 2296 2345 2374 2391 2412
 2442 2472 2499 2527 2564 2610 2677 2692 2735 2749 2778 2

In [22]:
syntax_df[["sentence"]].drop_duplicates()

Unnamed: 0,sentence
0,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita..."
81,"[374, 423): '[1] A small pizza is sometimes ca..."
93,"[424, 610): 'A person who makes pizza is known..."
133,"[610, 857): '[2][3] In casual settings, howeve..."
187,"[857, 972): '[4] Modern pizza was invented in ..."
...,...
3008,"[14813, 15024): '[69]Some attribute the appare..."
3046,"[15024, 15156): '[71]National Pizza Month is a..."
3073,"[15156, 15284): '[72][73][74][75] This observa..."
3106,"[15284, 15432): '[75] During this time, some p..."


In [23]:
pronouns_by_sentence

Unnamed: 0,sentence,span
64,"[0, 374): 'Pizza (Italian:[ˈpittsa], Neapolita...","[296, 301): 'which'"
95,"[424, 610): 'A person who makes pizza is known...","[433, 436): 'who'"
145,"[610, 857): '[2][3] In casual settings, howeve...","[646, 648): 'it'"
201,"[857, 972): '[4] Modern pizza was invented in ...","[915, 918): 'its'"
214,"[972, 1218): '[5] It has become one of the mos...","[976, 978): 'It'"
303,"[1358, 1516): 'True Neapolitan Pizza Associati...","[1466, 1470): 'that'"
349,"[1516, 1754): '[7] In 2009, upon Italy's reque...","[1681, 1684): 'its'"
466,"[2171, 2294): '[18] Records of people adding o...","[2236, 2238): 'it'"
505,"[2295, 2555): 'In the 6th century BC, the Pers...","[2449, 2454): 'their'"
519,"[2295, 2555): 'In the 6th century BC, the Pers...","[2514, 2519): 'their'"


In [24]:
pronouns_by_sentence.dtypes

sentence    TokenSpanDtype
span             SpanDtype
dtype: object

In [25]:
mask = pronouns_by_sentence["sentence"].map(lambda s: s.covered_text).str.contains("Pizza")
pronouns_by_sentence["span"][mask].values

Unnamed: 0,begin,end,covered_text
0,296,301,which
1,1466,1470,that
2,7488,7491,its
3,12879,12884,There
4,13016,13021,which
5,14759,14764,which
6,15073,15077,that
