In [7]:
!pip install watson-developer-cloud



In [8]:
# Usual Python imports
import sys
import os
import glob
import json

# BeautifulSoup, for parsing HTML
from bs4 import BeautifulSoup

# Matplotlib, for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Watson Python SDK
# If not installed run: 
# !pip install --upgrade watson-developer-cloud
import watson_developer_cloud

# Utility functions
%load_ext autoreload
%aimport helper
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
discovery_creds = helper.fetch_credentials('discovery')

discovery = watson_developer_cloud.DiscoveryV1(
                        version='2018-08-01',
                        url=discovery_creds['url'],
                        iam_apikey=discovery_creds['apikey'])

  


In [10]:
env, env_id = helper.fetch_object(
    discovery, "environment", "Bookworm",
    create=True, create_args=dict(
        description="A space to read and understand stories"  # feel free to edit
    ))

{'description': 'A space to read and understand stories', 'name': 'Bookworm'}
Created environment: Bookworm (5b5c1b03-1746-4aef-8148-4efd7f99c343)


In [11]:
# Lists existing configurations for the service instance and store default configuration id
configurations = discovery.list_configurations(environment_id=env_id).get_result()
cfg_id =  configurations['configurations'][0]['configuration_id']
print(json.dumps(configurations, indent=2))

{
  "configurations": [
    {
      "configuration_id": "34a30188-a7f4-4dc9-9b69-5e1bf08ee4c3",
      "name": "Default Configuration",
      "description": "The configuration used by default when creating a new collection without specifying a configuration_id.",
      "created": "2019-03-31T07:33:30.744Z",
      "updated": "2019-03-31T07:33:30.744Z"
    }
  ]
}


In [12]:
# Get default configuration details
config = discovery.get_configuration(environment_id=env_id, configuration_id=cfg_id).get_result()
print(json.dumps(config, indent=2))

{
  "configuration_id": "34a30188-a7f4-4dc9-9b69-5e1bf08ee4c3",
  "name": "Default Configuration",
  "created": "2019-03-31T07:33:30.744Z",
  "updated": "2019-03-31T07:33:30.744Z",
  "description": "The configuration used by default when creating a new collection without specifying a configuration_id.",
  "conversions": {
    "html": {
      "exclude_content": {
        "xpaths": []
      },
      "exclude_tag_attributes": [
        "EVENT_ACTIONS"
      ],
      "exclude_tags_completely": [
        "script",
        "sup"
      ],
      "exclude_tags_keep_content": [
        "font",
        "em",
        "span"
      ],
      "keep_content": {
        "xpaths": []
      }
    },
    "json_normalizations": [],
    "pdf": {
      "heading": {
        "fonts": [
          {
            "level": 1,
            "max_size": 80,
            "min_size": 24
          },
          {
            "bold": false,
            "italic": false,
            "level": 2,
            "max_size": 24,
     

In [13]:
data_dir = "data"
filename = os.path.join(data_dir, "sample.html")
with open(filename, "r") as f:
    res = discovery.test_configuration_in_environment(environment_id=env_id, configuration_id=cfg_id, file=f).get_result()
print(json.dumps(res, indent=2))

{
  "status": "completed",
  "original_media_type": "text/html",
  "snapshots": [
    {
      "step": "html_input",
      "snapshot": {
        "html": "<html>\n<head>\n    <title>Star Wars: Episode IV - A New Hope (Opening Crawl)</title>\n</head>\n<body>\n    <article>\n        <h1>Star Wars: Episode IV - A New Hope (Opening Crawl)</h1>\n        <p>\n            It is a period of civil war. Rebel spaceships, striking from a hidden base, have won their first victory against the evil Galactic Empire.\n        </p><p>\n            During the battle, Rebel spies managed to steal secret plans to the Empire's ultimate weapon, the DEATH STAR, an armored space station with enough power to destroy an entire planet.\n        </p><p>\n            Pursued by the Empire's sinister agents, Princess Leia races home aboard her starship, custodian of the stolen plans that can save her people and restore freedom to the galaxy...\n        </p>\n    </article>\n</body>"
      }
    },
    {
      "step":

In [14]:
# Take a closer look at the results from the "enrichments_output" or "normalizations_output" step
output = next((s["snapshot"] for s in res["snapshots"] if s["step"] == "enrichments_output"), None)
print(json.dumps(output, indent=2))

{
  "extracted_metadata": {
    "title": "Star Wars: Episode IV - A New Hope (Opening Crawl)"
  },
  "html": "<?xml version='1.0' encoding='UTF-8' standalone='yes'?><html>\n<head>\n    <meta content=\"text/html; charset=UTF-8\" http-equiv=\"Content-Type\"/>\n    \n    <title>Star Wars: Episode IV - A New Hope (Opening Crawl)</title>\n\n\n</head>\n<body>\n\n\n    <article>\n        <h1>Star Wars: Episode IV - A New Hope (Opening Crawl)</h1>\n        <p>\n            It is a period of civil war. Rebel spaceships, striking from a hidden base, have won their first victory against the evil Galactic Empire.\n        </p><p>\n            During the battle, Rebel spies managed to steal secret plans to the Empire's ultimate weapon, the DEATH STAR, an armored space station with enough power to destroy an entire planet.\n        </p><p>\n            Pursued by the Empire's sinister agents, Princess Leia races home aboard her starship, custodian of the stolen plans that can save her people and res

In [15]:
col, col_id = helper.fetch_object(discovery, "collection", "Story Chunks", environment_id=env_id,
    create=True, create_args=dict(
        environment_id=env_id, configuration_id=cfg_id,
        description="Stories and plots split up into chunks suitable for answering")
    )

{'environment_id': '5b5c1b03-1746-4aef-8148-4efd7f99c343', 'configuration_id': '34a30188-a7f4-4dc9-9b69-5e1bf08ee4c3', 'description': 'Stories and plots split up into chunks suitable for answering', 'name': 'Story Chunks'}
Created collection: Story Chunks (28f5db94-bc69-4ba7-86fc-95d0d3ffb814)


In [16]:
# Add documents to collection
doc_ids = []  # to store the generated id for each document added
for filename in glob.glob(os.path.join(data_dir, "Star-Wars", "*.html")):
    print("Adding file:", filename)
    with open(filename, "r") as f:
        # Split each individual <p> into its own "document"
        doc = f.read()
        soup = BeautifulSoup(doc, 'html.parser')
        for i, p in enumerate(soup.find_all('p')):
            doc_info = discovery.add_document(
                environment_id=env_id, 
                collection_id=col_id,
                file=json.dumps({"text": p.get_text(strip=True)}),
                filename='n',
                file_content_type="application/json",
                metadata=json.dumps({"title": soup.title.get_text(strip=True)})
            ).get_result() # add get_result() here

            doc_ids.append(doc_info["document_id"])
print("Total", len(doc_ids), "documents added.")

Adding file: data\Star-Wars\Episode-III_Revenge-of-the-Sith.html
Adding file: data\Star-Wars\Episode-II_Attack-of-the-Clones.html
Adding file: data\Star-Wars\Episode-IV_A-New-Hope.html
Adding file: data\Star-Wars\Episode-I_The-Phantom-Menace.html
Adding file: data\Star-Wars\Episode-VII_The-Force-Awakens.html
Adding file: data\Star-Wars\Episode-VI_Return-of-the-Jedi.html
Adding file: data\Star-Wars\Episode-V_The-Empire-Strikes-Back.html
Adding file: data\Star-Wars\Rogue-One.html
Total 42 documents added.


In [17]:
# View collection details to verify all documents have been processed
col, col_id = helper.fetch_object(discovery, "collection", "Story Chunks", environment_id=env_id)

Found collection: Story Chunks (28f5db94-bc69-4ba7-86fc-95d0d3ffb814)


In [18]:
# List all fields extracted
discovery.list_collection_fields(environment_id=env_id, collection_id=col_id).get_result()

{'fields': [{'field': 'enriched_text.entities.sentiment.score',
   'type': 'double'},
  {'field': 'enriched_text.sentiment.document.label', 'type': 'string'},
  {'field': 'enriched_text.concepts.dbpedia_resource', 'type': 'string'},
  {'field': 'extracted_metadata.file_type', 'type': 'string'},
  {'field': 'enriched_text.concepts.text', 'type': 'string'},
  {'field': 'enriched_text.concepts.relevance', 'type': 'double'},
  {'field': 'enriched_text.categories.score', 'type': 'double'},
  {'field': 'enriched_text.entities.sentiment.label', 'type': 'string'},
  {'field': 'enriched_text.entities.relevance', 'type': 'double'},
  {'field': 'text', 'type': 'string'},
  {'field': 'enriched_text.entities.count', 'type': 'double'},
  {'field': 'enriched_text.entities.sentiment', 'type': 'nested'},
  {'field': 'enriched_text.sentiment.document', 'type': 'nested'},
  {'field': 'enriched_text.entities.text', 'type': 'string'},
  {'field': 'enriched_text.entities', 'type': 'nested'},
  {'field': 'en

In [19]:
# A simple query
results = discovery.query(environment_id=env_id, collection_id=col_id,
    query_options={
        "query": "enriched_text.relations.subject.text:\"Jar Jar\"",
        "return": "metadata.title,text"
    }).get_result()
print(json.dumps(results, indent=2))

{
  "matching_results": 42,
  "session_token": "1_M8p0S1Kd2Q5fwKU1_2L6AdPLPA",
  "results": [
    {
      "id": "b10dcd6e-f138-4930-bbfb-a91cd81ffbda",
      "result_metadata": {
        "score": 1
      },
      "extracted_metadata": {
        "sha1": "2535dcdeef6d5ec1b0433c636bea824971b2b659",
        "filename": "n",
        "file_type": "json"
      },
      "text": "Amidala's ship is unable to sustain its hyperdrive and lands for repairs on the desert planet Tatooine. Qui-Gon, Jar Jar, astromech droid R2-D2, and Amidala (in disguise as the handmaiden Padm\u00c3\u00a9) visit the settlement of Mos Espa to buy new parts at a junk shop. They meet the shop's owner Watto and his nine-year-old slave, Anakin Skywalker, who is a gifted pilot and engineer and has created a protocol droid called C-3PO. Qui-Gon senses a strong presence of the Force within Anakin and is convinced that he is the \"chosen one\" of Jedi prophecy who will bring balance to the Force. Qui-Gon wagers Anakin's freedom