In [5]:
!pip install --upgrade "watson-developer-cloud>=1.4.0"

Requirement already up-to-date: watson-developer-cloud>=1.4.0 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (1.4.1)
Requirement not upgraded as not directly required: Twisted>=13.2.0 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (from watson-developer-cloud>=1.4.0) (18.4.0)
Requirement not upgraded as not directly required: python-dateutil>=2.5.3 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (from watson-developer-cloud>=1.4.0) (2.7.3)
Requirement not upgraded as not directly required: autobahn>=0.10.9 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (from watson-developer-cloud>=1.4.0) (18.6.1)
Requirement not upgraded as not directly required: requests<3.0,>=2.0 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (from watson-developer-cloud>=1.4.0) (2.19.1)
Requirement not upgraded as not directly required: service-identity>=17.0.0 in /anaconda3/envs/tensorflow/lib/python3.6/site-packages (from watson-developer-cloud>=1.4.0) (17.0.0

In [1]:
import os
import sys
import errno
from watson_developer_cloud import DiscoveryV1
from dotenv import load_dotenv, find_dotenv
import json


In [6]:
try:
    load_dotenv(find_dotenv())
except IOError:
    print('warning: no .env file loaded')

# Create Discovery instance

In [7]:
discovery = DiscoveryV1(
              #url=os.getenv('DISCOVERY_URL'),
              username=os.getenv('DISCOVERY_USERNAME'),
              password=os.getenv('DISCOVERY_PASSWORD'),
              version="2018-03-05"
            )
environment_id = os.getenv('DISCOVERY_ENVIRONMENT_ID')
collection_id =  os.getenv('DISCOVERY_COLLECTION_ID')
configuration_id = os.getenv('DISCOVERY_CONFIGURATION_ID')
                                 

# List all fields

In [35]:
fields = discovery.list_fields(environment_id, [collection_id])
print(json.dumps(fields, indent=2))

{
  "fields": [
    {
      "field": "anchor",
      "type": "string"
    },
    {
      "field": "enriched_text.entities.sentiment.score",
      "type": "double"
    },
    {
      "field": "enriched_text.sentiment.document.label",
      "type": "string"
    },
    {
      "field": "enriched_text.concepts.dbpedia_resource",
      "type": "string"
    },
    {
      "field": "extracted_metadata.file_type",
      "type": "string"
    },
    {
      "field": "enriched_text.concepts.text",
      "type": "string"
    },
    {
      "field": "enriched_text.concepts.relevance",
      "type": "double"
    },
    {
      "field": "enriched_text.categories.score",
      "type": "double"
    },
    {
      "field": "digest",
      "type": "string"
    },
    {
      "field": "url",
      "type": "string"
    },
    {
      "field": "host",
      "type": "string"
    },
    {
      "field": "enriched_text.entities.sentiment.label",
      "type": "string"
    },
    {
      "field": "enriched_text

# List Configurations in environment

In [4]:
configs = discovery.list_configurations(environment_id)
print(json.dumps(configs, indent=2))

{
  "configurations": [
    {
      "configuration_id": "4c720927-1619-479c-b5e0-c8c79c9803e6",
      "name": "Default Configuration",
      "description": "The configuration used by default when creating a new collection without specifying a configuration_id.",
      "created": "2018-05-04T11:46:30.601Z",
      "updated": "2018-05-04T11:46:30.601Z"
    },
    {
      "configuration_id": "4d9f12a9-36fa-4876-b2d4-f3891341be60",
      "name": "Default Contract Configuration",
      "description": "Extract party, nature, and category from elements in PDFs.",
      "created": "2018-05-04T11:46:36.080Z",
      "updated": "2018-05-04T11:46:36.080Z"
    },
    {
      "configuration_id": "0806bbdd-6bef-4882-8ba9-093c37749337",
      "name": "sampleconfig",
      "description": null,
      "created": "2018-05-07T13:09:34.160Z",
      "updated": "2018-05-07T13:09:34.160Z"
    }
  ]
}


# Create collection progrmatically. Use this option in case you are not using collection created by tool. Uncomment the assigning collection to environment variable.

In [5]:
new_collection = discovery.create_collection(environment_id=environment_id, configuration_id=configuration_id, 
                                             name='Nature2', description='Create a Collection to hold natural calmaties information', 
                                             language='en')
print(json.dumps(new_collection, indent=2))
# **** Uncomment if you want to use the above created collection further ***
# collection_id = new_collection

{
  "name": "Nature2",
  "collection_id": "6cd34257-ac8c-43d5-b62f-61e0e9732eca",
  "description": "Create a Collection to hold natural calmaties information",
  "created": "2018-07-27T05:52:34.121Z",
  "updated": "2018-07-27T05:52:34.121Z",
  "configuration_id": "0806bbdd-6bef-4882-8ba9-093c37749337",
  "language": "en",
  "status": "active"
}


# List Collections in given environment

In [4]:
collections = discovery.list_collections(environment_id)
print(json.dumps(collections, indent=2))

{
  "collections": [
    {
      "collection_id": "e2688c58-055d-487d-921f-a41c0dde6a55",
      "name": "MorganStanley",
      "configuration_id": "0806bbdd-6bef-4882-8ba9-093c37749337",
      "language": "en",
      "status": "active",
      "description": "",
      "created": "2018-05-04T11:46:47.697Z",
      "updated": "2018-06-17T19:18:25.451Z"
    },
    {
      "collection_id": "be9d4ddb-863c-4e5f-8a5e-aeee9fc73bc0",
      "name": "Nature",
      "configuration_id": "4c720927-1619-479c-b5e0-c8c79c9803e6",
      "language": "en",
      "status": "active",
      "description": null,
      "created": "2018-07-05T17:39:47.081Z",
      "updated": "2018-07-05T17:39:47.081Z"
    }
  ]
}


# Create an environment

In [10]:

# Only one free environment is allowed per organization ---- so commenting
#response = discovery.create_environment(
#    name="my_environment",
#    description="My environment"
#)

#print(json.dumps(response, indent=2))

In [24]:
en_de = {"red" : "rot", "green" : "grün", "blue" : "blau", "yellow":"gelb"}
print (en_de)
print (en_de["red"])
de_fr = {"rot" : "rouge", "grün" : "vert", "blau" : "bleu", "gelb":"jaune"}
print ("The French word for red is: " + de_fr[en_de["red"]])

{'red': 'rot', 'green': 'grün', 'blue': 'blau', 'yellow': 'gelb'}
rot
The French word for red is: rouge


# List environments

In [29]:

environment_info = discovery.list_environments()
print(json.dumps(environment_info, indent=2))

{
  "environments": [
    {
      "environment_id": "system",
      "name": "Watson System Environment",
      "description": "Shared system data sources",
      "read_only": true
    },
    {
      "environment_id": "162e8398-ceff-42ab-8710-655087535953",
      "name": "byod",
      "description": "",
      "created": "2018-05-04T11:46:30.566Z",
      "updated": "2018-05-04T11:46:30.566Z",
      "read_only": false
    }
  ]
}


# List environments and list collection for news environment(system)

In [42]:
environments = discovery.list_environments()
print(json.dumps(environments, indent=2))

#news_environments = [x for x in environments['environments'] if x['name'] == 'Watson Discovery News Environment']
news_environments = [x for x in environments['environments'] if x['name'] == 'Watson System Environment']
print(news_environments, len(news_environments))
news_environment_id = news_environments[0]['environment_id']
print('news environemnt details \n',json.dumps(news_environment_id, indent=2))

collections = discovery.list_collections(news_environment_id)
news_collections = [x for x in collections['collections']]
print('news collection')
print(json.dumps(news_collections, indent=2))



{
  "environments": [
    {
      "environment_id": "system",
      "name": "Watson System Environment",
      "description": "Shared system data sources",
      "read_only": true
    },
    {
      "environment_id": "162e8398-ceff-42ab-8710-655087535953",
      "name": "byod",
      "description": "",
      "created": "2018-05-04T11:46:30.566Z",
      "updated": "2018-05-04T11:46:30.566Z",
      "read_only": false
    }
  ]
}
[{'environment_id': 'system', 'name': 'Watson System Environment', 'description': 'Shared system data sources', 'read_only': True}] 1
news environemnt details 
 "system"
news collection
[
  {
    "collection_id": "news-en",
    "name": "news-en",
    "language": "en",
    "status": "active",
    "description": "Watson News pre-enriched collection of curated news sources v2 (English)"
  },
  {
    "collection_id": "news-de",
    "name": "news-de",
    "language": "de",
    "status": "active",
    "description": "Watson News pre-enriched collection of curated news 

# Add a document

In [44]:
print(os.getcwd())
print(os.path.join('/Users/rajeshgudikoti/Documents/rajesh/IBMDigital/Projects_docs/OHUM/samplereports/', 'spectracell_micronutrients.pdf'))
###### rb argument is passed as pdf is binary document ################
with open(os.path.join(os.getcwd(), '/Users/rajeshgudikoti/Documents/rajesh/IBMDigital/Projects_docs/OHUM/samplereports/', 'spectracell_micronutrients.pdf'),'rb') as fileinfo:
    add_doc = discovery.add_document(environment_id, collection_id, file=fileinfo, file_content_type='application/pdf',filename='spectracell_micronutrients.pdf')
print(json.dumps(add_doc, indent=2))


/Users/rajeshgudikoti/Documents/learning/nlp/discovery
/Users/rajeshgudikoti/Documents/rajesh/IBMDigital/Projects_docs/OHUM/samplereports/spectracell_micronutrients.pdf


# Add a document reading online

In [43]:
!pip install PyPDF2
# If importing module fails even after above command executed sucessfully then navigate to conda prompt and run 
# pip uninstall PyPDF2
# pip install PyPDF2
#https://stackoverflow.com/questions/39241643/no-module-named-pypdf2-error

[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [44]:
import PyPDF2, requests,io

In [49]:
print('reading online pdffile ')
url = 'https://www.ifc.org/wps/wcm/connect/8b796b004970c0199a7ada336b93d75f/DisERHandbook.pdf?MOD=AJPERES'
response = requests.get(url)
my_raw_data = response.content
#print('raw data ', my_raw_data)
pdf_content = io.BytesIO(my_raw_data)
pdf_reader = PyPDF2.PdfFileReader(pdf_content)

if pdf_reader.isEncrypted:
    pdf_reader.decrypt("")
    print('pdf content decrypted \n', pdf_reader.getPage(0).extractText())

else:
    print('pdf content \n ', pdf_reader.getPage(0).extractText())
num_pages = pdf_reader.getNumPages()
print(num_pages)

reading online pdffile 
pdf content 
  ˜˚˛˝˛˙ˆˇ˘˝˘ˆˇˆ
ˇˆ˝ˇˆˆ˛˛
˘˚˝ˆ˘ˇ˘˛

72


In [50]:
url = 'https://www.ifc.org/wps/wcm/connect/8b796b004970c0199a7ada336b93d75f/DisERHandbook.pdf?MOD=AJPERES'
response = requests.get(url)
my_raw_data = response.content

with open("/Users/rajeshgudikoti/Documents/github_prj/cfc-fisrt-aid/DisERHandbook.pdf", 'wb') as my_data:
    my_data.write(my_raw_data)

open_pdf_file = open("my_pdf.pdf", 'rb')
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
if read_pdf.isEncrypted:
    read_pdf.decrypt("")
    print('pdf content decrypted \n', read_pdf.getPage(0).extractText())

else:
    print('pdf content \n',read_pdf.getPage(0).extractText())

pdf content 
 ˜˚˛˝˛˙ˆˇ˘˝˘ˆˇˆ
ˇˆ˝ˇˆˆ˛˛
˘˚˝ˆ˘ˇ˘˛



# Get document details

In [45]:
print(add_doc['status'])
document_id = add_doc['document_id']
print(document_id)
# doc_info = discovery.get_document_status(environment_id, collection_id, '3ca7f868-630d-485a-a0ed-6dea26482604')
doc_info = discovery.get_document_status(environment_id, collection_id, document_id)
print(json.dumps(doc_info, indent=2))

processing
02bc2855-9b0f-448c-8edc-23f8dc64ec3b
{
  "document_id": "02bc2855-9b0f-448c-8edc-23f8dc64ec3b",
  "notices": [],
  "status": "available",
  "filename": "spectracell_micronutrients.pdf",
  "file_type": "pdf",
  "sha1": "59566e8927157d7f8be82634e43ad0f931e3449c"
}


# Update a document

In [41]:
#document_id = 'c346ec99-693f-4c0e-8f39-fd8633cb4725'
with open(os.path.join(os.getcwd(), '/Users/rajeshgudikoti/Documents/rajesh/IBMDigital/Projects_docs/OHUM/samplereports/', 'spectracell_micronutrients.pdf'),'rb') as fileinfo:
    add_updated_doc = discovery.update_document(environment_id, collection_id, document_id,
                                                file=fileinfo, 
                                                file_content_type='application/pdf',
                                                filename='spectracell_micronutrients.pdf')
print(json.dumps(add_updated_doc, indent=2))

{
  "document_id": "02bc2855-9b0f-448c-8edc-23f8dc64ec3b",
  "status": "processing"
}


In [43]:
print(add_updated_doc['status'])
document_id = add_updated_doc['document_id']
print(document_id)
doc_info = discovery.get_document_status(environment_id, collection_id, document_id)
print(json.dumps(doc_info, indent=2))

processing
02bc2855-9b0f-448c-8edc-23f8dc64ec3b
{
  "document_id": "02bc2855-9b0f-448c-8edc-23f8dc64ec3b",
  "notices": [],
  "status": "available",
  "filename": "spectracell_micronutrients.pdf",
  "file_type": "pdf",
  "sha1": "59566e8927157d7f8be82634e43ad0f931e3449c"
}


In [None]:
# Delete a document

In [17]:
delete_doc = discovery.delete_document(environment_id, collection_id, document_id)
print(json.dumps(delete_doc, indent=2))

{
  "document_id": "c346ec99-693f-4c0e-8f39-fd8633cb4725",
  "status": "deleted"
}


# query using discovery query langauge syntax

In [13]:
qopts = {'query': "entities.text:'Johnny Depp'"}
my_query = discovery.query('162e8398-ceff-42ab-8710-655087535953', 'e2688c58-055d-487d-921f-a41c0dde6a55', qopts)
print(json.dumps(my_query, indent=2))

{
  "matching_results": 2,
  "results": [
    {
      "id": "ef51f518fcb5b92a81f89df88342dd028427add685761d72612b54a2d15d7724",
      "result_metadata": {
        "score": 0
      },
      "digest": "ef2d9e5405c599b970ad8dae61ad26d7",
      "url": "https://en.wikipedia.org/wiki/Watson_(computer)",
      "host": "en.wikipedia.org",
      "text": "Watson (computer) - Wikipedia Watson (computer) From Wikipedia, the free encyclopedia Jump to: navigation , search \"IBM Watson\" redirects here. For the IBM laboratory, see Thomas J. Watson Research Center . Watson's avatar , inspired by the IBM \" smarter planet \" logo [1] Watson is a question-answering computer system capable of answering questions posed in natural language , [2] developed in IBM 's DeepQA project by a research team led by principal investigator David Ferrucci . [3] Watson was named after IBM's first CEO, industrialist Thomas J. Watson . [4] [5] The computer system was initially developed to answer questions on the quiz sho

In [72]:
dict={'a':1,'b':2,'c':3}
dict_list=[]
dict_list=list(dict.values())
print (dict_list)

[1, 2, 3]


In [46]:
with open('expansions.json', 'r') as f:
        datastore = json.load(f)
print('datastore \n',datastore)
datastorelist = datastore.values()
print('datastorelist \n',datastorelist)

datastore 
 {'expansions': [{'expanded_terms': ['car', 'automobile', 'sedan', 'suv', 'sport utility vehicle', 'motor vehicle']}, {'input_terms': ['weekday', 'week day'], 'expanded_terms': ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']}, {'input_terms': ['weekend', 'week end'], 'expanded_terms': ['saturday', 'sunday']}]}
datastorelist 
 dict_values([[{'expanded_terms': ['car', 'automobile', 'sedan', 'suv', 'sport utility vehicle', 'motor vehicle']}, {'input_terms': ['weekday', 'week day'], 'expanded_terms': ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']}, {'input_terms': ['weekend', 'week end'], 'expanded_terms': ['saturday', 'sunday']}]])


In [48]:
expansions_list = discovery.list_expansions(environment_id, collection_id)
print(type(expansions_list), '\t', type(list(expansions_list.values())), (list(expansions_list.values())[0]))
print(json.dumps(expansions_list, indent=2))
expansion = json.dumps({
   "expansions": [
     {
       "expanded_terms": [
         "car",
         "automobile",
         "motor vehicle"
       ]
     }
   ]
 })
with open('expansions.json', 'r') as f:
        datastore = json.load(f)
print('datastore \t',datastore, '\n',type(datastore))
expansion_json = json.loads(expansion)

#d = {"age":25} 
kwargs = {"age":25}

datastore_list = list(datastore.values())
print(type(expansion),'\t',expansion)
print(type(expansion_json),'\t',expansion_json)
# ************ create is failing ***************** #
#expansions_list = discovery.create_expansions(environment_id,collection_id,list(datastore))
expansions_list = discovery.list_expansions(environment_id, collection_id)

print('expansions_list ********** \n',json.dumps(expansions_list, indent=2))


<class 'dict'> 	 <class 'list'> [{'expanded_terms': ['car', 'automobile', 'sedan', 'suv', 'sport utility vehicle', 'motor vehicle']}, {'input_terms': ['weekday', 'week day'], 'expanded_terms': ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']}, {'input_terms': ['weekend', 'week end'], 'expanded_terms': ['saturday', 'sunday']}]
{
  "expansions": [
    {
      "expanded_terms": [
        "car",
        "automobile",
        "sedan",
        "suv",
        "sport utility vehicle",
        "motor vehicle"
      ]
    },
    {
      "input_terms": [
        "weekday",
        "week day"
      ],
      "expanded_terms": [
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday"
      ]
    },
    {
      "input_terms": [
        "weekend",
        "week end"
      ],
      "expanded_terms": [
        "saturday",
        "sunday"
      ]
    }
  ]
}
datastore 	 {'expansions': [{'expanded_terms': ['car', 'automobile', 'sedan', 'suv', 'sport utility 

In [51]:
import json

r = {'is_claimed': 'True', 'rating': 3.5}
print(type(r)) #Output
r = json.dumps(r)
loaded_r = json.loads(r)
loaded_r['rating'] #Output 3.5
print(type(r)) #Output str
print(type(loaded_r)) #Output dict

<class 'dict'>
<class 'str'>
<class 'dict'>


In [49]:
def list_collections(self, environment_id):
        """
        Retrieves information about the collections within a given environment
        :param environment_id: this is the guid of a valid environment
        :return: json results of the collections in an environment
        """
        url_string = '/v1/environments/{0}/collections'.format(
            environment_id)
        return self.request(method='GET', url=url_string,
                            params={"version": self.version},
                            accept_json=True)
list_collections(discovery,environment_id)

{'collections': [{'collection_id': 'e2688c58-055d-487d-921f-a41c0dde6a55',
   'configuration_id': '0806bbdd-6bef-4882-8ba9-093c37749337',
   'created': '2018-05-04T11:46:47.697Z',
   'description': '',
   'language': 'en',
   'name': 'MorganStanley',
   'status': 'active',
   'updated': '2018-06-17T19:18:25.451Z'},
  {'collection_id': 'be9d4ddb-863c-4e5f-8a5e-aeee9fc73bc0',
   'configuration_id': '4c720927-1619-479c-b5e0-c8c79c9803e6',
   'created': '2018-07-05T17:39:47.081Z',
   'description': None,
   'language': 'en',
   'name': 'Nature',
   'status': 'active',
   'updated': '2018-07-05T17:39:47.081Z'}]}