Created on October 31st 2021 by Patrick Rotzetter

https://www.linkedin.com/in/rotzetter/

## Small experiment of document mining with various techniques Part 8

Let us use Amazon Comprehend topic modeling on our sample documents.

### This notebook is meant to be run on Amazon SageMaker

In [None]:
#install pymongo package to get access to json.util
conda install -c anaconda pymongo 

In [2]:
#import boto and connect to Comprehend service

import time

import boto3
from bson.json_util import loads, dumps, default

 
# Amazon Comprehend client, created in the us-east-2 region
comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')


In [22]:
# List the files available for analysis in the S3 bucket and download each one locally
s3 = boto3.client('s3')
bucket='mlsamplesdocs'
subfolder=''
# 'Contents' is absent from the response when no object matches the prefix,
# so fall back to an empty list instead of raising a KeyError.
# NOTE(review): list_objects returns a single page of results; paginate if the bucket grows large.
contents = s3.list_objects(Bucket=bucket, Prefix=subfolder).get('Contents', [])
for f in contents:
    print(f['Key'])
    # save the object in the working directory under the same name as its S3 key
    s3.download_file(bucket,f['Key'], f['Key'])

AI-bank-of-the-future-Can-banks-meet-the-AI-challenge-1.pdf.txt
Artificial Financial Intelligence.pdf.txt
Data machine the insurers using AI to reshape the industry Financial Times.pdf.txt
Digital-disruption-in-Insurance.pdf.txt
EPAM_Streamlining_the_Auto_Claims_Process_via_Integrated_IA.pdf.txt
Insurance-2030-The-impact-of-AI-on-the-future-of-insurance-F.pdf.txt
Issues_Paper_on_Increasing_Digitalisation_in_Insurance_and_its_Potential_Impact_on_Consumer_Outcomes.pdf.txt
Kaggle State of Machine Learning and Data Science 2020.pdf.txt
Module-1-Lecture-Slides.pdf.txt
Technology-and-innovation-in-the-insurance-sector.pdf.txt
WEF_Governance_of_Chatbots_in_Healthcare_2020.pdf.txt
ai-360-research.pdf.txt
ai-insurance.pdf.txt
fra-2020-artificial-intelligence_en.pdf.txt
sigma-5-2020-en.pdf.txt
sigma1_2020_en.pdf.txt


In [4]:
# Define the parameters for the topic detection job: input and output locations, input document format and number of topics.
# We also need to pass a role that allows the job to read the S3 bucket.

input_s3_url = "s3://mlsamplesdocs/"
input_doc_format = "ONE_DOC_PER_FILE"
output_s3_url = "s3://mycomprehendoutputbucket2203"
data_access_role_arn = "arn:aws:iam::012086180905:role/ComprehendS3Access"
number_of_topics = 10

In [5]:
# Describe where Comprehend reads the documents from and where it writes its results
input_data_config = {
    "S3Uri": input_s3_url,
    "InputFormat": input_doc_format,
}
output_data_config = {"S3Uri": output_s3_url}

# Submit the topic detection job; the response carries the id of the submitted job
start_topics_detection_job_result = comprehend.start_topics_detection_job(
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    DataAccessRoleArn=data_access_role_arn,
    NumberOfTopics=number_of_topics,
)
print('start_topics_detection_job_result: ' + dumps(start_topics_detection_job_result))

# Keep the job id: it is needed to query the job later on
job_id = start_topics_detection_job_result["JobId"]
print('job_id: ' + job_id)

start_topics_detection_job_result: {"JobId": "b713e353ab76ebbe4958359733b760e7", "JobArn": "arn:aws:comprehend:us-east-2:012086180905:topics-detection-job/b713e353ab76ebbe4958359733b760e7", "JobStatus": "SUBMITTED", "ResponseMetadata": {"RequestId": "f4d5c207-bc3c-40e5-82cf-935221dd48e6", "HTTPStatusCode": 200, "HTTPHeaders": {"x-amzn-requestid": "f4d5c207-bc3c-40e5-82cf-935221dd48e6", "content-type": "application/x-amz-json-1.1", "content-length": "175", "date": "Sun, 31 Oct 2021 07:49:11 GMT"}, "RetryAttempts": 0}}
job_id: b713e353ab76ebbe4958359733b760e7


In [33]:

# Poll the job until it reaches a terminal state, so that the notebook can be run
# from top to bottom without manually re-executing this cell until the job is done.
POLL_INTERVAL_SECONDS = 30
TERMINAL_JOB_STATUSES = {'COMPLETED', 'FAILED', 'STOPPED'}

while True:
    describe_topics_detection_job_result = comprehend.describe_topics_detection_job(JobId=job_id)
    job_properties = describe_topics_detection_job_result['TopicsDetectionJobProperties']
    if job_properties['JobStatus'] in TERMINAL_JOB_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# `default` lets dumps serialize the datetime fields (SubmitTime, EndTime) of the response
print('describe_topics_detection_job_result: ' + dumps(describe_topics_detection_job_result, default=default))

# Fail here rather than later, when downloading a results archive that was never written
if job_properties['JobStatus'] != 'COMPLETED':
    raise RuntimeError(
        'Topic detection job ' + job_id + ' ended with status ' + job_properties['JobStatus']
        + ': ' + job_properties.get('Message', 'no message provided')
    )
 

describe_topics_detection_job_result: {"TopicsDetectionJobProperties": {"JobId": "b713e353ab76ebbe4958359733b760e7", "JobArn": "arn:aws:comprehend:us-east-2:012086180905:topics-detection-job/b713e353ab76ebbe4958359733b760e7", "JobStatus": "COMPLETED", "SubmitTime": {"$date": 1635666551316}, "EndTime": {"$date": 1635667766827}, "InputDataConfig": {"S3Uri": "s3://mlsamplesdocs/", "InputFormat": "ONE_DOC_PER_FILE"}, "OutputDataConfig": {"S3Uri": "s3://mycomprehendoutputbucket2203/012086180905-TOPICS-b713e353ab76ebbe4958359733b760e7/output/output.tar.gz"}, "NumberOfTopics": 10, "DataAccessRoleArn": "arn:aws:iam::012086180905:role/ComprehendS3Access"}, "ResponseMetadata": {"RequestId": "b72f21f4-ac0c-4273-a91a-0fa0c22077fe", "HTTPStatusCode": 200, "HTTPHeaders": {"x-amzn-requestid": "b72f21f4-ac0c-4273-a91a-0fa0c22077fe", "content-type": "application/x-amz-json-1.1", "content-length": "579", "date": "Sun, 31 Oct 2021 08:13:35 GMT"}, "RetryAttempts": 0}}


In [34]:
# S3 URI of the results archive, as reported in the job description
outputfilename=describe_topics_detection_job_result['TopicsDetectionJobProperties']['OutputDataConfig']['S3Uri']

In [35]:
# Display the S3 URI of the results archive
outputfilename

's3://mycomprehendoutputbucket2203/012086180905-TOPICS-b713e353ab76ebbe4958359733b760e7/output/output.tar.gz'

In [36]:
def split_s3_path(s3_path):
    """Split an S3 URI into its bucket name and object key.

    Args:
        s3_path: A string such as "s3://bucket/some/key". The "s3://" scheme
            is optional; only a leading occurrence is removed.

    Returns:
        A (bucket, key) tuple of strings. The key is an empty string when the
        path contains no "/" after the bucket name.
    """
    scheme = "s3://"
    # Strip the scheme only at the start, so a key that happens to contain
    # the text "s3://" is left intact.
    remainder = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    # Everything before the first "/" is the bucket, everything after it is the key
    bucket, _, key = remainder.partition("/")
    return bucket, key

In [37]:
# Split the results URI into its bucket and key parts.
# NOTE(review): this rebinds `bucket`, which held the input bucket name in the file-listing cell.
bucket, key = split_s3_path(outputfilename)

In [38]:
# Display the bucket part of the results URI
bucket

'mycomprehendoutputbucket2203'

In [39]:
# Display the object key part of the results URI
key

'012086180905-TOPICS-b713e353ab76ebbe4958359733b760e7/output/output.tar.gz'

In [49]:
# Download the results archive from S3 into the current working directory
s3.download_file(bucket,key,'./output.tar.gz')

In [51]:
# Decompress the archive and unpack it; -v lists the extracted files
!gunzip < output.tar.gz | tar -xv

doc-topics.csv
topic-terms.csv


In [52]:
import pandas as pd

In [53]:
import seaborn as sea

In [56]:
# Load doc-topics.csv, one of the two files extracted from the results archive, into a DataFrame
doc_topics=pd.read_csv('doc-topics.csv')