# Ch02. Summarizing Text Documents Using NLP

* 바벨피쉬X싸이그래머 / 바벨스택 - CloudPsy
* 김무성

## Using Amazon Comprehend to Inspect Text and Determine the Primary Language 

In [1]:
import boto3

In [2]:
import json

In [8]:
import os

# Select the "awsml" named AWS profile through the AWS_PROFILE environment
# variable before any client is created.
os.environ["AWS_PROFILE"] = 'awsml'

In [9]:
# Amazon Comprehend client pinned to the us-east-1 region.
comprehend = boto3.client(
    service_name='comprehend',
    region_name='us-east-1',
)

In [10]:
# English sample sentence used for dominant-language detection.
english_string = "Machine Learning is fascinating."

In [13]:
# Detect the dominant language of the English sample and pretty-print the
# whole response as JSON (sorted keys, 4-space indent).
print('Calling DetectDominantLanguage')
print('english_string result:')
print(
    json.dumps(
        comprehend.detect_dominant_language(Text=english_string),
        indent=4,
        sort_keys=True,
    )
)

Calling DetectDominantLanguage
english_string result:
{
    "Languages": [
        {
            "LanguageCode": "en",
            "Score": 0.993855357170105
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "63",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 10:32:40 GMT",
            "x-amzn-requestid": "7eaa48d7-4644-11e9-aa5d-a7f870e922dd"
        },
        "HTTPStatusCode": 200,
        "RequestId": "7eaa48d7-4644-11e9-aa5d-a7f870e922dd",
        "RetryAttempts": 0
    }
}


In [11]:
# Spanish sample sentence used for dominant-language detection.
spanish_string = "El aprendizaje automático es fascinante."

In [18]:
print('Calling DetectDominantLanguage')
print('english_string result:')
print(json.dumps(comprehend.detect_dominant_language(Text = spanish_string), sort_keys=True, indent=4))

Calling DetectDominantLanguage
english_string result:
{
    "Languages": [
        {
            "LanguageCode": "es",
            "Score": 0.968570351600647
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "63",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 10:35:05 GMT",
            "x-amzn-requestid": "d4ef3e2f-4644-11e9-a565-090d9ae2c6ce"
        },
        "HTTPStatusCode": 200,
        "RequestId": "d4ef3e2f-4644-11e9-a565-090d9ae2c6ce",
        "RetryAttempts": 0
    }
}


In [16]:
# Korean sample sentence used for dominant-language detection.
korean_string = "기계학습은 멋져!!"

In [19]:
print('Calling DetectDominantLanguage')
print('english_string result:')
print(json.dumps(comprehend.detect_dominant_language(Text = korean_string), sort_keys=True, indent=4))

Calling DetectDominantLanguage
english_string result:
{
    "Languages": [
        {
            "LanguageCode": "ko",
            "Score": 1.0
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "49",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 10:35:19 GMT",
            "x-amzn-requestid": "dd7407d3-4644-11e9-88f0-5d2a54c2b2ce"
        },
        "HTTPStatusCode": 200,
        "RequestId": "dd7407d3-4644-11e9-88f0-5d2a54c2b2ce",
        "RetryAttempts": 0
    }
}


## Extracting Information in a Set of Documents

### Determining the Named Entities in a Document

In [20]:
import boto3
import json
import os

# Select the "awsml" named AWS profile through the AWS_PROFILE environment
# variable before any client is created.
os.environ["AWS_PROFILE"] = 'awsml'

In [21]:
# Amazon Comprehend client pinned to the us-east-1 region.
comprehend = boto3.client(
    service_name='comprehend',
    region_name='us-east-1',
)

In [22]:
# English sample sentence used for named-entity detection.
english_string = 'I study Machine Learning in Seattle on Thursday.'

In [24]:
# Run entity detection on the English sample and dump the full response as
# indented JSON with sorted keys, bracketed by start/end marker lines.
print('Calling DetectEntities')
print(
    json.dumps(
        comprehend.detect_entities(Text=english_string, LanguageCode='en'),
        indent=4,
        sort_keys=True,
    )
)
print('End of DetectEntities\n')

Calling DetectEntities
{
    "Entities": [
        {
            "BeginOffset": 28,
            "EndOffset": 35,
            "Score": 0.9982718229293823,
            "Text": "Seattle",
            "Type": "LOCATION"
        },
        {
            "BeginOffset": 39,
            "EndOffset": 47,
            "Score": 0.9937644004821777,
            "Text": "Thursday",
            "Type": "DATE"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "203",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 11:25:52 GMT",
            "x-amzn-requestid": "ecff5a09-464b-11e9-b2b9-575c05026259"
        },
        "HTTPStatusCode": 200,
        "RequestId": "ecff5a09-464b-11e9-b2b9-575c05026259",
        "RetryAttempts": 0
    }
}
End of DetectEntities



In [25]:
# Korean sample sentence used for named-entity detection
# (roughly: "I study machine learning at Yeoksam Station on Thursday.").
korean_string = '나는 목요일에 역삼역에서 기계학습을 공부한다.'

In [27]:
# Run entity detection on the Korean sample with LanguageCode='ko'.
# NOTE: in the recorded run this call raised ClientError (ValidationException)
# because 'ko' was not among the accepted LanguageCode values
# [de, pt, en, it, fr, es], so only the first marker line was printed.
print('Calling DetectEntities')
print(
    json.dumps(
        comprehend.detect_entities(Text=korean_string, LanguageCode='ko'),
        indent=4,
        sort_keys=True,
    )
)
print('End of DetectEntities\n')

Calling DetectEntities


ClientError: An error occurred (ValidationException) when calling the DetectEntities operation: 1 validation error detected: Value 'ko' at 'languageCode' failed to satisfy constraint: Member must satisfy enum value set: [de, pt, en, it, fr, es]

### Detecting Key Phrases

In [28]:
import boto3
import json
import os

# Select the "awsml" named AWS profile through the AWS_PROFILE environment
# variable before any client is created.
os.environ["AWS_PROFILE"] = 'awsml'

In [29]:
# Amazon Comprehend client pinned to the us-east-1 region.
comprehend = boto3.client(
    service_name='comprehend',
    region_name='us-east-1',
)

In [30]:
# English sample sentence used for key-phrase detection.
english_string = 'I study Machine Learning in Seattle on Thursday.'

In [34]:
# Run key-phrase detection on the English sample and dump the full response
# as indented JSON with sorted keys, bracketed by start/end marker lines.
print('Calling DetectKeyPhrases')
print(
    json.dumps(
        comprehend.detect_key_phrases(Text=english_string, LanguageCode='en'),
        indent=4,
        sort_keys=True,
    )
)
print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
{
    "KeyPhrases": [
        {
            "BeginOffset": 8,
            "EndOffset": 24,
            "Score": 0.9886739253997803,
            "Text": "Machine Learning"
        },
        {
            "BeginOffset": 28,
            "EndOffset": 35,
            "Score": 0.9990654587745667,
            "Text": "Seattle"
        },
        {
            "BeginOffset": 39,
            "EndOffset": 47,
            "Score": 0.9988609552383423,
            "Text": "Thursday"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "259",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 11:39:25 GMT",
            "x-amzn-requestid": "d17f7c9c-464d-11e9-8066-75946df059d0"
        },
        "HTTPStatusCode": 200,
        "RequestId": "d17f7c9c-464d-11e9-8066-75946df059d0",
        "RetryAttempts": 0
    }
}
End of DetectKeyPhrases



### Detecting Sentiment

In [35]:
import boto3
import json
import os

# Select the "awsml" named AWS profile through the AWS_PROFILE environment
# variable before any client is created.
os.environ["AWS_PROFILE"] = 'awsml'

# Amazon Comprehend client pinned to the us-east-1 region.
comprehend = boto3.client(
    service_name='comprehend',
    region_name='us-east-1',
)

In [36]:
# English sample sentence used for sentiment detection.
english_string = "Today is my birthday, I am so happy."

In [37]:
# Run sentiment detection on the English sample and dump the full response
# as indented JSON with sorted keys, bracketed by start/end marker lines.
print('Calling DetectSentiment')
print(
    json.dumps(
        comprehend.detect_sentiment(Text=english_string, LanguageCode='en'),
        indent=4,
        sort_keys=True,
    )
)
print('End of DetectSentiment\n')

Calling DetectSentiment
{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "166",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 14 Mar 2019 11:42:03 GMT",
            "x-amzn-requestid": "2fea2bee-464e-11e9-b65a-0f1b2d2e045a"
        },
        "HTTPStatusCode": 200,
        "RequestId": "2fea2bee-464e-11e9-b65a-0f1b2d2e045a",
        "RetryAttempts": 0
    },
    "Sentiment": "POSITIVE",
    "SentimentScore": {
        "Mixed": 0.0009548648959025741,
        "Negative": 0.00010232770000584424,
        "Neutral": 0.007493684068322182,
        "Positive": 0.9914490580558777
    }
}
End of DetectSentiment



# 참고자료
* [1] Machine Learning with AWS - https://www.amazon.com/Machine-Learning-AWS-artificial-intelligence-ebook/dp/B07HQ1TB44/
* [2] Pragmatic AI: An Introduction to Cloud-Based Machine Learning - https://www.amazon.com/Pragmatic-AI-Introduction-Cloud-Based-Analytics/dp/0134863860/