### Below exercise is meant for understanding how to integrate Python on a System with AWS Comprehend Services.
#### Please note you need to have an AWS account for this and you need to complete the credentials and config settings on the system beforehand

#### We are going to do the below activities through this exercise:

1. Download the data & split it into train and test/truth dataset
2. Upload the data on AWS S3 by creating a bucket by calling S3 API
3. Build and train a classifier model using AWS Comprehend Services by calling Comprehend API
4. Once the model is build and trained, expose it to the test dataset from S3 bucket
5. Evaluate the model for accuracy, precision, recall, f1 score, etc

### Importing the Libraries

In [46]:
import numpy as np
import pandas as pd
import os
import boto3
import json
import tarfile
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
warnings.filterwarnings("ignore") 

### Importing the data

In [None]:
!wget http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip

In [None]:
!unzip bbc-fulltext.zip -d bbc

In [2]:
df = pd.DataFrame()
mapping = {}
source_path = "bbc/bbc/"

for i in sorted(os.listdir(source_path)):
    if os.path.isdir(source_path+i):
        mapping[i] = sorted(os.listdir(source_path+i))[:300]

label = []
data = []

for key, file in mapping.items():
    for fcontent in file:
        label.append(key)
        data.append(str(open(source_path+i+"/"+fcontent, encoding="cp1252").read().replace("\n"," "))[:5000])
        
df["document"] = data
df["label"] = label

#df = df.sample(frac=1).reset_index(drop=True)

df.to_csv("dataset.csv", header=False, index=False)

### Creating Train & Test Datasets

In [3]:
X, y = df['document'], df['label']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

In [5]:
print('Xtrain Shape', X_train.shape)
print('Xtest Shape', X_test.shape)
print('y train Shape', y_train.shape)
print('y test Shape', y_test.shape)

Xtrain Shape (1050,)
Xtest Shape (450,)
y train Shape (1050,)
y test Shape (450,)


### Creating Train, Test & Truth Files for Uploading to S3

In [6]:
df_train = pd.DataFrame()
df_train['label'] = y_train
df_train['document'] = X_train

df_test = pd.DataFrame()
df_test['document'] = X_test

df_truth = pd.DataFrame()
df_truth['label'] = y_test
df_truth['document'] = X_test

df_train.to_csv('train.csv',index=False, header=False)
df_test.to_csv('test.csv',index=False, header=False)
df_truth.to_csv('truth.csv',index=False, header=False)

### Creating a new S3 Bucket

In [7]:
s3_client = boto3.client('s3')
comprehend_client = boto3.client('comprehend')

In [170]:
response = s3_client.create_bucket(Bucket='doc-classifier-radwani',
                               ACL='private',
                               CreateBucketConfiguration={'LocationConstraint': 'ap-south-1'})
response

{'ResponseMetadata': {'RequestId': '8C2E661B9F978BE5',
  'HostId': '55imwuXai0y7PtK/PbKg4Ggy9NlAvbcfX47KNc26qBne3PJdmX/a72fCVn/AsgREWguIjZEwT8I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '55imwuXai0y7PtK/PbKg4Ggy9NlAvbcfX47KNc26qBne3PJdmX/a72fCVn/AsgREWguIjZEwT8I=',
   'x-amz-request-id': '8C2E661B9F978BE5',
   'date': 'Tue, 24 Nov 2020 11:45:28 GMT',
   'location': 'http://doc-classifier-radwani.s3.amazonaws.com/',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Location': 'http://doc-classifier-radwani.s3.amazonaws.com/'}

### Uploading data on AWS S3

In [8]:
s3_client.upload_file(Filename="train.csv",Bucket="doc-classifier-radwani",Key="train.csv")
s3_client.upload_file(Filename="test.csv",Bucket="doc-classifier-radwani",Key="test.csv")
s3_client.upload_file(Filename="truth.csv",Bucket="doc-classifier-radwani",Key="truth.csv")

### Creating a Custom Classifier & Training it

In [9]:
# Create a document classifier
create_response = comprehend_client.create_document_classifier(
    InputDataConfig={
        'S3Uri': 's3://doc-classifier-radwani/train.csv'
    },
    DataAccessRoleArn='arn:aws:iam::341732173095:role/service-role/AmazonComprehendServiceRole-doc-classifier-rad-role-2',
    DocumentClassifierName='doc-classifier-by-radwani',
    LanguageCode='en'
)
print("Create response:\n")
create_response

Create response:



{'DocumentClassifierArn': 'arn:aws:comprehend:ap-south-1:341732173095:document-classifier/doc-classifier-by-radwani',
 'ResponseMetadata': {'RequestId': 'ec597c89-5ab9-4f7c-9b7e-8f2da7f5e2a8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ec597c89-5ab9-4f7c-9b7e-8f2da7f5e2a8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '116',
   'date': 'Tue, 24 Nov 2020 15:30:52 GMT'},
  'RetryAttempts': 0}}

In [32]:
# Check the status of the classifier
describe_response = comprehend_client.describe_document_classifier(DocumentClassifierArn=create_response['DocumentClassifierArn'])
print("Status of Classifier:\n")
describe_response['DocumentClassifierProperties']['Status']

Status of Classifier:



'TRAINED'

### Creating a custom job and using classifier on test dataset

In [33]:
start_response = comprehend_client.start_document_classification_job(
    InputDataConfig={
        'S3Uri': 's3://doc-classifier-radwani/test.csv',
        'InputFormat': 'ONE_DOC_PER_LINE'
    },
    OutputDataConfig={
        'S3Uri': 's3://doc-classifier-radwani/'
    },
    DataAccessRoleArn='arn:aws:iam::341732173095:role/service-role/AmazonComprehendServiceRole-doc-classifier-rad-role-2',
    DocumentClassifierArn=
    'arn:aws:comprehend:ap-south-1:341732173095:document-classifier/doc-classifier-by-radwani'
)

print("Start response:\n")
start_response

Start response:



{'JobId': '679c821820e5036c986eecfb0ba943eb',
 'JobStatus': 'SUBMITTED',
 'ResponseMetadata': {'RequestId': 'f1b2c08b-15ab-4cde-b3db-f5d5f2390c3a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f1b2c08b-15ab-4cde-b3db-f5d5f2390c3a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '68',
   'date': 'Tue, 24 Nov 2020 16:22:54 GMT'},
  'RetryAttempts': 0}}

In [35]:
# Check the status of the job
describe_response = comprehend_client.describe_document_classification_job(JobId=start_response['JobId'])
print("Job Status:\n")
describe_response['DocumentClassificationJobProperties']['JobStatus']

Job Status:



'COMPLETED'

### Checking the test output against truth dataset

#### Get the output file

In [37]:
s3_client.download_file(Bucket='doc-classifier-radwani', Key='341732173095-CLN-679c821820e5036c986eecfb0ba943eb/output/output.tar.gz', Filename='output.tar.gz')

In [38]:
my_tar = tarfile.open('output.tar.gz')
my_tar.extractall('./outputfiles') # specify which folder to extract to
my_tar.close()

In [39]:
with open('outputfiles/predictions.jsonl', 'r') as file:
    file = file.readlines()

In [40]:
from pprint import pprint

In [41]:
for line in file:
    pprint(json.loads(line))

{'Classes': [{'Name': 'business', 'Score': 0.2168},
             {'Name': 'entertainment', 'Score': 0.2139},
             {'Name': 'politics', 'Score': 0.1998}],
 'File': 'test.csv',
 'Line': '0'}
{'Classes': [{'Name': 'business', 'Score': 0.2166},
             {'Name': 'entertainment', 'Score': 0.213},
             {'Name': 'politics', 'Score': 0.2026}],
 'File': 'test.csv',
 'Line': '1'}
{'Classes': [{'Name': 'business', 'Score': 0.2156},
             {'Name': 'entertainment', 'Score': 0.2149},
             {'Name': 'politics', 'Score': 0.2019}],
 'File': 'test.csv',
 'Line': '2'}
{'Classes': [{'Name': 'entertainment', 'Score': 0.2151},
             {'Name': 'business', 'Score': 0.2147},
             {'Name': 'politics', 'Score': 0.2016}],
 'File': 'test.csv',
 'Line': '3'}
{'Classes': [{'Name': 'business', 'Score': 0.2156},
             {'Name': 'entertainment', 'Score': 0.2139},
             {'Name': 'politics', 'Score': 0.2009}],
 'File': 'test.csv',
 'Line': '4'}
{'Classes': [{'N

             {'Name': 'entertainment', 'Score': 0.2124},
             {'Name': 'politics', 'Score': 0.2068}],
 'File': 'test.csv',
 'Line': '168'}
{'Classes': [{'Name': 'business', 'Score': 0.2159},
             {'Name': 'entertainment', 'Score': 0.2149},
             {'Name': 'politics', 'Score': 0.2009}],
 'File': 'test.csv',
 'Line': '169'}
{'Classes': [{'Name': 'business', 'Score': 0.2144},
             {'Name': 'entertainment', 'Score': 0.2133},
             {'Name': 'sport', 'Score': 0.2016}],
 'File': 'test.csv',
 'Line': '170'}
{'Classes': [{'Name': 'entertainment', 'Score': 0.2144},
             {'Name': 'business', 'Score': 0.2136},
             {'Name': 'politics', 'Score': 0.2026}],
 'File': 'test.csv',
 'Line': '171'}
{'Classes': [{'Name': 'business', 'Score': 0.2126},
             {'Name': 'politics', 'Score': 0.21},
             {'Name': 'entertainment', 'Score': 0.2087}],
 'File': 'test.csv',
 'Line': '172'}
{'Classes': [{'Name': 'business', 'Score': 0.215},
           

 'Line': '371'}
{'Classes': [{'Name': 'business', 'Score': 0.2174},
             {'Name': 'entertainment', 'Score': 0.2139},
             {'Name': 'politics', 'Score': 0.2004}],
 'File': 'test.csv',
 'Line': '372'}
{'Classes': [{'Name': 'business', 'Score': 0.2163},
             {'Name': 'entertainment', 'Score': 0.2106},
             {'Name': 'politics', 'Score': 0.2044}],
 'File': 'test.csv',
 'Line': '373'}
{'Classes': [{'Name': 'business', 'Score': 0.2141},
             {'Name': 'entertainment', 'Score': 0.2136},
             {'Name': 'politics', 'Score': 0.2023}],
 'File': 'test.csv',
 'Line': '374'}
{'Classes': [{'Name': 'business', 'Score': 0.2125},
             {'Name': 'entertainment', 'Score': 0.2087},
             {'Name': 'politics', 'Score': 0.208}],
 'File': 'test.csv',
 'Line': '375'}
{'Classes': [{'Name': 'business', 'Score': 0.2148},
             {'Name': 'entertainment', 'Score': 0.2127},
             {'Name': 'politics', 'Score': 0.2017}],
 'File': 'test.csv',
 'Line

In [42]:
predictedLabels = []
for line in file:
    predictedLabels.append(json.loads(line)['Classes'][0]['Name'])

In [43]:
df_truth['pred'] = predictedLabels

In [48]:
print('Precision: ',precision_score(df_truth['pred'], df_truth['label'],average='weighted'))

Precision:  0.660888888888889
