In [1]:
# Import all the required libraries

import os;
import numpy as np;
import pandas as pd;

Importing the input data from an S3 bucket

In [2]:
# S3 bucket that holds both the input JSON and the outputs written later in this notebook
s3_bkt = "s3://aws-shweta-poc"
s3_json_in = s3_bkt + "/input_data/SCalcET8_out.json"
# `orient` must be passed by keyword: it is keyword-only in pandas >= 2.0, where the
# positional form raises a TypeError. With orient='index' the top-level JSON keys
# become the row labels of the frame.
df = pd.read_json(s3_json_in, orient='index')

In [3]:
# Preview the first three rows to sanity-check the load
df.head(n=3)

Unnamed: 0,cendoc,chaptername,narrativeid,plaintext,sectionname,titlename
SCalcET8 1.1,"<cl:sect2 xmlns:cl=""http://xml.cengage-learnin...",Functions and Models,SCalcET8 1.1,Four Ways to Represent a Function 1.1 Functi...,Four Ways to Represent a Function,Calculus: Early Transcendentals
SCalcET8 1.2,"<cl:sect2 xmlns:cl=""http://xml.cengage-learnin...",Functions and Models,SCalcET8 1.2,Mathematical Models: A Catalog of Essential F...,Mathematical Models: A Catalog of Essential Fu...,Calculus: Early Transcendentals
SCalcET8 1.3,"<cl:sect2 xmlns:cl=""http://xml.cengage-learnin...",Functions and Models,SCalcET8 1.3,New Functions from Old Functions 1.3 In this...,New Functions from Old Functions,Calculus: Early Transcendentals


Specifying the output location

In [4]:
# Write a headerless CSV (narrativeid, chaptername, sectionname, plaintext) that is used
# as the input to the Comprehend topic model, then copy it to S3.
# The job reads this file as ONE_DOC_PER_LINE and its results are later joined back to
# the rows by line number, so every record must occupy exactly one physical line:
# line breaks embedded in any exported field are collapsed to a single space first.

# S3 location the CSV is uploaded to
s3_csv_out = s3_bkt + "/output_data/narrative.csv"

export_columns = ['narrativeid','chaptername','sectionname','plaintext']
# Work on a derived frame so `df` itself is left untouched
df_export = df[export_columns].replace(r'[\r\n]+', ' ', regex=True)
df_export.to_csv('/tmp/narrative.csv', index=False, header=False) #temp file
!aws s3 cp '/tmp/narrative.csv' '{s3_csv_out}' #copy temp file to output location



upload: ../../../../../tmp/narrative.csv to s3://aws-shweta-poc/output_data/narrative.csv


## 2. Topic Modeling using Comprehend

In [6]:
import time
from time import gmtime, strftime
import boto3

In [7]:
# Create the boto3 client used for every Comprehend call below
client = boto3.client(service_name='comprehend')

In [8]:
# Display the client object to confirm it was created
client

<botocore.client.Comprehend at 0x7f87c1928278>

In [26]:
# Number of topics requested from the Comprehend topic-detection job (passed as NumberOfTopics).
# TODO: open question - how was this initial number of topics decided?
num_topics = 20

In [11]:
# S3 prefix under which Comprehend writes the topic-model results
s3_comprehend_out = s3_bkt + "/output_data/comprehend_out/"

# Two IAM roles are involved:
#   1. the SageMaker execution role, chosen when the SageMaker notebook instance is created;
#   2. the Comprehend data-access role, which can be created from the Comprehend console
#      while submitting a job there; the role created that way is reused below.
comprehend_role = 'arn:aws:iam::684066638289:role/service-role/AmazonComprehendServiceRole-shweta-fromconsole'

# Where the job reads its documents from (one document per line) and where it writes results
input_config = {'S3Uri': s3_csv_out, 'InputFormat': 'ONE_DOC_PER_LINE'}
output_config = {'S3Uri': s3_comprehend_out}

# Submit the topic-detection job; the response carries the job id and initial status
response = client.start_topics_detection_job(
    JobName='narrative_topic_model',
    NumberOfTopics=num_topics,
    DataAccessRoleArn=comprehend_role,
    InputDataConfig=input_config,
    OutputDataConfig=output_config,
)

In [12]:
# Keep the job id and initial status from the start response for the polling loop
jobid, status = response['JobId'], response['JobStatus']

In [13]:
# Poll the topic-detection job until it leaves the SUBMITTED / IN_PROGRESS states,
# printing a UTC timestamp and the status on every check.
ACTIVE_STATES = ('SUBMITTED', 'IN_PROGRESS')
POLL_SECONDS = 120  # pause between status checks

while status in ACTIVE_STATES:
    response = client.describe_topics_detection_job(
        JobId=jobid
    )
    status = response['TopicsDetectionJobProperties']['JobStatus']
    print(strftime("%H:%M:%S", gmtime()) + ": " + status)
    # Sleep only when another poll is needed, so the cell returns as soon as the job ends
    if status in ACTIVE_STATES:
        time.sleep(POLL_SECONDS)

# Stop here if the job did not complete, instead of failing later in the download cells
if status != 'COMPLETED':
    failure_detail = response.get('TopicsDetectionJobProperties', {}).get('Message', 'no message returned')
    raise RuntimeError(
        "Topic detection job " + jobid + " ended with status " + status + ": " + failure_detail
    )

14:43:38: IN_PROGRESS
14:45:38: IN_PROGRESS
14:47:38: IN_PROGRESS
14:49:38: IN_PROGRESS
14:51:38: IN_PROGRESS
14:53:38: IN_PROGRESS
14:55:38: IN_PROGRESS
14:57:38: IN_PROGRESS
14:59:39: IN_PROGRESS
15:01:39: IN_PROGRESS
15:03:39: IN_PROGRESS
15:05:39: IN_PROGRESS
15:07:39: IN_PROGRESS
15:09:39: COMPLETED


In [14]:
# Display the most recent job response; the output location is read from it in the next cell
response

{'TopicsDetectionJobProperties': {'JobId': '2519531e0d0be34d3c44796b76eb139f',
  'JobName': 'narrative_topic_model',
  'JobStatus': 'COMPLETED',
  'SubmitTime': datetime.datetime(2019, 6, 6, 14, 43, 34, 636000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2019, 6, 6, 15, 9, 15, 36000, tzinfo=tzlocal()),
  'InputDataConfig': {'S3Uri': 's3://aws-poc-shweta/output_data/narrative.csv',
   'InputFormat': 'ONE_DOC_PER_LINE'},
  'OutputDataConfig': {'S3Uri': 's3://aws-poc-shweta/output_data/comprehend_out/684066638289-TOPICS-2519531e0d0be34d3c44796b76eb139f/output/output.tar.gz'},
  'NumberOfTopics': 20,
  'DataAccessRoleArn': 'arn:aws:iam::684066638289:role/service-role/AmazonComprehendServiceRole-shweta-fromconsole'},
 'ResponseMetadata': {'RequestId': '1b031430-886d-11e9-bab1-658f388d00c0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1b031430-886d-11e9-bab1-658f388d00c0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '586',
   'date': 'Thu, 

In [15]:
# S3 URI of the job output, taken from the job properties in the response
output_data = response['TopicsDetectionJobProperties']['OutputDataConfig']['S3Uri']

In [16]:
# Show the output location that will be downloaded
output_data

's3://aws-poc-shweta/output_data/comprehend_out/684066638289-TOPICS-2519531e0d0be34d3c44796b76eb139f/output/output.tar.gz'

In [17]:
!aws s3 cp '{output_data}' /tmp # copy this data to a temp location

download: s3://aws-poc-shweta/output_data/comprehend_out/684066638289-TOPICS-2519531e0d0be34d3c44796b76eb139f/output/output.tar.gz to ../../../../tmp/output.tar.gz


In [18]:
!tar -xvzf /tmp/output.tar.gz -C /tmp

topic-terms.csv
doc-topics.csv


### Examining the topics

In [23]:
# Load the per-document topic assignments extracted from the Comprehend output archive
# and keep, for each input line, only its significant topics.

# Minimum proportion for a topic to count as significant for a document.
# TODO: open question - how was this significance filter decided?
MIN_PROPORTION = .5

doc_topics_raw = pd.read_csv('/tmp/doc-topics.csv')

# docname has the form "<filename>:<line number>"; split on the LAST colon so a file
# name that itself contains ':' is still handled
docname_parts = doc_topics_raw['docname'].str.rsplit(":", n=1, expand=True)

# Build the frame in one chain (no assignment into a column subset):
# add filename and numeric line, drop docname by selecting the wanted columns,
# then sort by line and by proportion (most significant first)
df_topics = (
    doc_topics_raw
    .assign(
        filename=docname_parts[0],
        line=pd.to_numeric(docname_parts[1], errors='coerce'),
    )
    .loc[:, ['line','filename','topic','proportion']]
    .sort_values(by=['line','proportion'], ascending=[True,False])
)

# filter: significant topics only
df_topics = df_topics[df_topics['proportion'] > MIN_PROPORTION]
df_topics.head(5)

Unnamed: 0,line,filename,topic,proportion
51,0,narrative.csv,2,0.850824
360,1,narrative.csv,2,0.796357
31,2,narrative.csv,2,0.936723
644,3,narrative.csv,2,0.917235
492,4,narrative.csv,2,0.92033


#### Add topic to narrative chapter/section info  

In [27]:
# Re-read the exported narrative CSV; it was written without a header row,
# so the column names are supplied here
narrative_columns = ['id','chaptername','sectionname','plaintext']
df_text = pd.read_csv('/tmp/narrative.csv', names=narrative_columns, header=None)
df_text.head(n=2)

Unnamed: 0,id,chaptername,sectionname,plaintext
0,SCalcET8 1.1,Functions and Models,Four Ways to Represent a Function,Four Ways to Represent a Function 1.1 Functi...
1,SCalcET8 1.2,Functions and Models,Mathematical Models: A Catalog of Essential Fu...,Mathematical Models: A Catalog of Essential F...


Combine the section metadata with the topic results returned by Comprehend

In [28]:
# Join each section (by its row position in df_text) to its topic rows, keyed on the line number
df_sectiontopics = df_text.merge(df_topics, left_index=True, right_on='line')
df_sectiontopics.head(n=2)

Unnamed: 0,id,chaptername,sectionname,plaintext,line,filename,topic,proportion
51,SCalcET8 1.1,Functions and Models,Four Ways to Represent a Function,Four Ways to Represent a Function 1.1 Functi...,0,narrative.csv,2,0.850824
360,SCalcET8 1.2,Functions and Models,Mathematical Models: A Catalog of Essential Fu...,Mathematical Models: A Catalog of Essential F...,1,narrative.csv,2,0.796357


#### Add terms for each topic

In [29]:
# Load the topic-terms file extracted from the Comprehend output archive
df_terms = pd.read_csv('/tmp/topic-terms.csv')
df_terms.head(2)

Unnamed: 0,topic,term,weight
0,0,x2,0.047832
1,0,y2,0.038478


In [30]:
# Reshape to one row per topic, with that topic's terms collected into a list.
# The 'term' column is selected before aggregating so the applied function never sees
# the grouping column (newer pandas warns when DataFrameGroupBy.apply operates on it).
df_termslist = df_terms.groupby('topic')['term'].apply(list).to_frame(name='terms')
df_termslist.head(2)

Unnamed: 0_level_0,terms
topic,Unnamed: 1_level_1
0,"[x2, y2, integral, region, coordinate, plane, ..."
1,"[ft, force, work, rate, mass, length, cm, time..."


Combine the section/topic dataframe above with the per-topic term lists

In [31]:
# Attach each topic's term list to the section/topic rows,
# then keep only the reporting columns, ordered by input line
report_columns = ['line','id','chaptername','sectionname','topic', 'terms']
sections_with_terms = df_sectiontopics.merge(df_termslist, left_on='topic', right_index=True)
df_sectiontopicsterms = sections_with_terms[report_columns].sort_values(by='line')

In [33]:
# Widen the display before showing the combined table.
# NOTE(review): max_colwidth = 0 appears to disable cell truncation in the pandas
# version used here (the term lists below are shown in full) - confirm on newer versions.
pd.options.display.max_colwidth = 0
pd.options.display.max_rows = 100
df_sectiontopicsterms

Unnamed: 0,line,id,chaptername,sectionname,topic,terms
51,0,SCalcET8 1.1,Functions and Models,Four Ways to Represent a Function,2,"[function, graph, domain, figure, exponential, loweralpha, model, sketch, inverse, time]"
360,1,SCalcET8 1.2,Functions and Models,Mathematical Models: A Catalog of Essential Functions,2,"[function, graph, domain, figure, exponential, loweralpha, model, sketch, inverse, time]"
31,2,SCalcET8 1.3,Functions and Models,New Functions from Old Functions,2,"[function, graph, domain, figure, exponential, loweralpha, model, sketch, inverse, time]"
644,3,SCalcET8 1.4,Functions and Models,Exponential Functions,2,"[function, graph, domain, figure, exponential, loweralpha, model, sketch, inverse, time]"
492,4,SCalcET8 1.5,Functions and Models,Inverse Functions and Logarithms,2,"[function, graph, domain, figure, exponential, loweralpha, model, sketch, inverse, time]"
163,5,SCalcET8 2.1,Limits and Derivatives,The Tangent and Velocity Problems,17,"[line, tangent, velocity, slope, average, time, point, secant, rate, problem]"
12,6,SCalcET8 2.2,Limits and Derivatives,The Limit of a Function,18,"[limx, limit, graph, exist, approach, large, value, law, asymptote, numb]"
558,7,SCalcET8 2.3,Limits and Derivatives,Calculating Limits Using the Limit Laws,18,"[limx, limit, graph, exist, approach, large, value, law, asymptote, numb]"
538,8,SCalcET8 2.4,Limits and Derivatives,The Precise Definition of a Limit,18,"[limx, limit, graph, exist, approach, large, value, law, asymptote, numb]"
624,9,SCalcET8 2.5,Limits and Derivatives,Continuity,7,"[function, continuous, limx, theorem, limit, y2, numb, exist, lim, x2]"
