Created on December 28th  2021 by Patrick Rotzetter

https://www.linkedin.com/in/rotzetter/

## Small experiment of document mining with various techniques Part 9

Let us use AWS built-in NTM algorithm for topic modeling

# Import libraries

In [1]:
# install NLTK and gensim if required
#%%sh
#pip3 -q install nltk gensim

In [2]:
# Import require libraries
import numpy as np
import PyPDF2
import pandas as pd
import pdftotext
import texthero as hero

# Data preparation

In [3]:
# function to read PDF files using pdftotext
def readPdfFile(filename):
    text=""
    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)
        for page in pdf:
            text=text+page
    return text

In [4]:
#path of test files
path='./sampledocs/'

In [5]:
# let us scan the full directory, read PDF and PPT documents, clean them and process them with spacy

docName=[]
docType=[]
docText=[]
docNLP=[]
import glob
list_of_files = glob.glob(path+'*.pdf')           # create the list of file
fileNames=[]
for file_name in list_of_files:
    fileText=readPdfFile(file_name)
    docName.append(file_name)
    docType.append('pdf')
    docText.append(fileText)
fullDocs = pd.DataFrame({'Name':docName,'Type':docType,'Text':docText})
fullDocs['cleanText']=hero.clean(fullDocs['Text'])


In [6]:
 print ("Average length of text:" + str((np.mean(fullDocs['Text'].str.len()))))
 print ("Min length of text:" + str((np.min(fullDocs['Text'].str.len()))))
 print ("Max length of text:" + str((np.max(fullDocs['Text'].str.len()))))

Average length of text:90946.61111111111
Min length of text:9170
Max length of text:328295


In [7]:
fullDocs['text_word_count'] = fullDocs['Text'].apply(lambda x: len(x.strip().split()))  # word count
fullDocs['text_unique_words']=fullDocs['Text'].apply(lambda x:len(set(str(x).split())))  # number of unique words
fullDocs.head()

Unnamed: 0,Name,Type,Text,cleanText,text_word_count,text_unique_words
0,./sampledocs/NIST.IR.8312.pdf,pdf,NISTIR 8312\n\nFour Principles of Explainable ...,nistir four principles explainable artificial ...,16792,5026
1,./sampledocs/ai-360-research.pdf,pdf,AI 360: insights from the\nnext frontier of bu...,ai insights next frontier business corner offi...,5281,1746
2,./sampledocs/Module-1-Lecture-Slides.pdf,pdf,"Application of AI, Insurtech and Real Estate\n...",application ai insurtech real estate technolog...,3728,1506
3,./sampledocs/Technology-and-innovation-in-the-...,pdf,Technology and\ninnovation in the\ninsurance s...,technology innovation insurance sector technol...,16742,4228
4,./sampledocs/AI-bank-of-the-future-Can-banks-m...,pdf,Global Banking & Securities\n\nAI-bank of the ...,global banking securities ai bank future banks...,5774,2144


In [8]:
fullDocs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               18 non-null     object
 1   Type               18 non-null     object
 2   Text               18 non-null     object
 3   cleanText          18 non-null     object
 4   text_word_count    18 non-null     int64 
 5   text_unique_words  18 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 992.0+ bytes


In [9]:
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
    
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

def process_text(text):
    for p in string.punctuation:
        text = text.replace(p, '')
    text = ''.join([c for c in text if not c.isdigit()])
    text = text.lower().split()
    text = [w for w in text if not w in stop_words] 
    text = [wnl.lemmatize(w) for w in text]
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrotzetter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patrickrotzetter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
%%time
fullDocs['cleanText'] = fullDocs['cleanText'].apply(process_text)

CPU times: user 1.64 s, sys: 31.6 ms, total: 1.67 s
Wall time: 1.68 s


In [11]:
fullDocs.head()

Unnamed: 0,Name,Type,Text,cleanText,text_word_count,text_unique_words
0,./sampledocs/NIST.IR.8312.pdf,pdf,NISTIR 8312\n\nFour Principles of Explainable ...,"[nistir, four, principle, explainable, artific...",16792,5026
1,./sampledocs/ai-360-research.pdf,pdf,AI 360: insights from the\nnext frontier of bu...,"[ai, insight, next, frontier, business, corner...",5281,1746
2,./sampledocs/Module-1-Lecture-Slides.pdf,pdf,"Application of AI, Insurtech and Real Estate\n...","[application, ai, insurtech, real, estate, tec...",3728,1506
3,./sampledocs/Technology-and-innovation-in-the-...,pdf,Technology and\ninnovation in the\ninsurance s...,"[technology, innovation, insurance, sector, te...",16742,4228
4,./sampledocs/AI-bank-of-the-future-Can-banks-m...,pdf,Global Banking & Securities\n\nAI-bank of the ...,"[global, banking, security, ai, bank, future, ...",5774,2144


In [12]:
%%time

from gensim import corpora
dictionary = corpora.Dictionary(fullDocs['cleanText'])

CPU times: user 95.1 ms, sys: 3.64 ms, total: 98.7 ms
Wall time: 96.9 ms


In [13]:
print(dictionary)

Dictionary(11925 unique tokens: ['aaai', 'aad', 'ab', 'abhishek', 'abigail']...)


In [14]:
dictionary.filter_extremes(keep_n=1024)
print(dictionary)

Dictionary(1024 unique tokens: ['absence', 'academic', 'accept', 'acceptance', 'accessed']...)


In [15]:
with open('vocab.txt', 'w') as f:
    for index in range(0,len(dictionary)):
        f.write(dictionary.get(index)+'\n')

In [16]:
%%time

fullDocs['tokens'] = fullDocs.apply(lambda row: dictionary.doc2bow(row['cleanText']), axis=1)

CPU times: user 58.3 ms, sys: 2.13 ms, total: 60.4 ms
Wall time: 59.1 ms


In [17]:
data = fullDocs.drop(['cleanText'], axis=1)
data = data.drop(['Name'], axis=1)
data = data.drop(['Type'], axis=1)
data = data.drop(['Text'], axis=1)
data = data.drop(['text_word_count'], axis=1)
data = data.drop(['text_unique_words'], axis=1)
data.head()

Unnamed: 0,tokens
0,"[(0, 2), (1, 1), (2, 1), (3, 4), (4, 4), (5, 1..."
1,"[(2, 1), (3, 2), (8, 1), (14, 1), (42, 1), (46..."
2,"[(12, 1), (18, 3), (24, 2), (39, 2), (59, 4), ..."
3,"[(3, 1), (6, 2), (10, 3), (11, 1), (14, 5), (1..."
4,"[(5, 1), (9, 1), (13, 3), (21, 2), (30, 9), (3..."


In [18]:
import io, boto3
import sagemaker
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

print(sagemaker.__version__)

session = sagemaker.Session()
bucket = 'mynlpexperimentbucketforsagemaker'
prefix = 'headlines-lda-ntm'

s3 = boto3.client(
    service_name='s3',
    region_name='us-east-1',
    aws_access_key_id='xxxxx',
    aws_secret_access_key='yyyyy'
)

2.72.1


In [19]:
def build_protobuf_dataset(data, dictionary):
    num_lines = data.shape[0]
    num_columns = len(dictionary)
    token_matrix = lil_matrix((num_lines, num_columns)).astype('float32')
    line = 0
    for _, row in data.iterrows():
        for token_id, token_count in row['tokens']:
            token_matrix[line, token_id] = token_count
        line+=1
        
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
    return buf

In [20]:
def upload_protbuf_dataset(buf, bucket, prefix, key):
    obj = '{}/{}'.format(prefix, key)
    buf.seek(0)
    s3.upload_fileobj(buf, bucket, obj)
    path = 's3://{}/{}'.format(bucket,obj)
    return path

In [21]:
%%time
training_buf = build_protobuf_dataset(data, dictionary)
s3_training_path = upload_protbuf_dataset(training_buf, bucket, prefix, 'training/training.protobuf')
print(s3_training_path)

s3://mynlpexperimentbucketforsagemaker/headlines-lda-ntm/training/training.protobuf
CPU times: user 84.8 ms, sys: 8.26 ms, total: 93 ms
Wall time: 1.34 s


# Training

In [22]:
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_output)

s3://mynlpexperimentbucketforsagemaker/headlines-lda-ntm/output/


In [70]:
from sagemaker.image_uris import retrieve

region = session.boto_session.region_name    
container = retrieve('lda', region)
print(container)

766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1


In [66]:

DUMMY_IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

lda = sagemaker.estimator.Estimator(
    container,
    role=DUMMY_IAM_ROLE,
    #    role='arn:aws:iam::783491625988:role/SageMakerFullAccess',
    instance_count=1, 
    instance_type='local',
    output_path=s3_output)

In [67]:
lda.set_hyperparameters(
    num_topics=10, 
    feature_dim=len(dictionary), 
    mini_batch_size=1,
    alpha0=0.1)

In [68]:
lda.fit(inputs={'train': s3_training_path})

Error response from daemon: pull access denied for 766337827248.dkr.ecr.us-east-1.amazonaws.com/lda, repository does not exist or may require 'docker login': denied: User: arn:aws:iam::783491625988:user/Administrator is not authorized to perform: ecr:BatchGetImage on resource: arn:aws:ecr:us-east-1:766337827248:repository/lda because no resource-based policy allows the ecr:BatchGetImage action


CalledProcessError: Command '['docker', 'pull', '766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1']' returned non-zero exit status 1.

In [None]:
# first initialize some AWS S3 and role parameters to be used later
# depending where you are running the experiment, adapt the parameters to your AWS environment

bucket='mynlpexperimentbucketforsagemaker'
input_s3_url = "s3://mynlpexperimentbucketforsagemaker/"
output_s3_url = "s3://mycomprehendoutputbucket3110"
data_access_role_arn = "arn:aws:iam::783491625988:role/ComprehendAccess"


In [None]:
#import boto and connect to S3
import boto3
s3 = boto3.client('s3')
# let us list the files available for analysis in the S3 bucket
subfolder=''
contents = s3.list_objects(Bucket=bucket, Prefix=subfolder)['Contents']
number_of_docs=0
filenames=[]
for f in contents:
    number_of_docs=number_of_docs+1
    print(f['Key'])
    filenames.append(f['Key'])




In [None]:
def split_s3_path(s3_path):
    path_parts=s3_path.replace("s3://","").split("/")
    bucket=path_parts.pop(0)
    key="/".join(path_parts)
    return bucket, key

In [None]:
bucket, key = split_s3_path(outputfilename)

In [None]:
print(bucket)

In [None]:
print(key)

In [None]:
s3.download_file(bucket,key,'./output.tar.gz')

In [None]:
!gunzip < output.tar.gz | tar -xv

In [None]:
import pandas as pd

In [None]:
import seaborn as sea
import matplotlib as plt

In [None]:
doc_topics=pd.read_csv('doc-topics.csv',lineterminator='\n')

In [None]:
topic_terms=pd.read_csv('topic-terms.csv',lineterminator='\n')

In [None]:
display(topic_terms)

In [None]:
topic0_df=topic_terms[topic_terms['topic']==0]
plot=sea.barplot(x=topic0_df.term,y=topic0_df.weight,order=topic0_df.sort_values(by=['weight'], ascending=False).set_index('term').index)
plot.set_xticklabels(plot.get_xticklabels(),rotation=60)

In [None]:
final_df = topic_terms.sort_values(by=['topic','weight'], ascending=False)
grid = sea.FacetGrid(final_df, col="topic", col_wrap=3,hue="topic",margin_titles=True, sharex=False)
grid_map=grid.map(sea.barplot,"term","weight",order=None);
for ax in grid_map.axes.flatten():
    ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
grid_map.tight_layout()

In [None]:
topic_keywords = []
for i in range(number_of_topics):
    keywords = []
    df_sub= topic_terms[topic_terms['topic']==i].sort_values(by=['weight'], ascending=False)
    keywords = ", ".join(df_sub['term'])
    i=i+1
    topic_keywords.append(keywords)
topic_keywords_df= pd.DataFrame(topic_keywords)

In [None]:
display(doc_topics)

In [None]:
topics_df = pd.DataFrame()
for filename in filenames:
    sub_topics_df=doc_topics[doc_topics['docname']==filename].sort_values(by=['proportion'], ascending=False)
    row=sub_topics_df.iloc[0]
    topics_df = topics_df.append(pd.Series([row['docname'],int(row['topic']), round(row['proportion'],4)]), ignore_index=True)
topics_df.columns = ['Document name', 'Topic', 'Contribution']
topics_df

In [None]:
#Top 3 Keywords for each Topic

topic_top3words = [(i,topic) for i, topics in enumerate(topic_keywords) for (j, topic) in enumerate(topics.split(',')) if j<3]
print(topic_top3words)

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)
df_top3words

In [None]:
# Plot
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)

plt.figure(figsize=(15,10))
# Topic Distribution by Dominant Topics
ax=sea.histplot(data=topics_df,x=topics_df.Topic)
ax.set_xticks(range(topics_df.Topic.unique().__len__()))
tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0])
ax.xaxis.set_major_formatter(tick_formatter)
#ax.set_xticklabels(ax.get_xticklabels(),rotation=60)
ax.set_title('Number of Documents by Dominant Topic', fontdict=dict(size=10))
ax.set_ylabel('Number of Documents')
ax.set_ylim(0, 7)

In [None]:
# let us list the files available for analysis in the S3 bucket
s3 = boto3.client(
    service_name='s3',
    region_name='us-east-1',
    aws_access_key_id='AKIA3M255OQCLQRJ5W63',
    aws_secret_access_key='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'
)


subfolder=''
contents = s3.list_objects(Bucket=bucket, Prefix='')['Contents']
number_of_docs=0
filenames=[]
for f in contents:
    number_of_docs=number_of_docs+1
    print(f['Key'])
    filenames.append(f['Key'])


In [32]:
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

role = get_execution_role()
print(role)

Couldn't call 'get_role' to get Role ARN from role name arn:aws:iam::783491625988:root to get Role path.


ValueError: The current AWS identity is not a role: arn:aws:iam::783491625988:root, therefore it cannot be used as a SageMaker execution role

In [72]:
retrieve(framework='linear-learner',region='eu-west-1')

'438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:1'

In [75]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker 

role = DUMMY_IAM_ROLE # sagemaker.get_execution_role()

# Hub Model configuration. https://huggingface.co/models
hub = {
  'HF_MODEL_ID':'distilbert-base-uncased-distilled-squad', # model_id from hf.co/models
  'HF_TASK':'question-answering' # NLP task you want to use for predictions
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
   env=hub,
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.6", # transformers version used
   pytorch_version="1.7", # pytorch version used
   py_version="py36", # python version of the DLC
)


In [76]:


# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="local"
)



KeyError: 'ModelDataUrl'