Created on December 28th, 2021 by Patrick Rotzetter

https://www.linkedin.com/in/rotzetter/

## Small experiment of document mining with various techniques Part 9

Let us use the built-in AWS SageMaker LDA algorithm for topic modeling.

# Import libraries

In [56]:
# install NLTK and gensim if required
!pip3 -q install nltk gensim
!pip3 install texthero

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [57]:
# Import required libraries
import numpy as np
import pandas as pd
import texthero as hero
import boto3

# Setup S3 parameters

In [58]:
# Initialize the AWS-specific parameters; adapt them to the environment where you run the experiment.
# Name of the S3 bucket that holds the source .txt articles and receives the training data and model output.
bucket='mymltextarticles'
# Key prefix (sub-folder) inside the bucket; an empty string means no prefix.
subfolder=''
# AWS region to work in.
region='us-east-1'

In [60]:
# Download every .txt object from the S3 bucket into a local working directory.
import os
s3s = boto3.client('s3')
s3 = boto3.resource('s3')

local_dir = './sampledocs/'
# download_file does not create missing directories, so make sure the target exists.
os.makedirs(local_dir, exist_ok=True)

mybucket = s3.Bucket(bucket)
# objects.filter() returns a new collection, so its result is what has to be iterated.
# Restrict the listing to `subfolder` when one is configured; otherwise walk the whole bucket.
s3_objects = mybucket.objects.filter(Prefix=subfolder) if subfolder else mybucket.objects.all()
for file in s3_objects:
    root, ext = os.path.splitext(file.key)
    if ext == '.txt':
        filename = os.path.basename(file.key)
        target_filename = local_dir + filename
        print(target_filename)
        s3s.download_file(bucket, file.key, target_filename)

./sampledocs/AI-bank-of-the-future-Can-banks-meet-the-AI-challenge-1.txt
./sampledocs/Artificial Financial Intelligence.txt
./sampledocs/Data machine the insurers using AI to reshape the industry Financial Times.txt
./sampledocs/Digital-disruption-in-Insurance.txt
./sampledocs/Impact-Big-Data-AI-in-the-Insurance-Sector.txt
./sampledocs/Innovation_Artificial-Intelligence-in-Insurance-Whitepaper-deloitte-digital.txt
./sampledocs/Insurance-2030-The-impact-of-AI-on-the-future-of-insurance-F.txt
./sampledocs/Issues_Paper_on_Increasing_Digitalisation_in_Insurance_and_its_Potential_Impact_on_Consumer_Outcomes.txt
./sampledocs/Kaggle State of Machine Learning and Data Science 2020.txt
./sampledocs/Module-1-Lecture-Slides.txt
./sampledocs/Technology-and-innovation-in-the-insurance-sector.txt
./sampledocs/WEF_Governance_of_Chatbots_in_Healthcare_2020.txt
./sampledocs/ai-360-research.txt
./sampledocs/ai-insurance.txt
./sampledocs/ai_in_insurance_web_0.txt
./sampledocs/fra-2020-artificial-intellig

# Data preparation

In [61]:
# Local directory containing the downloaded .txt files.
# The trailing slash matters: the value is concatenated directly with a glob pattern.
path='./sampledocs/'

In [62]:
# Scan the local directory, read every .txt document into a DataFrame and clean the text with texthero.

docName=[]
docType=[]
docText=[]
docNLP=[]      # not populated in this cell
import glob
list_of_files = glob.glob(path+'*.txt')           # create the list of files
fileNames=[]   # not populated in this cell
for file_name in list_of_files:
    # The context manager closes each file handle as soon as it has been read.
    with open(file_name,'r') as f:
        fileText=f.read()
    docName.append(file_name)
    docType.append('txt')
    docText.append(fileText)
fullDocs = pd.DataFrame({'Name':docName,'Type':docType,'Text':docText})
# Clean the raw text with texthero (no custom pipeline is passed).
fullDocs['cleanText']=hero.clean(fullDocs['Text'])


In [63]:
 # Summary statistics of the raw document lengths (in characters).
 text_lengths = fullDocs['Text'].str.len()
 for caption, reducer in (("Average length of text:", np.mean),
                          ("Min length of text:", np.min),
                          ("Max length of text:", np.max)):
     print (caption + str(reducer(text_lengths)))

Average length of text:166530.15789473685
Min length of text:9170
Max length of text:1513210


In [64]:
# Per-document size features computed on the raw text.
raw_text = fullDocs['Text']
fullDocs['text_word_count'] = raw_text.map(lambda doc: len(doc.strip().split()))  # whitespace-separated words
fullDocs['text_unique_words'] = raw_text.map(lambda doc: len(set(str(doc).split())))  # distinct words
fullDocs.head()

Unnamed: 0,Name,Type,Text,cleanText,text_word_count,text_unique_words
0,./sampledocs/Technology-and-innovation-in-the-...,txt,Technology and\ninnovation in the\ninsurance s...,technology innovation insurance sector technol...,16742,4228
1,./sampledocs/ai-360-research.txt,txt,AI 360: insights from the\nnext frontier of bu...,ai insights next frontier business corner offi...,5281,1746
2,./sampledocs/Module-1-Lecture-Slides.txt,txt,"Application of AI, Insurtech and Real Estate\n...",application ai insurtech real estate technolog...,3728,1506
3,./sampledocs/Insurance-2030-The-impact-of-AI-o...,txt,Insurance Practice\n\nInsurance 2030—\nThe imp...,insurance practice insurance -- impact ai futu...,4424,1782
4,./sampledocs/sigma-5-2020-en.txt,txt,No 5 /2020\n\nMachine intelligence in\ninsuran...,machine intelligence insurance insights end en...,14478,4329


In [65]:
# Column overview of the document frame: names, non-null counts and dtypes.
fullDocs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               19 non-null     object
 1   Type               19 non-null     object
 2   Text               19 non-null     object
 3   cleanText          19 non-null     object
 4   text_word_count    19 non-null     int64 
 5   text_unique_words  19 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.0+ KB


In [66]:
import string
import nltk
# Fetch the NLTK resources imported below: the stop-word corpus and WordNet (used by the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
    
# English stop-word list from the NLTK corpus; read by process_text.
stop_words = stopwords.words('english')
# Shared WordNet lemmatizer instance; read by process_text.
wnl = WordNetLemmatizer()

def process_text(text):
    """Tokenize a document string into a list of lemmatized, lower-case words.

    Punctuation (``string.punctuation``) and digit characters are removed, the
    remainder is lower-cased and split on whitespace, English stop words are
    dropped and each remaining token is lemmatized.

    Relies on the module-level ``stop_words`` list and ``wnl`` lemmatizer.

    Args:
        text: Document text as a single string.

    Returns:
        list of str: The processed tokens, in document order.
    """
    # Strip punctuation and digits in one pass instead of running one
    # full-string ``replace`` per punctuation character.
    punctuation = set(string.punctuation)
    text = ''.join([c for c in text if c not in punctuation and not c.isdigit()])
    # Set membership is O(1) per token; scanning the stop-word list was linear.
    stop_set = set(stop_words)
    return [wnl.lemmatize(w) for w in text.lower().split() if w not in stop_set]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [67]:
# Tokenize the cleaned strings. Entries that are already token lists are left untouched,
# so re-running this cell does not fail on the column it has overwritten.
fullDocs['cleanText'] = fullDocs['cleanText'].apply(
    lambda doc: process_text(doc) if isinstance(doc, str) else doc)

In [68]:
# Inspect the first rows; cleanText holds token lists after the previous step.
fullDocs.head()

Unnamed: 0,Name,Type,Text,cleanText,text_word_count,text_unique_words
0,./sampledocs/Technology-and-innovation-in-the-...,txt,Technology and\ninnovation in the\ninsurance s...,"[technology, innovation, insurance, sector, te...",16742,4228
1,./sampledocs/ai-360-research.txt,txt,AI 360: insights from the\nnext frontier of bu...,"[ai, insight, next, frontier, business, corner...",5281,1746
2,./sampledocs/Module-1-Lecture-Slides.txt,txt,"Application of AI, Insurtech and Real Estate\n...","[application, ai, insurtech, real, estate, tec...",3728,1506
3,./sampledocs/Insurance-2030-The-impact-of-AI-o...,txt,Insurance Practice\n\nInsurance 2030—\nThe imp...,"[insurance, practice, insurance, impact, ai, f...",4424,1782
4,./sampledocs/sigma-5-2020-en.txt,txt,No 5 /2020\n\nMachine intelligence in\ninsuran...,"[machine, intelligence, insurance, insight, en...",14478,4329


In [69]:
from gensim import corpora
# Build the gensim vocabulary (token <-> integer id mapping) from the tokenized documents.
dictionary = corpora.Dictionary(fullDocs['cleanText'])

In [70]:
# Show the vocabulary size and a few sample tokens.
print(dictionary)

Dictionary(11382 unique tokens: ['ab', 'ability', 'able', 'abundantly', 'abusive']...)


In [71]:
# Let us keep at most 1024 words in the vocabulary.
# NOTE(review): only keep_n is passed, so the other filter_extremes thresholds
# (no_below / no_above) stay at gensim's defaults and may remove tokens as well — confirm this is intended.
dictionary.filter_extremes(keep_n=1024)
print(dictionary)

Dictionary(1024 unique tokens: ['accelerating', 'acceleration', 'acceptance', 'accepted', 'accessing']...)


In [72]:
# Write the vocabulary to vocab.txt, one token per line, in token-id order.
with open('vocab.txt', 'w') as vocab_file:
    vocab_file.writelines(
        dictionary.get(token_id) + '\n' for token_id in range(len(dictionary))
    )

In [73]:
# Encode every tokenized document as a bag-of-words list of (token_id, count) pairs.
fullDocs['tokens'] = fullDocs['cleanText'].apply(dictionary.doc2bow)

In [74]:
# Drop every column that is not needed for training; the bag-of-words column remains.
unused_columns = ['cleanText', 'Name', 'Type', 'Text', 'text_word_count', 'text_unique_words']
data = fullDocs.drop(columns=unused_columns)
data.head()

Unnamed: 0,tokens
0,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2..."
1,"[(2, 2), (4, 1), (11, 1), (18, 1), (22, 1), (2..."
2,"[(7, 1), (9, 1), (13, 3), (19, 2), (26, 3), (3..."
3,"[(0, 2), (6, 1), (8, 4), (9, 1), (10, 1), (18,..."
4,"[(4, 1), (5, 1), (8, 1), (9, 3), (11, 2), (22,..."


In [75]:
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

# Key prefix in the bucket under which the training data and the training output are stored.
prefix = 'training'



In [76]:
def build_protobuf_dataset(data, dictionary):
    """Serialize the bag-of-words documents into an in-memory sparse-tensor buffer.

    Args:
        data: DataFrame with a ``tokens`` column; each entry is an iterable of
            ``(token_id, token_count)`` pairs.
        dictionary: Vocabulary object; only ``len(dictionary)`` is used, to set
            the number of matrix columns.

    Returns:
        io.BytesIO: Buffer holding the matrix as written by
        ``smac.write_spmatrix_to_sparse_tensor``. Callers should ``seek(0)``
        before reading from it.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having executed ``import io``.
    import io

    num_lines = data.shape[0]
    num_columns = len(dictionary)
    # Allocate the float32 matrix directly instead of converting it afterwards.
    token_matrix = lil_matrix((num_lines, num_columns), dtype='float32')
    for line, tokens in enumerate(data['tokens']):
        for token_id, token_count in tokens:
            token_matrix[line, token_id] = token_count

    buf = io.BytesIO()
    # No labels are written alongside the matrix (third argument is None).
    smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
    return buf

In [77]:
def upload_protbuf_dataset(buf, bucket, prefix, key):
    """Upload an in-memory buffer to S3 and return the object's ``s3://`` URI.

    Uses the module-level ``s3s`` client.

    Args:
        buf: Seekable binary file-like object; it is rewound before the upload.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix; joined with ``key`` by a ``/``.
        key: Object key relative to ``prefix``.

    Returns:
        str: ``s3://<bucket>/<prefix>/<key>``.
    """
    object_key = f'{prefix}/{key}'
    buf.seek(0)  # upload from the beginning of the buffer
    s3s.upload_fileobj(buf, bucket, object_key)
    return f's3://{bucket}/{object_key}'

In [78]:
import io
# Serialize the bag-of-words data and upload it; the object key is <prefix>/training/training.protobuf.
training_buf = build_protobuf_dataset(data, dictionary)
s3_training_path = upload_protbuf_dataset(training_buf, bucket, prefix, 'training/training.protobuf')
print(s3_training_path)

s3://mymltextarticles/training/training/training.protobuf


# Training

In [79]:
# S3 location passed to the estimator as its output path.
s3_output = f's3://{bucket}/{prefix}/output/'

print(s3_output)

s3://mymltextarticles/training/output/


In [82]:
from sagemaker.image_uris import retrieve
from sagemaker import get_execution_role
import sagemaker
# Look up the image URI of the built-in LDA algorithm for the `region` set in the
# S3 parameters cell. The region is deliberately not hard-coded again here, so that
# adapting the configuration cell is enough to switch regions.
container = retrieve('lda', region)
print(container)

766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1


In [83]:
# IAM role of the current execution context; handed to the estimator below.
role = get_execution_role()
print(role)

# Generic estimator around the LDA container: one ml.c4.2xlarge instance,
# with the output path set to s3_output.
lda = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.c4.2xlarge',
    instance_count=1,
    output_path=s3_output,
)

arn:aws:iam::012086180905:role/service-role/AmazonSageMaker-ExecutionRole-20211121T093897


In [84]:
# Hyperparameters for the built-in LDA algorithm.
lda.set_hyperparameters(
    num_topics=10,
    # Vocabulary size, i.e. the number of columns of the training matrix.
    feature_dim=len(dictionary),
    # The built-in LDA algorithm defines mini_batch_size as the total number of
    # documents in the training corpus, so derive it from the data instead of
    # hard-coding 1.
    mini_batch_size=data.shape[0],
    alpha0=0.1)

In [85]:
# Start the training job; the 'train' channel points at the uploaded protobuf file.
lda.fit(inputs={'train': s3_training_path})

2022-01-05 08:29:37 Starting - Starting the training job...
2022-01-05 08:29:46 Starting - Launching requested ML instancesProfilerReport-1641371377: InProgress
......
2022-01-05 08:30:56 Starting - Preparing the instances for training.........
2022-01-05 08:32:37 Downloading - Downloading input data
2022-01-05 08:32:37 Training - Downloading the training image...
2022-01-05 08:33:08 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mUsing mxnet backend.[0m
[34m[01/05/2022 08:33:11 INFO 139908622219072] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'alpha0': u'1.0', u'max_restarts': u'10', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'allow_svd_init': u'true', u'epochs': u'1', u'tol': u'1e-8', u'_kvstore': u'local', u'max_iterations': u'1000'}[0m
[34m[01/05/2022 08:33:11 INFO 139908622219072] Reading provided configurati

# Load trained model
This will only work if the training completed successfully and the model was saved under the assumed location; change the path below if required.

In [86]:
import sagemaker
from sagemaker import get_execution_role
# Rebuild a deployable model object from the artifact of a finished training job.
role = get_execution_role()
# Path of the model archive; adjust it to the output of your own training job.
s3_output='s3://mymltextarticles/training/output/lda-2022-01-05-08-29-37-479/output/model.tar.gz'
lda=sagemaker.LDAModel(model_data=s3_output, role=role)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [87]:
# Deploy the model to an endpoint backed by a single ml.t2.large instance.
lda_predictor = lda.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.large')

---------!

# Model prediction using training data
This assumes that a data frame and a dictionary have been prepared using the data preparation cells above.
It also assumes that the model has been loaded and the endpoint has been created.

In [88]:
def prepare_samples(data, dictionary):
    """Convert bag-of-words rows into a dense float32 count matrix.

    Args:
        data: DataFrame with a ``tokens`` column; each entry is an iterable of
            ``(token_id, token_count)`` pairs.
        dictionary: Vocabulary; only its length is used (number of columns).

    Returns:
        numpy.ndarray: Array of shape ``(data.shape[0], len(dictionary))`` and
        dtype float32; entry ``[i, token_id]`` is set to the ``token_count``
        of each pair in row ``i``, all other entries are zero.
    """
    n_docs, vocab_size = data.shape[0], len(dictionary)
    sample_matrix = np.zeros((n_docs, vocab_size), dtype='float32')
    for doc_index, bag_of_words in enumerate(data['tokens']):
        for token_id, token_count in bag_of_words:
            sample_matrix[doc_index, token_id] = token_count
    return sample_matrix

In [89]:
# Dense count matrix for the training documents; the shape is displayed as a sanity check.
samples= prepare_samples(data, dictionary)
samples.shape

(19, 1024)

In [90]:
# Send the sample matrix to the endpoint using the CSV serializer and keep the response.
lda_predictor.serializer = sagemaker.serializers.CSVSerializer()
response = lda_predictor.predict(samples)
#print(response)

In [91]:
import json

# For every record in the response, print the index of the largest value in its topic mixture.
for record in response:
    mixture = record.label['topic_mixture'].float32_tensor.values
    print(np.argmax(mixture))

3
8
7
5
0
7
3
0
4
6
0
4
2
2
1
4
2
9
4


In [92]:
# Delete the endpoint once the predictions are done.
lda_predictor.delete_endpoint()