In [1]:
#Split the pages of a pdf
#No need to split

In [2]:
## Convert pdf to text file
#!pip install pdfminer
#!pip install io
import io
from io import StringIO
import string
import pandas as pd
import os
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

def convert_pdf_to_txt(path):
    """Yield the extracted text of every page of every PDF in a directory.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive). Regular files whose name ends
        in ``.pdf`` in any letter case are processed; every other entry is
        ignored.

    Yields
    ------
    str
        The text of one page. Pages are yielded in document order, and the
        documents themselves are visited in sorted file-name order.
    """
    # os.listdir() gives no ordering guarantee; callers number pages by the
    # order they are yielded, so sort to keep that numbering reproducible.
    pdf_paths = []
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        # The raw data uses different spellings of the extension (.pdf, .PDF, ...).
        if entry.lower().endswith('.pdf') and os.path.isfile(full_path):
            pdf_paths.append(full_path)

    for pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(resource_manager, fake_file_handle)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager, converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()  # extraction of the text data
                finally:
                    # Release the per-page handles *before* yielding, so they
                    # are closed even if the consumer stops iterating early
                    # or the page fails to parse.
                    converter.close()
                    fake_file_handle.close()
                yield text

In [3]:
#Loading the saved model and a random PDF file to test classification
import pickle
def _load_pickle(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes (or fails).
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artefacts that you produced yourself or otherwise trust.
    with open(pickle_path, 'rb') as fh:
        return pickle.load(fh)


# Pickled artefacts are looked up relative to the current working directory.
vectorizer = _load_pickle('vectorizer.pkl')
NB_model = _load_pickle('nbmodel.pkl')
RF_model = _load_pickle('rfmodel.pkl')
XGB_model = _load_pickle('xgbmodel.pkl')

In [4]:
# Read every page of the PDFs in the input folder; one DataFrame row per page.
filepath = '/Users/baggu/Downloads/PDF'
textcontents = convert_pdf_to_txt(filepath)
df_textpages = pd.DataFrame(textcontents, columns=['Text_Data']).assign(
    # 1-based running counter over all extracted pages
    page_no=lambda pages: list(range(1, len(pages.index) + 1))
)
# Independent copy taken before df_textpages is modified further down.
raw_textpages = df_textpages.copy()
df_textpages

Unnamed: 0,Text_Data,page_no
0,2021 2021 ANNUAL REPORT(A joint stock company ...,1
1,ProfileThe predecessor of the Bank was Agricul...,2
2,1Annual Report 2021Definitions2Basic Corporate...,3
3,"Definitions2In this report, unless the context...",4
4,3Annual Report 2021Definitions13.H Share(s)Sha...,5
...,...,...
359,358Notes to the Consolidated Financial Stateme...,360
360,359Annual Report 2021Unaudited Supplementary F...,361
361,360Unaudited Supplementary Financial Informati...,362
362,,363


In [5]:
## preprocess the data

#Defining the pre-preprocessing steps
import nltk
# Needed only once
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
import re, unidecode, string
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content.

    The extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")
def remove_accented_chars(text):
    """Return ``text`` after passing it through ``unidecode.unidecode``."""
    return unidecode.unidecode(text)
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)
def remove_slash_with_space(text):
    """Return ``text`` with each backslash character replaced by one space."""
    pieces = text.split('\\')
    return " ".join(pieces)
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def text_lowercase(text):
    """Return the lower-cased form of ``text``."""
    lowered = text.lower()
    return lowered
def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    tokens = text.split()  # no-arg split drops leading/trailing/repeated whitespace
    return " ".join(tokens)
def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    Tokens are compared to the list exactly as tokenized (no case folding).
    The surviving tokens are returned joined by single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = []
    for token in word_tokenize(text):
        if token in english_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)
def stem_words(text):
    """Tokenize ``text``, Porter-stem each token and re-join the stems with spaces."""
    stem = PorterStemmer().stem
    return ' '.join(map(stem, word_tokenize(text)))
def lemmatize_words(text):
    """Tokenize ``text``, lemmatize each token as a verb and re-join with spaces."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        # pos='v' gives the lemmatizer part-of-speech context: treat tokens as verbs
        lemmas.append(wordnet.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

# Perform preprocessing
def perform_preprocessing(text):
    """Run ``text`` through the cleaning steps below, in order, and return the result."""
    # NOTE(review): stop words are removed *before* lower-casing, and the
    # stop-word match is exact, so capitalised stop words are presumably
    # left in. The order is kept as-is because the saved vectorizer/models
    # were presumably fit on text cleaned this way - confirm before changing.
    pipeline = (
        remove_html_tags,
        remove_accented_chars,
        remove_numbers,
        remove_stopwords,
        text_lowercase,
        remove_slash_with_space,
        remove_punctuation,
        stem_words,
        # lemmatize_words,  (disabled alternative to stemming)
        remove_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean the page text that will be vectorized. The input is read from the
# untouched copy in raw_textpages (not from df_textpages itself) so that
# re-running this cell never preprocesses already-preprocessed text.
df_textpages['Text_Data'] = raw_textpages['Text_Data'].apply(perform_preprocessing)

In [6]:
# Vectorize the cleaned page text with the loaded vectorizer.
cleaned_text = df_textpages['Text_Data']
inputs = vectorizer.transform(cleaned_text)
print(inputs.shape)
# Optional inspection helpers (uncomment when debugging):
#print(type(inputs))
#print(vectorizer.get_feature_names_out())
#print(inputs.toarray())

(364, 963)


In [7]:
# Predicting the category of the input file with the help of trained model

# One prediction per row of `inputs`, produced by the unpickled XGB model.
output_XGB = XGB_model.predict(inputs)
# NOTE(review): the two lines below refer to `output_category` and
# `labelencoder`, neither of which is defined in the visible cells; they look
# like leftovers from another notebook - confirm before re-enabling.
#Comment the next line if you are testing word2vec model as it doesn't require transformation
#output_category = (labelencoder.inverse_transform((output_category)))
#print(output_XGB)

In [8]:
# Attach the predicted class to the unprocessed page text and show the result.
raw_textpages = raw_textpages.assign(output=output_XGB)
display(raw_textpages)

Unnamed: 0,Text_Data,page_no,output
0,2021 2021 ANNUAL REPORT(A joint stock company ...,1,1
1,ProfileThe predecessor of the Bank was Agricul...,2,1
2,1Annual Report 2021Definitions2Basic Corporate...,3,1
3,"Definitions2In this report, unless the context...",4,1
4,3Annual Report 2021Definitions13.H Share(s)Sha...,5,1
...,...,...,...
359,358Notes to the Consolidated Financial Stateme...,360,0
360,359Annual Report 2021Unaudited Supplementary F...,361,2
361,360Unaudited Supplementary Financial Informati...,362,1
362,,363,1


In [10]:
# Save the per-page predictions as an Excel workbook in the working directory.
file_name = 'Prediction_test_2.xlsx'
raw_textpages.to_excel(excel_writer=file_name)

In [23]:
# selecting rows category wise
# Class ids used below, as implied by the headings printed for each subset:
#   0 -> financial statement pages, 2 -> financial notes pages, 1 -> junk pages

FS_df = raw_textpages.loc[raw_textpages['output'] == 0]
Notes_df = raw_textpages.loc[raw_textpages['output'] == 2]
Junk_df = raw_textpages.loc[raw_textpages['output'] == 1]

for heading, subset in (
    ('\n Financial Statement Pages :\n', FS_df),
    ('\n Financial Notes Pages :\n', Notes_df),
    ('\n Junk Pages :\n', Junk_df),
):
    # .tolist() converts to native Python ints; list(Series) would keep NumPy
    # scalars, which print as "np.int64(202)" under NumPy >= 2.
    print(heading, subset['page_no'].tolist())
    # Keep the blank line that separated the three reports in the original output.
    if subset is not Junk_df:
        print()


 Financial Statement Pages :
 [202, 203, 204, 205, 206, 207, 208, 360]

 Financial Notes Pages :
 [76, 192, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 361]

 Junk Pages :
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 3