In [2]:
import pdfplumber
import os
from tqdm.notebook import tqdm
import math
import pandas as pd
import numpy as np


In [3]:
pdf_path = r'E:\Others\document_assistant\nke-10k-2023.pdf'

In [4]:
import pymupdf

text_list = []

doc = pymupdf.open(pdf_path)
for page_number,page in tqdm(enumerate(doc),total=len(doc)):
    text_info={}
    text = page.get_text()
    text_info['page_num'] = page_number
    text_info['text_length'] = len(text)
    text_info['content'] = text
    text_info['num_sentences'] = len(text.split('. '))
    text_info['num_words'] = len(text.split(' '))
    text_info['num_tokens'] = math.ceil(len(text)/4)
    text_list.append(text_info)

  0%|          | 0/107 [00:00<?, ?it/s]

In [5]:
text_list[:2]

[{'page_num': 0,
  'text_length': 3635,
  'content': "Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n(Mark One)\n☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE FISCAL YEAR ENDED MAY 31, 2023\nOR\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE TRANSITION PERIOD FROM \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0TO \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0.\nCommission File No.\xa01-10635\nNIKE, Inc.\n(Exact name of Registrant as specified in its charter)\nOregon\n93-0584541\n(State or other jurisdiction of incorporation)\n(IRS Employer Identification No.)\nOne Bowerman Drive, Beaverton, Oregon 97005-6453\n(Address of principal executive offices and zip code)\n(503) 671-6453\n(Registrant's telephone number, including area code)

In [6]:
df = pd.DataFrame(text_list)
df.head()
df.set_index('page_num',inplace=True)
df.head()

Unnamed: 0_level_0,text_length,content,num_sentences,num_words,num_tokens
page_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3635,Table of Contents\nUNITED STATES\nSECURITIES A...,4,478,909
1,378,"Table of Contents\nAs of July 12, 2023, the nu...",1,47,95
2,1883,"Table of Contents\nNIKE, INC.\nANNUAL REPORT O...",2,166,471
3,4433,Table of Contents\nPART I\nITEM 1. BUSINESS\nG...,21,618,1109
4,3556,Table of Contents\nWe also offer interactive c...,18,489,889


In [7]:
from spacy.lang.en import English

nlp = English()
nlp.add_pipe('sentencizer')
for info_dict in tqdm(text_list):
    doc = nlp(info_dict['content'])
    info_dict['sentences'] = [sent.text.strip() for sent in doc.sents]

  0%|          | 0/107 [00:00<?, ?it/s]

In [8]:
text_list[1]

{'page_num': 1,
 'text_length': 378,
 'content': "Table of Contents\nAs of July\xa012, 2023, the number of shares of the Registrant's Common Stock outstanding were:\nClass\xa0A\n304,897,252\xa0\nClass B\n1,225,074,356\xa0\n1,529,971,608\xa0\nDOCUMENTS INCORPORATED BY REFERENCE:\nParts of Registrant's Proxy Statement for the Annual Meeting of Shareholders to be held on September\xa012, 2023, are incorporated by reference into Part III of this report.\n",
 'num_sentences': 1,
 'num_words': 47,
 'num_tokens': 95,
 'sentences': ["Table of Contents\nAs of July\xa012, 2023, the number of shares of the Registrant's Common Stock outstanding were:\nClass\xa0A\n304,897,252\xa0\nClass B\n1,225,074,356\xa0\n1,529,971,608\xa0\nDOCUMENTS INCORPORATED BY REFERENCE:\nParts of Registrant's Proxy Statement for the Annual Meeting of Shareholders to be held on September\xa012, 2023, are incorporated by reference into Part III of this report.",
  '']}

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
embedding_model = SentenceTransformer('all-mpnet-base-v2')

In [11]:


for info_dict in tqdm(text_list):
    info_dict['sentence_embeddings'] = embedding_model.encode(info_dict['sentences'])


  0%|          | 0/107 [00:00<?, ?it/s]

In [12]:
text_list[1]

{'page_num': 1,
 'text_length': 378,
 'content': "Table of Contents\nAs of July\xa012, 2023, the number of shares of the Registrant's Common Stock outstanding were:\nClass\xa0A\n304,897,252\xa0\nClass B\n1,225,074,356\xa0\n1,529,971,608\xa0\nDOCUMENTS INCORPORATED BY REFERENCE:\nParts of Registrant's Proxy Statement for the Annual Meeting of Shareholders to be held on September\xa012, 2023, are incorporated by reference into Part III of this report.\n",
 'num_sentences': 1,
 'num_words': 47,
 'num_tokens': 95,
 'sentences': ["Table of Contents\nAs of July\xa012, 2023, the number of shares of the Registrant's Common Stock outstanding were:\nClass\xa0A\n304,897,252\xa0\nClass B\n1,225,074,356\xa0\n1,529,971,608\xa0\nDOCUMENTS INCORPORATED BY REFERENCE:\nParts of Registrant's Proxy Statement for the Annual Meeting of Shareholders to be held on September\xa012, 2023, are incorporated by reference into Part III of this report.",
  ''],
 'sentence_embeddings': array([[ 0.02619972, -0.0179656

In [13]:
text_list[1]['sentence_embeddings'][0].shape

(768,)

In [23]:
sample_query = "What was the revenue of the company in 2022?"

sample_embedding = embedding_model.encode(sample_query)

In [24]:
sim = embedding_model.similarity(sample_embedding,text_list[1]['sentence_embeddings'])

In [25]:
similiarities = []
sim_idx = []
for i in tqdm(text_list):
    sim = embedding_model.similarity(sample_embedding,i['sentence_embeddings'])
    # sim_idx = sim.argmax()
    similiarities.append(sim.max())
    sim_idx.append(sim.argmax())

  0%|          | 0/107 [00:00<?, ?it/s]

In [26]:
max(similiarities)

tensor(0.7522)

In [27]:
sim_idx[np.argmax(similiarities)]

tensor(1)

In [28]:
retrieved_txt = text_list[np.argmax(similiarities)]['sentences'][sim_idx[np.argmax(similiarities)]]
retrieved_pg_no = text_list[np.argmax(similiarities)]['page_num']

In [29]:
retrieved_pg_no

57

In [20]:
simarr = np.array(sim)
simarr

array([[0.18066058, 0.48257306, 0.20899755]], dtype=float32)

In [21]:
a = [0,9,10]

top_2_idx = np.argsort(simarr[0])[-2:]
print(top_2_idx)
top_2_values = [simarr[0][i] for i in top_2_idx]
print(top_2_values)

[2 1]
[0.20899755, 0.48257306]


Using Langchain

In [30]:
from langchain_community.document_loaders import PyMuPDFLoader

In [32]:
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

In [33]:
len(documents)

107

In [None]:
#Cleaning header and footer
fr
