In [49]:
import os
import requests
import fitz 
from tqdm.auto import tqdm
import random
import pandas as pd
from spacy.lang.en import English
import torch
from sentence_transformers import SentenceTransformer
# Get PDF document path


In [15]:
pdf_path = "resources/spring-framework.pdf"

# Download PDF
if not os.path.exists(pdf_path):
    print("[INFO] File doesn't exist, downloading...")
    url = "https://docs.spring.io/spring-framework/docs/6.0.0/reference/pdf/spring-framework.pdf"
    filename = pdf_path
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content) 
        print(f"[INFO] The file has been download and saved as {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status code: {reponse.status_code}")
else:
    print(f"File {pdf_path} exists.")



File resources/spring-framework.pdf exists.


In [16]:
def text_formatter(text: str) -> str: 
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = [] 
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_setence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token = ~4 characters
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 44,
  'page_word_count': 5,
  'page_setence_count_raw': 1,
  'page_token_count': 11.0,
  'text': 'Spring Framework Documentation Version 6.0.0'},
 {'page_number': 1,
  'page_char_count': 3152,
  'page_word_count': 494,
  'page_setence_count_raw': 31,
  'page_token_count': 788.0,
  'text': 'Chapter 1. Spring Framework Overview Spring makes it easy to create Java enterprise applications. It provides everything you need to embrace the Java language in an enterprise environment, with support for Groovy and Kotlin as alternative languages on the JVM, and with the flexibility to create many kinds of architectures depending on an application’s needs. As of Spring Framework 5.1, Spring requires JDK 8+ (Java SE 8+) and provides out-of-the-box support for JDK 11 LTS. Java SE 8 update 60 is suggested as the minimum patch release for Java 8, but it is generally recommended to use a recent patch release. Spring supports a wide range of application scenarios.

In [17]:
random.sample(pages_and_texts, k=3)

[{'page_number': 439,
  'page_char_count': 407,
  'page_word_count': 66,
  'page_setence_count_raw': 2,
  'page_token_count': 101.75,
  'text': 'Java @Configuration(proxyBeanMethods = false) public class DataSourceConfiguration { \xa0   @Bean \xa0   public SimpleDataSource dataSource() { \xa0       return new SimpleDataSource(); \xa0   } } Since there isn’t any particular condition on this class, dataSourceConfiguration and dataSource are identified as candidates. The AOT engine will convert the configuration class above to code similar to the following: 439'},
 {'page_number': 278,
  'page_char_count': 1658,
  'page_word_count': 233,
  'page_setence_count_raw': 8,
  'page_token_count': 414.5,
  'text': 'Java ExpressionParser parser = new SpelExpressionParser(); Expression exp = parser.parseExpression("new String(\'hello world\').toUpperCase()"); ① String message = exp.getValue(String.class); ① Construct a new String from the literal and make it be upper case. Kotlin val parser = SpelE

In [23]:
df = pd.DataFrame(pages_and_texts)
df.set_index("page_number")
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count,text
0,0,44,5,1,11.0,Spring Framework Documentation Version 6.0.0
1,1,3152,494,31,788.0,Chapter 1. Spring Framework Overview Spring ma...
2,2,2621,418,25,655.25,specifications from the traditional EE umbrell...
3,3,2207,366,23,551.75,• Maintain strong backward compatibility. Spri...
4,4,2720,386,23,680.0,Chapter 2. Core Technologies This part of the ...


In [24]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_setence_count_raw,page_token_count
count,1429.0,1429.0,1429.0,1429.0,1429.0
mean,714.0,1725.97,268.72,10.1,431.49
std,412.66,595.52,92.28,6.43,148.88
min,0.0,44.0,5.0,1.0,11.0
25%,357.0,1304.0,202.0,5.0,326.0
50%,714.0,1677.0,259.0,9.0,419.25
75%,1071.0,2090.0,325.0,14.0,522.5
max,1428.0,3803.0,645.0,38.0,950.75


In [29]:
#Spacy pipeline creator

nlp = English()
nlp.add_pipe("sentencizer")


<spacy.pipeline.sentencizer.Sentencizer at 0x1c4909ff940>

In [None]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

random.sample(pages_and_texts, k=1)

In [31]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10


def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1429 [00:00<?, ?it/s]

In [32]:
random.sample(pages_and_texts, k=1)

[{'page_number': 1328,
  'page_char_count': 1510,
  'page_word_count': 317,
  'page_setence_count_raw': 3,
  'page_token_count': 377.5,
  'text': 'public interface IJmxTestBean { \xa0   public int add(int x, int y); \xa0   public long myOperation(); \xa0   public int getAge(); \xa0   public void setAge(int age); \xa0   public void setName(String name); \xa0   public String getName(); } This interface defines the methods and properties that are exposed as operations and attributes on the JMX MBean. The following code shows how to configure Spring JMX to use this interface as the definition for the management interface: <beans> \xa0   <bean id="exporter" class="org.springframework.jmx.export.MBeanExporter"> \xa0       <property name="beans"> \xa0           <map> \xa0               <entry key="bean:name=testBean5" value-ref="testBean"/> \xa0           </map> \xa0       </property> \xa0       <property name="assembler"> \xa0           <bean class="org.springframework.jmx.export.assembler.I

In [33]:
df = pd.DataFrame(pages_and_texts)

In [34]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts): 
    for sentence_chunk in item["sentence_chunks"]: 
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka join the list of sentences into one paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" => ". A" (will work for any captial letter)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 chars

        pages_and_chunks.append(chunk_dict) 

len(pages_and_chunks)

  0%|          | 0/1429 [00:00<?, ?it/s]

2255

In [38]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 860,
  'sentence_chunk': 'Java public class MyWebAppInitializer extends AbstractDispatcherServletInitializer { \xa0  @Override \xa0  protected WebApplicationContext createRootApplicationContext() { \xa0    return null; \xa0  } \xa0  @Override \xa0  protected WebApplicationContext createServletApplicationContext() { \xa0    XmlWebApplicationContext cxt = new XmlWebApplicationContext(); \xa0    cxt.setConfigLocation("/WEB-INF/spring/dispatcher-config.xml"); \xa0    return cxt; \xa0  } \xa0  @Override \xa0  protected String[] getServletMappings() { \xa0    return new String[] { "/" }; \xa0  } } Kotlin class MyWebAppInitializer : AbstractDispatcherServletInitializer() { \xa0  override fun createRootApplicationContext(): WebApplicationContext? {\xa0    return null \xa0  } \xa0  override fun createServletApplicationContext(): WebApplicationContext { \xa0    return XmlWebApplicationContext().apply { \xa0      setConfigLocation("/WEB-INF/spring/dispatcher-config.xml") \xa0    

In [39]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 7.0 | Text: If you want only pooling 428
Chunk token count: 25.5 | Text: The SqlUpdate class is concrete. It can be subclassed — for example, to add a custom update method.776
Chunk token count: 11.0 | Text: Spring Framework Documentation Version 6.0.0
Chunk token count: 27.5 | Text: ③ The Flux::switchOnFirst operator allows you to see whether you are handling a form field or file upload.1152
Chunk token count: 21.5 | Text: In other words, you should encounter no difficulties by building with this flag on.347


In [40]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 0.75 | Text: 506
Chunk token count: 16.75 | Text: In contrast to RestTemplate, WebClient supports the following: 1046
Chunk token count: 28.25 | Text: The input content can be a JSON array, or any line-delimited JSON format such as NDJSON, JSON Lines, or JSON 1112
Chunk token count: 14.5 | Text: The following listing shows the FormatterRegistry SPI: 264
Chunk token count: 15.25 | Text: As an example, here is an actual exception from WebLogic: 825


In [41]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 1,
  'sentence_chunk': 'Chapter 1. Spring Framework Overview Spring makes it easy to create Java enterprise applications. It provides everything you need to embrace the Java language in an enterprise environment, with support for Groovy and Kotlin as alternative languages on the JVM, and with the flexibility to create many kinds of architectures depending on an application’s needs. As of Spring Framework 5.1, Spring requires JDK 8+ (Java SE 8+) and provides out-of-the-box support for JDK 11 LTS. Java SE 8 update 60 is suggested as the minimum patch release for Java 8, but it is generally recommended to use a recent patch release. Spring supports a wide range of application scenarios. In a large enterprise, applications often exist for a long time and have to run on a JDK and application server whose upgrade cycle is beyond developer control. Others may run as a single jar with the server embedded, possibly in a cloud environment. Yet others may be standalone applicatio

In [52]:
%%time

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")
##%%time

# embedding_model.to("cpu")

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model.to(device)

for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
text_chunks[419]

  0%|          | 0/2161 [00:00<?, ?it/s]

CPU times: total: 1min 7s
Wall time: 1min 27s


'generic types in expressions, SpEL attempts conversions to maintain type correctness for any objects it encounters. What does this mean in practice?Suppose assignment, using setValue(), is being used to set a List property. The type of the property is actually List<Boolean>. SpEL recognizes that the elements of the list need to be converted to Boolean before being placed in it. The following example shows how to do so: Java class Simple { \xa0  public List<Boolean> booleanList = new ArrayList<Boolean>(); } Simple simple = new Simple(); simple.booleanList.add(true); EvaluationContext context = SimpleEvaluationContext.forReadOnlyDataBinding().build(); // "false" is passed in here as a String. SpEL and the conversion service // will recognize that it needs to be a Boolean and convert it accordingly.parser.parseExpression("booleanList[0]").setValue(context, simple, "false"); // b is false Boolean b = simple.booleanList.get(0); Kotlin class Simple { \xa0  var booleanList: MutableList<Boole

In [54]:
%%time
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can experiment to find which batch size leads to best results
                                               convert_to_tensor=True)
text_chunk_embeddings  

CPU times: total: 35.1 s
Wall time: 40.9 s


tensor([[ 0.0249, -0.0249, -0.0234,  ...,  0.0007,  0.0794, -0.0078],
        [ 0.0362, -0.0242, -0.0043,  ...,  0.0340,  0.1058,  0.0037],
        [ 0.0245, -0.0561, -0.0037,  ..., -0.0182,  0.0695,  0.0057],
        ...,
        [-0.0012, -0.0142,  0.0081,  ...,  0.0203, -0.0066,  0.0148],
        [ 0.0009,  0.0007,  0.0173,  ...,  0.0344,  0.0179,  0.0154],
        [-0.0446, -0.0184, -0.0290,  ...,  0.0433, -0.0105,  0.0342]],
       device='cuda:0')

In [55]:
pages_and_chunks_over_min_token_len[419]

{'page_number': 280,
 'sentence_chunk': 'generic types in expressions, SpEL attempts conversions to maintain type correctness for any objects it encounters. What does this mean in practice?Suppose assignment, using setValue(), is being used to set a List property. The type of the property is actually List<Boolean>. SpEL recognizes that the elements of the list need to be converted to Boolean before being placed in it. The following example shows how to do so: Java class Simple { \xa0  public List<Boolean> booleanList = new ArrayList<Boolean>(); } Simple simple = new Simple(); simple.booleanList.add(true); EvaluationContext context = SimpleEvaluationContext.forReadOnlyDataBinding().build(); // "false" is passed in here as a String. SpEL and the conversion service // will recognize that it needs to be a Boolean and convert it accordingly.parser.parseExpression("booleanList[0]").setValue(context, simple, "false"); // b is false Boolean b = simple.booleanList.get(0); Kotlin class Simple { 

In [65]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)


In [66]:

with open('text_chunks_and_embeddings_df.json', 'w') as fp:
    json.dump(text_chunks_and_embeddings_df.to_json(orient='records'), fp)