In [None]:
!pip install pymupdf
!pip install unidecode
!pip install langchain
!pip install sentence_transformers
!pip install pinecone-client

In [None]:
import fitz
# Location of the PDF to index (Colab upload directory).
my_path = "/content/ubuntu-server-guide-2023-10-15.pdf"

# Open the PDF; `doc` is iterated page by page in the cells that follow.
doc = fitz.open(filename=my_path)

In [None]:
# Print the text of every page block by block, emitting a blank separator
# each time the block number changes.
for page in doc:
    output = page.get_text("blocks")
    previous_block_id = 0  # reset the tracker at the start of every page
    for block in output:
        # Entries are indexed positionally: [4] is the text, [5] the block
        # number and [6] the block type.
        if block[6] != 0:  # We only take the text
            continue
        if previous_block_id != block[5]:
            print("\n")
            # Remember the block just seen; the original never updated this
            # tracker, so the comparison was always made against 0.
            previous_block_id = block[5]
        print(block[4])

In [None]:
from unidecode import unidecode
# Gather the "blocks" entries of every page into one flat list.
output = []
for page in doc:
    output.extend(page.get_text("blocks"))

# The tracker is initialised once and not reassigned in this cell, so every
# block number is compared against 0.
previous_block_id = 0
for block in output:
    # Only text entries (type 0) whose block number differs from the tracker
    # are converted.
    if block[6] == 0 and block[5] != previous_block_id:
        # ASCII-fold the block text; each pass overwrites the previous value,
        # so only the last converted block remains in `plain_text`.
        plain_text = unidecode(block[4])

In [None]:
# Map each 1-based page number to that page's list of blocks, taken from the
# 'blocks' entry of the page's dictionary representation.
block_dict = {}
for page_num, page in enumerate(doc, start=1):
    block_dict[page_num] = page.get_text('dict')['blocks']

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Flatten every text span of the document into one row per span and record
# simple style features (bold font, all-caps text) alongside its bounding box.

# Empty frame kept from the original cell; it is not referenced again here.
spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])

rows = []
for page_num, blocks in block_dict.items():
    for block in blocks:
        if block['type'] != 0:  # only type-0 blocks carry 'lines' of text spans
            continue
        for line in block['lines']:
            for span in line['spans']:
                text = unidecode(span['text'])
                if text.replace(" ", "") == "":
                    continue  # skip spans made only of spaces

                xmin, ymin, xmax, ymax = span['bbox']
                span_font = span['font']
                font_size = span['size']

                # The font name is the only bold signal used here.
                is_bold = "bold" in span_font.lower()
                # Ignore bracketed/parenthesised fragments when testing for
                # all-caps. A raw string keeps the pattern identical while
                # avoiding invalid-escape warnings for "\(" and "\[".
                is_upper = re.sub(r"[\(\[].*?[\)\]]", "", text).isupper()

                rows.append((xmin, ymin, xmax, ymax, text,
                             is_upper, is_bold, span_font, font_size))

# Build the frame once, after all rows are collected. The original rebuilt it
# for every appended row and left `span_df` undefined when no span qualified.
span_df = pd.DataFrame(
    rows,
    columns=['xmin', 'ymin', 'xmax', 'ymax', 'text',
             'is_upper', 'is_bold', 'span_font', 'font_size'],
)

In [None]:
# Score every span: 1 when its font is bold, 0 otherwise.
# A comprehension over the column replaces the row-wise `iterrows` loop; that
# loop bound the global name `index`, which a later cell uses for the
# Pinecone handle, so re-running this cell used to break upserts and queries.
span_num_occur = {}  # kept from the original cell; not used here
span_scores = [int(bool(bold_flag)) for bold_flag in span_df['is_bold']]

# Distinct scores and how many spans received each of them.
values, counts = np.unique(span_scores, return_counts=True)

In [None]:
# Pair each distinct score with the number of spans that received it.
style_dict = dict(zip(values, counts))

# Display the (score, count) pairs ordered from rarest to most frequent.
sorted(style_dict.items(), key=lambda pair: pair[1])

In [None]:
# The most frequent score is treated as body text ('p'); higher scores are
# tagged as headings ('h') and lower ones as 's'.
p_size = max(style_dict.items(), key=lambda pair: pair[1])[0]
idx = 0  # running counter, reset at the paragraph score; not read in this cell
tag = {}

for score_value in sorted(values, reverse=True):
    idx += 1
    if score_value == p_size:
        idx = 0
        tag[score_value] = 'p'
    elif score_value > p_size:
        tag[score_value] = 'h'
    else:
        tag[score_value] = 's'

In [None]:
# Translate every span score into its tag and attach the result to the span
# table as the 'tag' column.
span_tags = list(map(tag.__getitem__, span_scores))
span_df['tag'] = span_tags

In [None]:
# Pair every heading span with the body text that follows it.
# Spans are walked in document order: a heading closes the text collected so
# far and opens a new section; any other span is appended to the open section.
headings_list = []
text_list = []
tmp = []      # body lines collected since the last heading
heading = ''  # most recent heading seen; not read in this cell

# Iterate the two needed columns directly with dedicated loop names. The
# original loop rebound the globals `tag` (the score -> tag dict) and `index`
# (later the Pinecone handle), so re-running earlier or later cells failed.
for span_text, span_tag in zip(span_df['text'], span_df['tag']):
    if 'h' in span_tag:
        headings_list.append(span_text)
        text_list.append('\n'.join(tmp))  # close the previous section
        tmp = []
        heading = span_text
    else:
        tmp.append(span_text)

# Close the final section, then drop the first entry: it holds whatever came
# before the first heading and therefore has no heading to pair with.
text_list.append('\n'.join(tmp))
text_list = text_list[1:]
text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])

In [None]:
# Combine heading and body into the text that gets chunked and embedded.
# A newline separates the two; plain concatenation fused the last word of the
# heading with the first word of the body.
text_df['merged'] = text_df['heading'] + '\n' + text_df['content']

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration: small English BGE model, run on CPU, with
# embedding normalisation switched off.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Embedder shared by the indexing cell and every query cell.
hf = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
import os
from getpass import getpass

import pinecone

# SECURITY: the API key used to be hardcoded in this cell. Any key that was
# committed that way should be treated as leaked and rotated.
# The key is now read from the PINECONE_API_KEY environment variable, falling
# back to an interactive prompt so the notebook still works in Colab.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
# Not the last expression of the cell, so its result is not displayed.
pinecone.list_indexes()
# Handle used by the upsert and query cells below.
index = pinecone.Index("ubuntuir")

In [None]:
# Split the documentation dataset according to the specifications below.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of at most 100 units as measured by `len`, overlapping by 20, with
# each chunk's start index recorded.
splitter_options = {
    "chunk_size": 100,
    "chunk_overlap": 20,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [None]:
# Generate embeddings for the documentation chunks with the pre-trained model
# and store them in the Pinecone index.
#
# Passages are embedded with `embed_documents`. The original used
# `embed_query`, which (per the LangChain BGE wrapper) prefixes the text with
# the instruction meant for search questions, so stored passages and incoming
# queries were encoded as if both were questions.
# Chunks are embedded and upserted in batches, not one request per vector.
UPSERT_BATCH_SIZE = 100

counter = 0  # running total; also provides the sequential vector ids
for info in text_df['merged']:
    chunk_texts = [chunk.page_content for chunk in text_splitter.create_documents([info])]
    for start in range(0, len(chunk_texts), UPSERT_BATCH_SIZE):
        batch = chunk_texts[start:start + UPSERT_BATCH_SIZE]
        batch_embeddings = hf.embed_documents(batch)
        # Same record layout as before: (id, vector, {"text": chunk}).
        vectors = [
            (str(counter + offset), vector, {"text": chunk_text})
            for offset, (chunk_text, vector) in enumerate(zip(batch, batch_embeddings))
        ]
        index.upsert(vectors)
        counter += len(batch)
        print(f'Embedding count: {counter}')
print(f'Total number of embeddings: {counter}')

In [None]:
# Query 1: embed the question, then display its 15 best matches with metadata.
query = "How do I upgrade an ubuntu server?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 2: embed the question, then display its 15 best matches with metadata.
query = "How do I ssh into an external server?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 3: embed the question, then display its 15 best matches with metadata.
query = "How do I create a new user account?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 4: embed the question, then display its 15 best matches with metadata.
query = "How do I copy files over into an external server?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 5: embed the question, then display its 15 best matches with metadata.
query = "How do I reset a forgotten Ubuntu account password?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 6: embed the question, then display its 15 best matches with metadata.
query = "How do I install software on Ubuntu?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 7: embed the question, then display its 15 best matches with metadata.
query = "How do I check the hardware specifications of a server?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 8: embed the question, then display its 15 best matches with metadata.
query = "How can I delete a non-empty directory?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 9: embed the question, then display its 15 best matches with metadata.
query = "How do I edit the /etc/fstab file?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 10: embed the question, then display its 15 best matches with metadata.
query = "How do I edit a read-only file in Ubuntu?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 11: embed the question, then display its 15 best matches with metadata.
query = "How can I rename a file in the terminal?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 12: embed the question, then display its 15 best matches with metadata.
query = "How can I synchronize the time in Ubuntu?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 13: embed the question, then display its 15 best matches with metadata.
query = "How do I configure Samba as a file server?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 14: embed the question, then display its 15 best matches with metadata.
query = "What is the difference between the “mv” and “cp” commands?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)

In [None]:
# Query 15: embed the question, then display its 15 best matches with metadata.
query = "How do I install Nvidia drivers on Ubuntu?"
embedding = hf.embed_query(query)
index.query(vector=embedding, top_k=15, include_metadata=True)