In [None]:
# !pip install pymongo

In [None]:
import setup
from pprint import pprint

In [None]:
# Initialise Django through the local `setup` module before anything else.
setup.init_django()
# NOTE(review): `helpers` is imported only after init_django(); presumably it
# needs Django to be configured first -- confirm before moving this import.
import helpers

In [None]:
!pwd

In [None]:
## CONSTANTS
# Secrets are read through helpers.config instead of being hard-coded here.
# default=None is passed, so presumably each value is None when the setting is
# missing -- confirm against helpers.config.
HUGGING_FACE_KEY = helpers.config('HUGGING_FACE_KEY', default=None, cast=str)
MONGO_USER = helpers.config('MONGO_USER', default=None, cast=str)
MONGO_PASSWORD = helpers.config('MONGO_PASSWORD', default=None, cast=str)

In [None]:
## MONGO SETUP

In [None]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# Fail fast with a clear message instead of building a URI that literally
# contains "None" as user name or password.
if not MONGO_USER or not MONGO_PASSWORD:
    raise RuntimeError("MONGO_USER and MONGO_PASSWORD must be set to connect to MongoDB")

# User name and password are percent-escaped so that reserved characters
# such as '@', ':' or '/' in the credentials cannot break URI parsing.
client = MongoClient(
    "mongodb+srv://"
    f"{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}"
    "@system-design.3tsw599.mongodb.net/?retryWrites=true&w=majority"
)

# Database that holds the system-design documents.
db = client["sys_design_data"]

# Collection with one document per topic.
collection = db["Topics"]

In [None]:
## EMBEDDING SETUP
    # model used --> all-MiniLM-L6-v2 [ HUGGING FACE INFERENCE API]

In [None]:
import requests

# Hugging Face Inference API endpoint for the
# sentence-transformers/all-MiniLM-L6-v2 model.
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
# Bearer-token Authorization header built from HUGGING_FACE_KEY.
headers = {"Authorization": f"Bearer {HUGGING_FACE_KEY}"}

def generate_embedding(content):
    """Request an embedding for ``content`` from the Hugging Face Inference API.

    Posts ``{"inputs": content}`` as JSON to ``API_URL`` together with the
    module-level ``headers`` and returns the decoded JSON response.

    Args:
        content: The value sent as the ``inputs`` field of the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the response status code is not 200.
    """
    # `headers` has to be passed by keyword: the second positional parameter
    # of requests.post is `data`, so passing it positionally sent the
    # Authorization dict as a form body instead of as request headers.
    response = requests.post(API_URL, headers=headers, json={"inputs": content})

    if response.status_code != 200:
        raise ValueError(f"Embedding Failed with status code: {response.status_code} {response.text}")
    return response.json()
                        

In [None]:
# Example call: embed a sample question with generate_embedding.
generate_embedding("What is system design?")

In [None]:
import json
def generate_embedding_jina(text):
    """Return the Jina embedding for ``text``.

    Sends ``text`` to the Jina embeddings endpoint using the ``jina-clip-v1``
    model and returns the ``embedding`` field of the first entry in the
    response's ``data`` list.

    Args:
        text: The text to embed.

    Returns:
        The value of ``response.json()['data'][0]['embedding']``.

    Raises:
        ValueError: If the response status code is not 200. Raising (rather
            than returning the error text) keeps an error message from being
            mistaken for an embedding by callers that store or query with
            the result.
    """
    url = "https://api.jina.ai/v1/embeddings"
    api_key = helpers.config("JINA_EMBEDDING", default=None, cast=str)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Request body: a single text input embedded as floats.
    payload = {
        "model": "jina-clip-v1",
        "embedding_type": "float",
        "input": [
            {"text": text},
        ]
    }

    # `json=` lets requests serialise the payload itself.
    response = requests.post(url, headers=headers, json=payload)

    if response.status_code != 200:
        raise ValueError(
            f"Jina embedding failed with status code: {response.status_code} {response.text}"
        )
    return response.json()['data'][0]['embedding']

In [None]:
# Example call: embed a sample question with generate_embedding_jina.
generate_embedding_jina("what is system design?")

In [None]:
## DATA CLEANING USING NLTK
# !pip install nltk

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources: the stop-word list, the 'punkt' tokenizer data
# and WordNet.
# NOTE(review): some newer NLTK releases reportedly need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Single lemmatizer instance, created once at module level.
lemmatizer = WordNetLemmatizer()

# English stop words held in a set for membership tests.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise ``text`` into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase the text; remove tokens starting with
    ``http`` or ``www``; remove ``@word`` mentions and ``#`` characters;
    remove every character that is not an ASCII letter or whitespace;
    tokenise with ``word_tokenize``; drop tokens found in ``stop_words``;
    lemmatise the remaining tokens and join them with single spaces.
    """
    lowered = text.lower()

    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_tags = re.sub(r'\@\w+|\#', '', without_urls)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_tags)

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

In [None]:
## PDF DATA EXTRACTION USING PDFMINER

In [None]:
# !pip install pdfminer.six
# !pip install 'pdfminer.six[image]'

In [None]:
# extract all images from pdf [command-line command]
# ! pdf2txt.py [pdf-path] --output-dir [output-path]

In [None]:
## AFTER EXTRACTING THE TEXT WRITE A SCRIPT TO SELECT EACH TOPIC [EACH HEADING IN PDF] 

In [None]:
#EXTRACTING TOPIC FROM THE TEXT
# Read the topic headings (one per line) into parsed_data, stripped of
# surrounding whitespace.
file_path = 'data/topics'
with open(file_path, 'r') as file:
    stripped_lines = (line.strip() for line in file)
    # Filter on the stripped text so whitespace-only lines are skipped too;
    # an empty heading would otherwise match the first blank line of the
    # extracted text when it is looked up with find_line_start_index.
    parsed_data = [heading for heading in stripped_lines if heading]

In [None]:
# Read the full extracted text into memory as a single string.
file_path = 'data/ttx'  

with open(file_path, 'r') as file:
    # Read the whole file at once; later cells slice this string by character offset.
    file_contents = file.read()

In [None]:
## FINDS THE INDICES FOR US SO THAT WE CAN EXTRACT CONTENT FOR EACH TOPIC
def find_line_start_index(text, target_line):
    """Return the offset in ``text`` where the first line equal to ``target_line`` starts.

    Lines are compared after stripping surrounding whitespace on both sides.

    Args:
        text: The full text to search.
        target_line: The line to look for.

    Returns:
        The character index of the start of the first matching line, such
        that ``text[index:]`` begins with that line, or ``-1`` when no line
        matches.
    """
    wanted = target_line.strip()
    offset = 0

    # keepends=True keeps every line's terminator, so advancing by
    # len(raw_line) is exact for any terminator, including two-character
    # '\r\n' (adding a fixed 1 per line drifts on such input).
    for raw_line in text.splitlines(keepends=True):
        if raw_line.strip() == wanted:
            return offset
        offset += len(raw_line)

    return -1

In [None]:
# Build one document per topic: the cleaned heading, the cleaned text between
# this heading and the next heading found in the file, and the Jina embedding
# of that text.
data_list = []
last_position = len(parsed_data) - 1
for i, line in enumerate(parsed_data):
    index = find_line_start_index(file_contents, line)

    if i == last_position:  # last Topic
        # NOTE(review): the last topic is only printed, never appended to
        # data_list, so it is neither embedded nor inserted -- confirm that
        # this is intended.
        print(i, line)
        print(file_contents[index + len(line) + 1:])
        break

    if index == -1:
        # Heading not present in the extracted text: slicing from -1 would
        # silently produce unrelated content, so skip this topic instead.
        print(f"Skipping topic {i}: heading not found in text: {line!r}")
        continue

    # The content ends at the closest following heading that exists in the
    # text. Walking forward over all remaining headings avoids both the
    # IndexError of a fixed i+2 lookup and slicing with -1 as the end.
    next_index = -1
    for following in parsed_data[i + 1:]:
        next_index = find_line_start_index(file_contents, following)
        if next_index != -1:
            break
        print(f"Heading not found, trying the next one: {following!r}")
    if next_index == -1:
        # No later heading exists in the text: take everything to the end.
        next_index = len(file_contents)

    content = clean_text(file_contents[index + len(line) + 1:next_index])

    embedding = generate_embedding_jina(content)
    if not isinstance(embedding, list):
        # Never store anything but a vector in content_embedding.
        raise ValueError(f"Embedding failed for topic {i} ({line!r}): {embedding}")

    data_list.append({
        'Topic': clean_text(line),
        'content': content,
        'content_embedding': embedding,
    })

In [None]:
# for data in data_list:
#     pprint(data)
#     break

In [None]:
# Bulk-insert the topic documents; the cell displays the inserted ids.
# NOTE(review): re-running this cell attempts to insert the same documents
# again -- confirm how duplicates should be handled.
collection.insert_many(data_list).inserted_ids

In [None]:
# for i,line in enumerate(parsed_data):
#     index = find_line_start_index(file_contents,line)
#     print(i,file_contents[index:index+len(line)+1])

In [1]:
## VECTOR INDEX SEARCH MONGODB

In [None]:
def get_mongo_data(message, limit=5, num_candidates=23):
    """Run a ``$vectorSearch`` aggregation for ``message`` on ``collection``.

    ``message`` is embedded with ``generate_embedding_jina`` and used as the
    query vector against the ``content_embedding`` path through the
    ``slac_rag_bot`` index. The ``$project`` stage selects the ``Topic`` and
    ``content`` fields.

    Args:
        message: The query text to embed and search for.
        limit: Value of the ``limit`` option of ``$vectorSearch``
            (defaults to 5, the previously hard-coded value).
        num_candidates: Value of the ``numCandidates`` option of
            ``$vectorSearch`` (defaults to 23, the previously hard-coded
            value).

    Returns:
        Whatever ``collection.aggregate`` returns, or an empty list when the
        aggregation raises. Exceptions raised while embedding ``message``
        are not caught here, because the pipeline is built before the
        ``try`` block.
    """
    query_vector = generate_embedding_jina(message)

    pipeline = [
        {
            '$vectorSearch': {
                'index': 'slac_rag_bot',
                'path': 'content_embedding',
                'queryVector': query_vector,
                'numCandidates': num_candidates,
                'limit': limit,
            }
        },
        {
            '$project': {
                'Topic': 1,
                'content': 1,
            }
        },
    ]

    try:
        return collection.aggregate(pipeline)
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back an empty
        # result so callers can still iterate.
        print(f"Error in aggregation: {e}")
        return []

In [None]:
## EMBEDDING THE CONTENT FIELD SO THAT WE CAN USE IT LATER IN RAG FOR CONTEXT

In [None]:
# Example query: vector search for a sample phrase.
data =get_mongo_data("caching strategies")

In [None]:
# Materialise the result into a list so it can be counted and then iterated.
data = list(data)
print(len(data))
# Print each returned document on its own line.
for d in data:
    print(d)