In [None]:
# !pip install pymongo
# !pip install nltk
# !pip install pdfminer.six
# !pip install 'pdfminer.six[image]'

In [57]:
import setup
from pprint import pprint

In [58]:
## NECESSARY TO USE CURRENT ENVIRONMENT
# Initialise Django through the local `setup` module, then import `helpers`.
# NOTE(review): presumably `helpers` needs Django configured at import time,
# which would make the order of these two statements significant — confirm.
setup.init_django()
import helpers

In [59]:
!pwd

/Users/salahuddinpalagiri/Desktop/Repos/Mera/slack-rag/src


In [60]:
## CONSTANTS
# Secrets are looked up through the project's `helpers.config` instead of being hard-coded.
# NOTE(review): if `helpers.config` applies `cast` to the default value (as
# python-decouple does), a missing key yields the string 'None' rather than
# None — confirm, because later code cannot then detect a missing secret.
HUGGING_FACE_KEY = helpers.config('HUGGING_FACE_KEY', default=None, cast=str)
MONGO_USER = helpers.config('MONGO_USER', default=None, cast=str)
MONGO_PASSWORD = helpers.config('MONGO_PASSWORD', default=None, cast=str)

In [61]:
## MONGO SETUP

In [62]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# Fail fast with a clear message instead of building a URI around missing credentials.
if not MONGO_USER or not MONGO_PASSWORD:
    raise ValueError("MONGO_USER and MONGO_PASSWORD must be set to connect to MongoDB")

# Credentials are percent-escaped because characters such as '@', ':' or '/'
# inside a username or password would otherwise corrupt the connection string.
# NOTE(review): assumes the configured values are the raw (not already escaped)
# credentials — confirm.
client = MongoClient(
    f"mongodb+srv://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}"
    "@system-design.3tsw599.mongodb.net/?retryWrites=true&w=majority"
)

# Database used by this notebook.
db = client["sys_design_data"]

# Collection the notebook later inserts one document per topic into.
collection = db["topics"]

In [63]:
## EMBEDDING SETUP
    # model used --> JINA [https://jina.ai/embeddings/]

In [64]:
import requests

# Endpoint and auth header used by generate_embedding.
# NOTE(review): the section header names Jina as the model in use, while this
# URL points at a Hugging Face sentence-transformers model — confirm which one
# is intended.
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": f"Bearer {HUGGING_FACE_KEY}"}

def generate_embedding(content):
    """Embed ``content`` through the inference endpoint at ``API_URL``.

    Args:
        content: Value sent as the ``inputs`` field of the JSON request body.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the endpoint answers with a status code other than 200.
    """
    # `headers` has to be passed by keyword: the second positional parameter of
    # requests.post() is `data`, so passing it positionally sent the auth
    # header as a request body and left the request unauthenticated.
    response = requests.post(API_URL, headers=headers, json={"inputs": content})

    if response.status_code != 200:
        raise ValueError(f"Embedding Failed with status code: {response.status_code} {response.text}")
    return response.json()
                        

In [65]:
# generate_embedding("What is system design?")

In [66]:
import json
def generate_embedding_jina(text):
    """Embed ``text`` with the Jina embeddings API and return the vector.

    Args:
        text: String to embed; sent as the single item of the request's
            ``input`` list.

    Returns:
        The ``embedding`` value of the first entry in the response's ``data`` list.

    Raises:
        ValueError: If the API answers with a status code other than 200.
    """
    url = "https://api.jina.ai/v1/embeddings"
    api_key = helpers.config("JINA_EMBEDDING", default=None, cast=str)
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "jina-clip-v1",
        "embedding_type": "float",
        "input": [
            {"text": text},
        ]
    }

    response = requests.post(url, headers=request_headers, data=json.dumps(payload))

    if response.status_code != 200:
        # Raise instead of returning the error body: a returned string would be
        # stored or queried by callers as if it were an embedding vector.
        raise ValueError(
            f"Jina embedding failed with status code: {response.status_code} {response.text}"
        )
    return response.json()['data'][0]['embedding']

In [67]:
# generate_embedding_jina("what is system design?")

In [68]:
## PDF DATA EXTRACTION USING PDFMINER

In [69]:
# extract all images from pdf [command-line command]
# ! pdf2txt.py [pdf-path] --output-dir [output-path]

In [70]:
## AFTER EXTRACTING THE TEXT WRITE A SCRIPT TO SELECT EACH TOPIC [EACH HEADING IN PDF] 

In [71]:
#EXTRACTING TOPIC FROM THE TEXT
file_path = 'data/topics'
# Explicit UTF-8 so the result does not depend on the platform's default
# encoding (the topic titles contain non-ASCII characters such as curly quotes).
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.readlines()

# One stripped entry per non-blank line. Whitespace-only lines are skipped too:
# they would otherwise become empty-string topics, which match the first blank
# line of the text when their position is looked up later.
parsed_data = [line.strip() for line in data if line.strip()]

In [72]:
# Sanity check: number of topic headings kept from data/topics.
print(len(parsed_data))

180


In [73]:
# Read the full text in which the topic headings are located later on.
file_path = 'data/ttx_copy'

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    file_contents = file.read()

In [74]:
## DATA CLEANING USING NLTK

In [75]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on.
# 'stopwords' backs stopwords.words('english'); 'punkt' and 'wordnet' are
# presumably required by word_tokenize and WordNetLemmatizer — confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/salahuddinpalagiri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/salahuddinpalagiri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/salahuddinpalagiri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [76]:
# Shared by clean_text: one lemmatizer instance and the English stop-word set,
# built once here rather than on every call.
lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise ``text`` into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case; remove URLs; remove @mentions and '#'
    characters; remove every character that is neither an ASCII letter nor
    whitespace; tokenise; drop stop words; lemmatise; join with single spaces.
    """
    lowered = text.lower()

    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_tags = re.sub(r'\@\w+|\#', '', without_urls)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_tags)

    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

In [77]:
# clean_text("Over to you: Do you have any other Kafka use cases to share?")

In [78]:
## FINDS THE INDICES FOR US SO THAT WE CAN EXTRACT CONTENT FOR EACH TOPIC
def find_line_start_index(text, target_line):
    """Return the character offset in ``text`` at which ``target_line`` starts.

    Lines are compared after stripping surrounding whitespace on both sides,
    and the first matching line wins.

    Args:
        text: The text to search.
        target_line: The line to look for.

    Returns:
        Offset of the first character of the matching line (including any
        leading whitespace on that line), or -1 if no line matches.
    """
    wanted = target_line.strip()
    offset = 0

    # keepends=True keeps each line's real terminator, so the running offset
    # stays exact for "\r\n" as well; assuming one character per line break
    # drifts by one for every CRLF-terminated line before the match.
    for line in text.splitlines(keepends=True):
        if line.strip() == wanted:
            return offset
        offset += len(line)

    return -1

In [79]:
def extract_remaining_text(input_string, keyword="Over to you"):
    """Return the part of ``input_string`` that precedes the first ``keyword``.

    When ``keyword`` does not occur, the literal string "Keyword not found" is
    returned instead, so callers have to compare against that sentinel.
    """
    cut = input_string.find(keyword)
    if cut == -1:
        return "Keyword not found"
    return input_string[:cut]

In [80]:
# extract_remaining_text("""Kafka was originally built for massive log processing. It retains messages until expiration and lets
# consumers pull messages at their own pace.

# Let’s review the popular Kafka use cases.
# Log processing and analysis

# -
# - Data streaming in recommendations
# System monitoring and alerting
# -
# CDC (Change data capture)
# -
# System migration
# -

# Over to you: Do you have any other Kafka use cases to share?""")

In [81]:
## DATA FORMATTING AND STORAGE IN MONGO [CLEANING + EMBEDDING + STORAGE]

In [87]:
import os
# `topics` is a second name for the parsed headings list (same object, not a copy).
topics = parsed_data

# Path to the folder containing images
images_folder = "data/other_images"

# Function to extract number from filename
def extract_number(filename):
    """Return the first run of digits in ``filename`` as an int.

    Names without any digit yield ``float('inf')``, which places them last
    when the result is used as a sort key.
    """
    digits = re.search(r'\d+', filename)
    if digits is None:
        return float('inf')
    return int(digits.group())

# Collect the regular files in the image folder, ordered by the first number in each name.
images_list = [
    entry
    for entry in os.listdir(images_folder)
    if os.path.isfile(os.path.join(images_folder, entry))
]
images_list.sort(key=extract_number)
print(len(images_list))

181


In [92]:
def format_data(topic, content, image_url):
    """Build the document stored for one topic.

    Args:
        topic: Topic heading.
        content: Raw text belonging to the topic.
        image_url: Value stored under ``image_url`` (callers pass an image
            file name).

    Returns:
        A new dict with the keys ``Topic``, ``content``, ``content_embedding``
        and ``image_url``.
    """
    # Drop everything from the "Over to you" marker onwards when it is present.
    trimmed = extract_remaining_text(content)
    if trimmed != "Keyword not found":
        content = trimmed

    # Build a fresh dict from the parameters. The earlier version wrote into a
    # module-level `data` and read a module-level `line`, so it only worked when
    # called from a loop that happened to define both names.
    return {
        'Topic': topic,
        'content': content,
        'content_embedding': generate_embedding_jina(f"{topic} {clean_text(content)}"),
        'image_url': image_url,
    }

In [93]:
# Build one document per topic: a topic's text is the slice of `file_contents`
# between its own heading and the next topic's heading.
data_list = []
flag = False  # NOTE(review): never read in this cell — looks like a leftover
for i,line in enumerate(parsed_data):
    # NOTE(review): keep the module-level names `data` and `line` — format_data
    # may read them as globals instead of using its parameters; verify before renaming.
    data = {}
    index = find_line_start_index(file_contents,line)
    if i==len(parsed_data)-1: #last Topic
        # The last topic has no successor, so its content runs to the end of the text.
        data_list.append(format_data(line, file_contents[index+len(line)+1:], images_list[i]))
        break
    next_index = find_line_start_index(file_contents,parsed_data[i+1])
    if next_index == -1:
        # Next heading was not found in the text: fall back to the heading after it.
        # NOTE(review): parsed_data[i+2] raises IndexError when i+1 is the last
        # topic, and the fallback result is not re-checked for -1.
        print(next_index)
        next_index = find_line_start_index(file_contents,parsed_data[i+2])
        print(parsed_data[i+2])
    else:
        pass
        # print(parsed_data[i+1])
    # print(index,next_index)
    
    # print(line)
    # print(index,len(line)+1)

    # Skip past the heading itself (+1 for its line break) up to the next heading.
    # NOTE(review): `index` is not checked for -1 either.
    content = file_contents[index+len(line)+1:next_index]
    data = format_data(line, content, images_list[i])
    # print(i,line)
    # print(content)
    data_list.append(data)
    # print("----------------------------------------------")

In [95]:
# Spot check: image file name attached to the last document.
print(data_list[-1]['image_url'])

X1119.jpg


In [97]:
# Sanity check: number of documents built.
print(len(data_list))

180


In [98]:
# for data in data_list:
#     pprint(data)
#     break

In [99]:
# Insert every prepared document and display the generated ids.
# NOTE(review): nothing here guards against running the cell a second time —
# confirm how repeated runs should be handled.
collection.insert_many(data_list).inserted_ids

[ObjectId('668d27a50515d18c900324c4'),
 ObjectId('668d27a50515d18c900324c5'),
 ObjectId('668d27a50515d18c900324c6'),
 ObjectId('668d27a50515d18c900324c7'),
 ObjectId('668d27a50515d18c900324c8'),
 ObjectId('668d27a50515d18c900324c9'),
 ObjectId('668d27a50515d18c900324ca'),
 ObjectId('668d27a50515d18c900324cb'),
 ObjectId('668d27a50515d18c900324cc'),
 ObjectId('668d27a50515d18c900324cd'),
 ObjectId('668d27a50515d18c900324ce'),
 ObjectId('668d27a50515d18c900324cf'),
 ObjectId('668d27a50515d18c900324d0'),
 ObjectId('668d27a50515d18c900324d1'),
 ObjectId('668d27a50515d18c900324d2'),
 ObjectId('668d27a50515d18c900324d3'),
 ObjectId('668d27a50515d18c900324d4'),
 ObjectId('668d27a50515d18c900324d5'),
 ObjectId('668d27a50515d18c900324d6'),
 ObjectId('668d27a50515d18c900324d7'),
 ObjectId('668d27a50515d18c900324d8'),
 ObjectId('668d27a50515d18c900324d9'),
 ObjectId('668d27a50515d18c900324da'),
 ObjectId('668d27a50515d18c900324db'),
 ObjectId('668d27a50515d18c900324dc'),
 ObjectId('668d27a50515d1

In [100]:
# for i,line in enumerate(parsed_data):
#     index = find_line_start_index(file_contents,line)
#     print(i,file_contents[index:index+len(line)+1])

0 Explaining 9 types of API testing

1 Top 5 Kafka use cases

2 How is data sent over the internet? What does that have to do with the OSI model? How does TCP/IP fit into this?

3 Top 5 common ways to improve API performance

4 CI/CD Simplified Visual Guide

5 There are over 1,000 engineering blogs. Here are my top 9 favorites:

6 REST API Authentication Methods

7 Linux boot Process Explained

8 How do SQL Joins Work?

9 Netflix's Tech Stack

10 Top Architectural Styles

11 What does ACID mean?

12 Oauth 2.0 Explained With Simple Terms

13 The Evolving Landscape of API Protocols in 2023

14 Explaining 8 Popular Network Protocols in 1 Diagram

15 Data Pipelines Overview

16 CAP, BASE, SOLID, KISS, What do these acronyms mean?

17 GET, POST, PUT... Common HTTP “verbs” in one figure

18 How Do C++, Java, Python Work?

19 Top 12 Tips for API Security

20 Our recommended materials to crack your next tech interview

21 How To Release A Mobile App

22 A handy cheat sheet for the most popular

In [None]:
## VECTOR INDEX SEARCH MONGODB

In [None]:
def get_mongo_data(message, limit=5, num_candidates=23):
    """Run a vector search over ``collection`` for ``message``.

    Args:
        message: Query text; embedded with ``generate_embedding_jina``.
        limit: Value of the ``$vectorSearch`` ``limit`` field (default 5).
        num_candidates: Value of the ``$vectorSearch`` ``numCandidates`` field
            (default 23).

    Returns:
        Whatever ``collection.aggregate`` returns for the pipeline, projected
        to ``Topic`` and ``content``; an empty list if the aggregation raises.
    """
    pipeline = [
        {
            '$vectorSearch': {
                'index': 'slac_rag_bot',
                'path': 'content_embedding',
                'queryVector': generate_embedding_jina(message),
                'numCandidates': num_candidates,
                'limit': limit,
            }
        },
        {
            '$project': {
                'Topic': 1,
                'content': 1,
            }
        },
    ]
    try:
        return collection.aggregate(pipeline)
    except Exception as e:
        # Best-effort by design: report the failure and hand back no results.
        print(f"Error in aggregation: {e}")
        return []

In [None]:
## EMBEDDING THE CONTENT FIELD SO THAT WE CAN USE IT LATER IN RAG FOR CONTEXT

In [None]:
# Example query against the vector index.
# NOTE(review): `data` is also the name used for the per-topic dict and the raw
# topic lines in earlier cells; this assignment replaces that value.
data =get_mongo_data("caching strategies")

In [None]:
# Materialise the query result into a list so it can be counted and then printed.
data = list(data)
print(len(data))
for d in data:
    print(d)

In [None]:
# !pip install matplotlib

In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# List of topics
# NOTE(review): `topics` and `images_folder` are assigned the same values in an
# earlier cell; this repeat looks redundant — confirm before removing.
topics = parsed_data

# Path to the folder containing images
images_folder = "data/other_images"


# Function to extract number from filename
def extract_number(filename):
    """Sort key: the first run of digits in ``filename`` as an int, else infinity."""
    found = re.search(r'\d+', filename)
    return float('inf') if found is None else int(found.group())

# Collect the regular files in the image folder, ordered by the first number in each name.
image_files = [
    entry
    for entry in os.listdir(images_folder)
    if os.path.isfile(os.path.join(images_folder, entry))
]
image_files.sort(key=extract_number)

print(len(image_files))
print(image_files)


In [None]:
# Function to display an image
def display_image(image_path):
    """Read the image at ``image_path`` and show it with matplotlib, axes hidden."""
    plt.imshow(mpimg.imread(image_path))
    plt.axis('off')
    plt.show()

# Walk through the topics and show the image at the same position for each one.
for i, topic in enumerate(topics):
    if i >= len(image_files):
        print(f"No image available for {topic}")
        continue

    image_path = os.path.join(images_folder, image_files[i])
    print(f"Topic: {topic}")
    print(f"Displaying image: {image_files[i]}")
    display_image(image_path)
    # Waits for Enter before moving on, so this cell cannot run unattended.
    input("Press Enter to continue to the next topic...")