In [1]:
### Data ingestion: load a local plain-text file as LangChain documents

from langchain_community.document_loaders import TextLoader

# The speech contains non-ASCII punctuation (curly quotes, en dashes). Without
# an explicit encoding the file is decoded with the platform's default codec,
# which is not UTF-8 everywhere, so request UTF-8 explicitly. If the file turns
# out not to be valid UTF-8, let the loader detect the encoding instead of
# failing with a UnicodeDecodeError.
loader = TextLoader("speech.txt", encoding="utf-8", autodetect_encoding=True)
data = loader.load()
data

[Document(page_content='“I have a dream that one day down in Alabama, with its vicious racists, with its governor having his lips dripping with the words of interposition and nullification – one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers.\n\nI have a dream today.\n\nI have a dream that one day every valley shall be exalted, and every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed and all flesh shall see it together.\n\nThis is our hope. This is the faith that I go back to the South with. With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to s

In [2]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse a key that is already
# exported in the environment; only when none is set, ask for it interactively
# so the secret is not written to the file or echoed in the output.
# (Assigning "" unconditionally would wipe out a key configured outside the
# notebook and leave the OpenAI calls further down without credentials.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


In [7]:
## Web-based loader: fetch a blog post and keep only the article markup
from langchain_community.document_loaders import WebBaseLoader
import bs4

### Load the HTML page; only elements carrying the listed CSS classes are parsed

loader = WebBaseLoader(
    web_path=("https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)

# Rebinds `data` to the documents extracted from the web page.
data = loader.load()
data

[Document(page_content='\n\n      Adversarial Attacks on LLMs\n    \nDate: October 25, 2023  |  Estimated Reading Time: 33 min  |  Author: Lilian Weng\n\n\nThe use of large language models in the real world has strongly accelerated by the launch of ChatGPT. We (including my team at OpenAI, shoutout to them) have invested a lot of effort to build default safe behavior into the model during the alignment process (e.g. via RLHF). However, adversarial attacks or jailbreak prompts could potentially trigger the model to output something undesired.\nA large body of ground work on adversarial attacks is on images, and differently it operates in the continuous, high-dimensional space. Attacks for discrete data like text have been considered to be a lot more challenging, due to lack of direct gradient signals. My past post on Controllable Text Generation is quite relevant to this topic, as attacking LLMs is essentially to control the model to output a certain type of (unsafe) content.\nThere is 

In [10]:
## Reading from a PDF

from langchain_community.document_loaders import PyPDFLoader

# Path is relative to the notebook's working directory.
pdf_path = "./Satyadwyoom_Kumar_CV.pdf"

# `data` is rebound once more; from here on it holds the PDF's documents,
# which is what the splitting cell consumes.
loader = PyPDFLoader(file_path=pdf_path)
data = loader.load()
data

[Document(page_content='(+91)-8588038790\nGurugram, India\nsatya29m3@gmail.comSatyadwyoom KumarSatyadwyoom.com\ngithub/satyadwyoom\nlinkedin/satyadwyoom\nEducation\nUniversity of Delhi (NSIT) Aug 2018 - May 2022\nB.E., Electronics and Communication Engineering\nCVPSK Scholar (Awarded to Top 10 Students)\nThesis: Introducing temporally consistent weather conditions in aerial videos using LSTM & Cycle-GAN.\nCoursework: Computer Programming, Data Structures and Algorithms, Pattern Recognition, Image Processing.\nIndustry Experience\nOYO Rooms Jul 2022 - Present\nSenior Data Scientist — Dynaminc Pricing, Stuck Classification, EDA\n- Engineered a statistical model for dynamic pricing, leading to reduction in operation costs by $28K/month.\n- Developed method employs a cost/impact-based sorting algorithm on moving averages of accepted prices, and price\nbucketed escalations to predict optimal prices at minimal overhead cost.\n- Developed and deployed an XGBoost classifier (f1 score: 88%) on 

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into chunks sized by `chunk_size`, letting
# consecutive chunks share `chunk_overlap` characters so context is not lost at
# the chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
document = text_splitter.split_documents(data)
document

[Document(page_content='(+91)-8588038790\nGurugram, India\nsatya29m3@gmail.comSatyadwyoom KumarSatyadwyoom.com\ngithub/satyadwyoom\nlinkedin/satyadwyoom\nEducation\nUniversity of Delhi (NSIT) Aug 2018 - May 2022\nB.E., Electronics and Communication Engineering\nCVPSK Scholar (Awarded to Top 10 Students)\nThesis: Introducing temporally consistent weather conditions in aerial videos using LSTM & Cycle-GAN.\nCoursework: Computer Programming, Data Structures and Algorithms, Pattern Recognition, Image Processing.\nIndustry Experience\nOYO Rooms Jul 2022 - Present\nSenior Data Scientist — Dynaminc Pricing, Stuck Classification, EDA\n- Engineered a statistical model for dynamic pricing, leading to reduction in operation costs by $28K/month.\n- Developed method employs a cost/impact-based sorting algorithm on moving averages of accepted prices, and price\nbucketed escalations to predict optimal prices at minimal overhead cost.', metadata={'source': './Satyadwyoom_Kumar_CV.pdf', 'page': 0}),
 D

In [13]:
### Vector embedding and vector store
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every chunk with OpenAI embeddings and index the vectors in Chroma.
# OpenAIEmbeddings() is constructed with no arguments, so it relies on
# OPENAI_API_KEY being available in the environment.
db = Chroma.from_documents(documents=document, embedding=OpenAIEmbeddings())

In [18]:
### Query the Chroma vector database
query = "search about NLP"

# Retrieve the chunks most similar to the query, then show the text of the
# top-ranked one.
result = db.similarity_search(query=query)
result[0].page_content

'Zeroth-Order Optimization. arXiv , 2024 [Preprint]\nProjects \nPersonal WhatsApp Message Responder — ChatGPT, LLMs\n- Utilised Selenium to fetch and send messages from/to WhatsApp chats.\n- Further introduced the ability to fetch messages from a particular person in a group chat.\n- Extracted messages are then used to generate a response using GPT-3.5.\nNLP Tasks - using Transformers — ChatGPT-2, BERT, SVM, Random Forest, LSTMs\n- Fine-tuned BERT leading to an improvement of 7% on sentiment analysis task for airline tweets.\n- Implemented transfer learning on GPT-2 to tackle text-entailment problem.\n- Fine-tuned Distil-BERT on SQuAD dataset for Question/Answering task (f1 score: 88%).\n- Engineered and deployed a machine learning-based Reddit post flair detection web app on Heroku.\n- Fetched 1500+ unique Reddit posts for a variety of flairs appearing on r/india using PRAW API.\n- Employed preprocessing techniques: Stemming/Lemmatization to bring word tokens to their root form.'

In [20]:
## FAISS vector database
from langchain_community.vectorstores import FAISS

# Index the same chunks in FAISS (this embeds the documents again with a fresh
# OpenAIEmbeddings instance), then run the same query against it.
db1 = FAISS.from_documents(documents=document, embedding=OpenAIEmbeddings())

query = "search about NLP"
result = db1.similarity_search(query=query)

# Text of the top-ranked chunk.
result[0].page_content

'Zeroth-Order Optimization. arXiv , 2024 [Preprint]\nProjects \nPersonal WhatsApp Message Responder — ChatGPT, LLMs\n- Utilised Selenium to fetch and send messages from/to WhatsApp chats.\n- Further introduced the ability to fetch messages from a particular person in a group chat.\n- Extracted messages are then used to generate a response using GPT-3.5.\nNLP Tasks - using Transformers — ChatGPT-2, BERT, SVM, Random Forest, LSTMs\n- Fine-tuned BERT leading to an improvement of 7% on sentiment analysis task for airline tweets.\n- Implemented transfer learning on GPT-2 to tackle text-entailment problem.\n- Fine-tuned Distil-BERT on SQuAD dataset for Question/Answering task (f1 score: 88%).\n- Engineered and deployed a machine learning-based Reddit post flair detection web app on Heroku.\n- Fetched 1500+ unique Reddit posts for a variety of flairs appearing on r/india using PRAW API.\n- Employed preprocessing techniques: Stemming/Lemmatization to bring word tokens to their root form.'