# Loading Documents

----

In [None]:
!pip install -qU langchain langchain-community langchain-openai langchain-text-splitters


In [None]:
from langchain_community.document_loaders import TextLoader

# Load the news article from disk through LangChain's plain-text loader.
loader = TextLoader(file_path="nvda_news_1.txt")
data = loader.load()

# Peek at the metadata attached to the first loaded document.
first_doc = data[0]
first_doc.metadata

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load movies.csv; the "title" column is passed as the loader's source column.
loader = CSVLoader(file_path="movies.csv", source_column="title")
csv_data = loader.load()

# Number of documents produced by the loader.
len(csv_data)

In [None]:
# Inspect the metadata of the document at index 3 of the CSV load.
fourth_doc = csv_data[3]
fourth_doc.metadata

In [None]:
# Install unstructured for Colab (no magic-bin needed)
!pip install -q "unstructured[local-inference]" langchain-community

# Install system dependencies for PDF parsing
!apt-get update -qq
!apt-get install -qq poppler-utils tesseract-ocr

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# News articles to fetch; the list is handed to the loader unchanged.
article_urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]

url_loader = UnstructuredURLLoader(urls=article_urls)

In [None]:
# Fetch and parse the pages configured on `url_loader`.
url_data = url_loader.load()

# Only the last expression of a cell is echoed, so print the document count
# explicitly instead of letting it be silently discarded.
print(len(url_data))

# Metadata of the first loaded page.
url_data[0].metadata

# Text Splitting
----

In [None]:
# Sample multi-paragraph passage (paragraphs separated by blank lines) used as
# input for the text-splitting demo.
text = """Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg.
Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.

Interstellar premiered in Los Angeles on October 26, 2014. In the United States, it was first released on film stock, expanding to venues using digital projectors. The film received generally positive reviews from critics and grossed over $677 million worldwide ($715 million after subsequent re-releases), making it the tenth-highest-grossing film of 2014.
It has been praised by astronomers for its scientific accuracy and portrayal of theoretical astrophysics.[5][6][7] Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades."""

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators are listed from coarsest (blank line) to finest (single space);
# chunks are capped at chunk_size=100 with no overlap between them.
splitter_options = {
    "separators": ["\n\n", "\n", " "],
    "chunk_size": 100,
    "chunk_overlap": 0,
}
recur_split = RecursiveCharacterTextSplitter(**splitter_options)

# Split the sample passage and show the resulting list of chunks.
chunks = recur_split.split_text(text)
print(chunks)

# Vector Database Transform

-----

In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

In [None]:
import pandas as pd

# Load the sample sentences that will be embedded and indexed.
df = pd.read_csv("sample_text.csv")
# Show up to 100 characters per column when the frame is rendered.
pd.set_option("display.max_colwidth", 100)

# A bare `df.shape` in the middle of a cell is never echoed, so print it;
# the frame itself is displayed as the cell's final expression.
print(df.shape)
df

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the corpus and the search query.
encoder = SentenceTransformer("all-mpnet-base-v2")

# Hand the encoder a plain list of strings rather than a pandas Series:
# integer indexing on a Series is label-based, so a Series only behaves like
# a list while the frame keeps its default 0..n-1 index.
vectors = encoder.encode(df["text"].tolist())
vectors.shape

In [None]:
# Second axis of the embedding matrix; passed to the FAISS index constructor
# as the vector dimensionality. (The former bare `vectors` expression was
# never displayed because it was not the cell's last expression.)
dim = vectors.shape[1]
dim

In [None]:
import faiss

# Flat L2-distance index sized for `dim`-dimensional vectors.
index = faiss.IndexFlatL2(dim)

# The index starts out empty; add the corpus embeddings so that later
# searches have vectors to match (an empty index only returns -1 ids).
index.add(vectors)
index

# Applying a Search Query
----

In [None]:
# Free-text query, embedded with the same encoder as the corpus.
search = "I want to buy a pant"
vec = encoder.encode(search)

# Shape of the query embedding.
vec.shape

In [None]:
import numpy as np

# Turn the query embedding into a single-row 2-D array for the index search.
query_arr = np.array(vec)
search_vec = query_arr.reshape(1, -1)
search_vec.shape

In [None]:
# Ask the index for the 2 closest stored vectors to the query row.
search_result = index.search(search_vec, k=2)
distances, indexes = search_result

# Row positions of the matches within the indexed vectors.
indexes