In [81]:
# !pip install langchain_community

In [82]:
# Document loaders are maintained in langchain_community; the old
# langchain.document_loaders path is only a deprecated re-export.
from langchain_community.document_loaders import TextLoader

In [83]:
# Colab-only step: mount Google Drive at /content/drive/ so the
# /content/drive/MyDrive/... dataset paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [84]:
# Read nvda_news_1.txt from Drive with TextLoader; the result of load() is
# indexable (the following cell inspects data[0]).
loader = TextLoader('/content/drive/MyDrive/datasets/nvda_news_1.txt')
data = loader.load()
# Display what was loaded.
data



In [85]:
# Metadata of the first loaded document (in the recorded run: its source path).
data[0].metadata

{'source': '/content/drive/MyDrive/datasets/nvda_news_1.txt'}

In [86]:
# Import from langchain_community: the langchain.document_loaders path is a
# deprecated re-export of the same loader.
from langchain_community.document_loaders.csv_loader import CSVLoader

# Load movies.csv; source_column="title" makes each row's title the `source`
# metadata of its document (see the metadata cell further down).
loader = CSVLoader('/content/drive/MyDrive/datasets/movies.csv', source_column="title")
data = loader.load()
# Number of documents produced from the CSV.
len(data)

9

In [87]:
# Text of the first CSV document: the row rendered as "column: value" lines.
data[0].page_content

'movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR'

In [88]:
# Metadata of the first CSV document (recorded run: source = the row's title, plus the row number).
data[0].metadata

{'source': 'K.G.F: Chapter 2', 'row': 0}

In [89]:
# Document loaders are maintained in langchain_community; the old
# langchain.document_loaders path is only a deprecated re-export.
from langchain_community.document_loaders import UnstructuredURLLoader

In [90]:
!pip install libmagic python-magic python-magic-bin

Collecting libmagic
  Using cached libmagic-1.0.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement python-magic-bin (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python-magic-bin[0m[31m
[0m

In [91]:
# Install the `unstructured` package — presumably the parser that
# UnstructuredURLLoader relies on; confirm against the LangChain docs.
!pip install unstructured



In [92]:
# Build a URL loader for two NVIDIA-related news pages; nothing is fetched
# until load() is called in the next cell.
loader = UnstructuredURLLoader(urls=[
    "https://www.indiatoday.in/technology/news/story/nvidia-ceo-talks-about-ai-replacing-human-jobs-says-it-will-change-everyones-job-2739496-2025-06-12",
    "https://www.moneycontrol.com/news/tags/nvidia.html"
])

In [93]:
# Run the loader and report how many documents came back
# (2 in the recorded run, matching the two URLs).
data = loader.load()
len(data)

2

In [94]:
# Show the first document returned by the loader.
data[0]



In [95]:
# Sample three-paragraph passage (paragraphs separated by blank lines) used as
# the input for the text-splitter cells.
text = """Susan Morrow owns an upscale Los Angeles art gallery. Her current show involves a presentation of obese majorettes dancing vigorously while naked. Susan receives a proof of a novel written by her estranged ex-husband Edward Sheffield along with an invitation for dinner during Edward's upcoming visit to Los Angeles. Upset by her deteriorating marriage to unfaithful businessman Hutton Morrow, Susan becomes consumed by the novel, which is dedicated to her and named Nocturnal Animals after Edward's nickname for her.

In this novel, Tony Hastings is a family man who runs afoul of three local troublemakers – Ray Marcus, Lou, and Turk – during a road trip through West Texas. After being forced off the road, Tony is powerless to stop Ray and Turk from kidnapping his wife, Laura, and their daughter, India, leaving him with Lou, who forces him to drive Ray's car to the end of a road where he is abandoned. Tony manages to evade Ray and Lou when they return looking for him and makes his way to a nearby farmhouse to call the police.

Detective Roberto "Bobby" Andes is assigned to the case and with Tony, discovers the bodies of Laura and India near an abandoned shack, where they had been raped and murdered. Tony is wracked with guilt. He is contacted by Andes a year later and is asked to identify Lou, who was caught in a botched robbery and is charged as an accomplice in Laura and India's murders."""

In [96]:
from langchain.text_splitter import CharacterTextSplitter

# Cut `text` at single newlines only, aiming for 200-character chunks with no
# overlap. In the recorded run the resulting chunks are all longer than 200
# characters (see the length printout below).
splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0, separator="\n")
chunks = splitter.split_text(text)
# How many chunks were produced.
len(chunks)



3

In [97]:
# Display the chunks produced by the newline-based splitter.
chunks

["Susan Morrow owns an upscale Los Angeles art gallery. Her current show involves a presentation of obese majorettes dancing vigorously while naked. Susan receives a proof of a novel written by her estranged ex-husband Edward Sheffield along with an invitation for dinner during Edward's upcoming visit to Los Angeles. Upset by her deteriorating marriage to unfaithful businessman Hutton Morrow, Susan becomes consumed by the novel, which is dedicated to her and named Nocturnal Animals after Edward's nickname for her.",
 "In this novel, Tony Hastings is a family man who runs afoul of three local troublemakers – Ray Marcus, Lou, and Turk – during a road trip through West Texas. After being forced off the road, Tony is powerless to stop Ray and Turk from kidnapping his wife, Laura, and their daughter, India, leaving him with Lou, who forces him to drive Ray's car to the end of a road where he is abandoned. Tony manages to evade Ray and Lou when they return looking for him and makes his way t

In [98]:
# Print each chunk's character count on its own line.
for chunk in chunks:
    chunk_length = len(chunk)
    print(chunk_length)

517
516
369


In [99]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [100]:
# Recursive splitter with a 200-character target, zero overlap, and a list of
# separators: blank line, period, question mark, newline, space.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    separators=["\n\n", ".", "?", "\n", " "],
)
r_chunks = r_splitter.split_text(text)
# How many chunks were produced.
len(r_chunks)

11

In [101]:
# Character count of every recursively split chunk, one per line.
for chunk in r_chunks:
    n_chars = len(chunk)
    print(n_chars)

145
170
197
3
1
156
200
31
128
174
195


In [80]:
# !pip install faiss-cpu
# !pip install sentence-transformers



In [102]:
import pandas as pd

# Let displayed cells show up to 100 characters before pandas truncates them.
pd.options.display.max_colwidth = 100

In [103]:
# Load the sample sentences from Drive and check the frame's dimensions
# (8 rows x 2 columns in the recorded run).
df = pd.read_csv("/content/drive/MyDrive/datasets/sample_text.csv")
df.shape

(8, 2)

In [104]:
from sentence_transformers import SentenceTransformer

# Embed every row of the `text` column with the all-mpnet-base-v2 model.
# A plain list is passed instead of the pandas Series: encode() is documented
# for str / list inputs, and integer lookups on a Series go by index label,
# which only lines up with row order while the frame keeps its default index.
encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(df["text"].tolist())
# Expect one embedding row per input sentence.
vectors.shape

(8, 768)

In [105]:
# Peek at the embedding matrix (float32 values in the recorded run).
vectors

array([[-0.00247395,  0.03626722, -0.05290459, ..., -0.09152356,
        -0.03970001, -0.04330489],
       [-0.03357267,  0.00980519, -0.03250129, ..., -0.05165466,
         0.02245887, -0.03156182],
       [-0.01865322, -0.04051318, -0.01235387, ...,  0.00610586,
        -0.07179645,  0.02773851],
       ...,
       [-0.00066458,  0.04252127, -0.05645508, ...,  0.0131547 ,
        -0.03183567, -0.04357665],
       [-0.03317153,  0.03252455, -0.02484838, ...,  0.0117442 ,
         0.05747124,  0.00571023],
       [-0.00166395,  0.00413828, -0.04597083, ...,  0.02008527,
         0.05656243, -0.00161596]], dtype=float32)

In [106]:
# Width of one embedding: the last axis of the 2-D `vectors` array.
# It is used to size the FAISS index.
dim = vectors.shape[-1]
dim

768

In [107]:
# Create a FAISS flat L2 (IndexFlatL2) index sized to the embedding width `dim`.
import faiss
index = faiss.IndexFlatL2(dim)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x781781136dc0> >

In [108]:
# Store all sentence embeddings in the index.
index.add(vectors)

In [109]:
# Embed the free-text query with the same encoder used for the corpus;
# a single string yields a 1-D vector ((768,) in the recorded run).
search_query = "I want to buy a color jeans"
vec = encoder.encode(search_query)
vec.shape

(768,)

In [110]:
import numpy as np

# Turn the 1-D query embedding into a single-row 2-D array, the shape passed
# to index.search().
svec = np.reshape(np.array(vec), (1, -1))
svec.shape

(1, 768)

In [111]:
# Ask the index for the k=2 nearest stored vectors to the query. `I` holds the
# matching row numbers (array([[3, 2]]) in the recorded run) and `distances`
# the corresponding distances.
distances, I = index.search(svec, k=2)

In [112]:
# Show the full sample frame so the returned row numbers can be checked by eye.
df

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event
5,Navaratri dandiya program at Expo center in Mumbai this october,Event
6,Exciting vacation destinations for your next trip,Travel
7,Maldives and Srilanka are gaining popularity in terms of low budget vacation places,Travel


In [113]:
# Row numbers of the nearest neighbours returned by the search.
I

array([[3, 2]])

In [114]:
# FAISS reports hits as row positions (the order in which vectors were added),
# not as DataFrame index labels, so select positionally with .iloc. Entries of
# -1 mean "no result for this slot" and are dropped so they cannot alias the
# last row.
df.iloc[I[0][I[0] >= 0]]

Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion
