In [None]:
!pip install faiss-cpu qdrant_client



In [None]:
import os
import re
import string

import faiss
import numpy as np
import pandas as pd
import qdrant_client
import spacy
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer

In [None]:
# spaCy pipeline shared by the lemmatization and NER cells below
nlp = spacy.load(name="en_core_web_sm")

# Course dataset (replace with actual dataset path)
df = pd.read_csv(filepath_or_buffer="udemy_course_data.csv")

In [None]:
# Data Cleaning and Normalization
def clean_text(text):
    """Normalise a raw title: lowercase, strip punctuation, collapse whitespace.

    Args:
        text: The raw string. Non-string values (e.g. ``None`` or the float
            ``NaN`` that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned string, with every character in ``string.punctuation``
        removed, runs of whitespace collapsed to a single space, and
        leading/trailing whitespace trimmed. Returns ``""`` for missing input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # parsed as an escaped "]" and the backslash itself is never removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Overwrite each course title with its cleaned form (element-wise)
df['course_title'] = df['course_title'].map(clean_text)

In [None]:
# Tokenization and Lemmatization
def lemmatize_text(text):
    """Return the lemmas of the non-stop-word tokens in ``text``, space-joined.

    ``text`` is processed with the notebook-level spaCy pipeline ``nlp``.
    Tokens flagged ``is_stop`` are dropped; every remaining token contributes
    its ``lemma_``.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Overwrite each course title with its lemmatized, stop-word-free form
df['course_title'] = df['course_title'].map(lemmatize_text)

In [None]:
# Named Entity Recognition (NER)
def extract_entities(text):
    """Return a ``{label: entity_text}`` dict for the entities spaCy finds in ``text``.

    ``text`` is processed with the notebook-level spaCy pipeline ``nlp``.
    Only one entity is kept per label: when several entities share a label,
    the one appearing last in ``doc.ents`` overwrites the earlier ones.
    """
    entities = {}
    for ent in nlp(text).ents:
        entities[ent.label_] = ent.text
    return entities

# Store the per-title entity dict in a new column.
# NOTE(review): 'course_title' has already been lowercased, stripped of punctuation
# and lemmatized by the earlier cells, which presumably hurts NER recall — confirm
# whether entities should be extracted from the raw titles instead.
df['entities'] = df['course_title'].map(extract_entities)

In [None]:
# Vectorization using Sentence Transformers
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all titles in a single batched call rather than one call per row;
# `encode` accepts a list of sentences and returns one row per sentence.
title_embeddings = model.encode(df['course_title'].tolist())

# Keep one 1-D vector per row so later cells can still read df['embedding'][i].
df['embedding'] = list(title_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Indexing using FAISS
# Stack the per-row vectors into one contiguous float32 matrix (one row per course),
# which is the layout FAISS works with; the dtype is made explicit instead of inferred.
embedding_matrix = np.ascontiguousarray(df['embedding'].tolist(), dtype=np.float32)
# Dimension of vectors, taken from the matrix so it does not rely on index label 0 existing.
d = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(d)  # exact (brute-force) L2 index
index.add(embedding_matrix)

In [None]:
# Connect to the Qdrant cluster.
# The API key is read from the QDRANT_API_KEY environment variable so the secret is never
# stored in the notebook; a KeyError here means the variable is not set.
# The key that used to be hard-coded in this cell has been exposed and should be rotated.
# QDRANT_URL may override the cluster endpoint; it defaults to the original cluster.
# NOTE: this variable shadows the imported `qdrant_client` module; the name is kept
# because the following cells refer to the client by it.
qdrant_client = QdrantClient(
    url=os.environ.get(
        "QDRANT_URL",
        "https://fb41c67a-8ffc-4c04-82d3-aea14e0eedbb.europe-west3-0.gcp.cloud.qdrant.io:6333",
    ),
    api_key=os.environ["QDRANT_API_KEY"],
)

In [None]:
# Vector Database Integration with Qdrant
# Create the collection only when it does not exist yet, so re-running this cell
# (or the whole notebook) does not fail on an already-existing collection.
if not qdrant_client.collection_exists(collection_name="course_embeddings"):
    qdrant_client.create_collection(collection_name="course_embeddings", vectors_config={"size": d, "distance": "Cosine"})

In [None]:
# Insert vectors into Qdrant
# Build one point per row: the id is the row position, the payload carries the title.
# Iterating the columns directly avoids label-based lookups (df[col][i]), and the
# vector is converted to a plain list of floats explicitly rather than relying on
# the client model to coerce a NumPy array.
points = [
    PointStruct(id=point_id, vector=np.asarray(embedding).tolist(), payload={"title": title})
    for point_id, (embedding, title) in enumerate(zip(df['embedding'], df['course_title']))
]
qdrant_client.upsert(collection_name="course_embeddings", points=points)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Preview the first five rows, including the derived 'entities' and 'embedding' columns
df.head(n=5)

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,profit,published_date,published_time,year,month,day,entities,embedding
0,1070968,ultimate investment banking course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5 hours,2017-01-18T20:58:58Z,Business Finance,429400,2017-01-18,20:58:58Z,2017,1,18,{},"[0.0661822, -0.06267198, -0.121345945, -0.0049..."
1,1113822,complete gst course certification grow practice,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39 hours,2017-03-09T16:34:20Z,Business Finance,209400,2017-03-09,16:34:20Z,2017,3,9,{},"[-0.024903452, 0.045616332, 0.047555085, -0.00..."
2,1006314,financial modeling business analyst consultant,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5 hours,2016-12-19T19:26:30Z,Business Finance,97830,2016-12-19,19:26:30Z,2016,12,19,{},"[0.026175423, -0.037050836, -0.12168076, 0.032..."
3,1210588,beginner pro financial analysis excel 2017,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3 hours,2017-05-30T20:07:24Z,Business Finance,232845,2017-05-30,20:07:24Z,2017,5,30,{'DATE': '2017'},"[-0.01206798, 0.055876434, -0.11219524, -0.017..."
4,1011058,maximize profit trading option,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2 hours,2016-12-13T14:57:18Z,Business Finance,255200,2016-12-13,14:57:18Z,2016,12,13,{},"[-0.012674744, 0.05066099, -0.05734592, -0.065..."


In [None]:
# Sanity check: the FAISS index must hold exactly one vector per DataFrame row.
assert index.ntotal == len(df), "FAISS index size does not match the number of rows"
print(index.ntotal)

3683


In [None]:
# Preview the first few points without dumping every component of each embedding.
for point in points[:5]:
    print(point.id, point.payload, f"vector_dim={len(point.vector)}")

[PointStruct(id=0, vector=[0.0661822035908699, -0.06267198175191879, -0.12134594470262527, -0.004902704153209925, 0.007751732133328915, -0.06655764579772949, 0.04749322682619095, -0.006305358372628689, -0.032962024211883545, 0.0295303612947464, -0.022445427253842354, 0.03813745453953743, -0.05092768743634224, -0.00980379804968834, -0.031041229143738747, -0.0222789216786623, -0.007082563359290361, -0.06029173359274864, 0.04509580507874489, 0.0009567324887029827, 0.01285659708082676, -0.041956827044487, -0.0020261253230273724, -0.057641107589006424, 0.019938919693231583, 0.01777578704059124, 0.006987770553678274, 0.027636146172881126, -0.03075806424021721, -0.058831967413425446, -0.0077907550148665905, 0.03795338794589043, -0.0055187763646245, -0.004869609139859676, 0.03692697733640671, -0.002579933498054743, 0.04151371866464615, 0.016155609861016273, 0.06747237592935562, -0.08366535604000092, -0.07615744322538376, -0.009757866151630878, 0.006495179608464241, -0.028793493285775185, 0.051