### Installing & Importing Necessary Modules 

In [1]:
# %pip install sentence_transformers

In [1]:
import pandas as pd
import os 

from pymongo import MongoClient
from datetime import datetime, timedelta

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
client = MongoClient(f"mongodb+srv://{os.getenv('mongo_user')}:{os.getenv('mongo_password')}@cluster0.y3ugmg6.mongodb.net/")
db = client["job_listings"]
collection = db["naukri_jobs"]


# Define today's date string in 'YYYY-MM-DD' format
today_str = datetime.utcnow().strftime("%Y-%m-%d")

# Query to match documents where 'uploaded_at' contains today's date
query = {
    "uploaded_at": {
        "$regex": f"^{today_str}"
    }
}

documents = list(collection.find(query, {"_id": 0}))


df=pd.DataFrame(documents)
columns_to_keep = ['job_id', 'title', 'description', 'locations', 'keywords', 'company_name', 'experience', 'salary','rating','review_count']
df = df[columns_to_keep]

##### Info about Data

In [3]:
total_jobs = df['salary'].count()
salary_details = df['salary'].str.lower().eq('not disclosed').sum()
disclosed_jobs = total_jobs - salary_details
print(f"Total Jobs: {total_jobs}")
print(f"Salary not disclosed jobs: {salary_details}")
print(f"Salary disclosed jobs: {disclosed_jobs}")

Total Jobs: 28423
Salary not disclosed jobs: 24907
Salary disclosed jobs: 3516


In [4]:
import torch
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("thenlper/gte-base").to("cuda")

  from tqdm.autonotebook import tqdm, trange


In [6]:

# print(torch.version.cuda)
# print(torch.__version__)
# print(torch.cuda.is_available())

In [5]:
df['locations'] = df['locations'].apply(lambda x: ' '.join(map(str, x)))
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(map(str, x)))


In [6]:
df['combined_data'] = "The name of the company is "+df['company_name']+" The job role is "+df['title']+ " Technologies needed are or techstack is "+df['keywords']+" Locations are "+df['locations']+" Experience needed is "+df['experience']+" Salary range is "+df['salary']+" Rating is "+df['rating']+" and total ratings are "+str(df['review_count'])+" Job description is "+df['description']


In [7]:
def get_embedding(text):
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()

In [8]:
from tqdm import tqdm
tqdm.pandas()

In [9]:
df['embedding'] = df['combined_data'].progress_apply(get_embedding)

100%|███████████████████████████████████████████████| 28423/28423 [11:44<00:00, 40.33it/s]


768

0    [-0.0013958535855636, -0.007160264067351818, 0...
1    [0.0020967063028365374, -0.01717951148748398, ...
Name: embedding, dtype: object

#### Pincone 

170724006274

In [21]:
# subset=df.head(2)
# vectors = [
#     {
#         'id': str(row['job_id']),
#         'values': row['embedding'],
#         'metadata': {'description': row['combined_data']}
#     }
#     for _, row in subset.iterrows()
# ]

# print(vectors)

In [22]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv('pinecone_api'))
index = pc.Index("intellijobs")
vectors = [
    {
        'id': str(row['job_id']),
        'values': row['embedding'],
        'metadata': {'description': row['combined_data']}
    }
    for _, row in df.iterrows()
]





batch_size = 43

# Upsert vectors in batches
for i in tqdm(range(0, len(vectors), batch_size), desc="Upserting vectors"):
    batch = vectors[i:i + batch_size]  # Extract batch
    index.upsert(batch)  # Upsert the batch of vectors

print("Upsert complete")





    
# for vec in tqdm(vectors, desc="Upserting vectors"):
#     print(vec)
#     index.upsert([vec])

Upserting vectors: 100%|████████████████████████████████| 661/661 [07:10<00:00,  1.54it/s]

Upsert complete





In [None]:
# pc.create_index(
#     name="intellijobs",
#     dimension=768, 
#     metric="cosine", 
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     ) 
# )

#### MongoDB 

In [None]:
# import pymongo

# def get_mongo_client(mongo_uri):
#     try:
#         client = pymongo.MongoClient(mongo_uri)
#         print("Connection to MongoDB successful")
#         return client
#     except pymongo.errors.ConnectionFailure as e:
#         print(f"Connection failed: {e}")
#         return None
    
# mongo_uri = "mongodb+srv://simmu:f1bVUh2akKNHES5V@intellijobs.2cyltry.mongodb.net/"
# mongo_client = get_mongo_client(mongo_uri)

# db = mongo_client["job_listings"]
# collection = db["naukri_vectorized_simmu"]

Connection to MongoDB successful


In [None]:
# documents = df.to_dict("records")
# collection.insert_many(documents)
# print("Data uploaded.")

Data uploaded.
