In [None]:
!pip install -U pymilvus unsloth --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

: 

In [None]:
!pip install datasets "pymilvus[model]" langchain_milvus langchain_openai langchain_community langchain_huggingface pandas sentence-transformers openai

In [None]:
!pip install gradio fitz PyMuPDF python-dotenv

### Imports

In [None]:
import os
import dotenv
import pandas as pd
from openai import OpenAI
from datasets import load_dataset
from pymilvus import MilvusClient
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.vectorstores import Milvus
from langchain_milvus import Milvus, Zilliz
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

In [None]:
def load_env_vars(dotenv_path='security_keys.env', key_name='OPENAIAPI_KEY'):
    """Load variables from a .env file and return the OpenAI API key.

    Args:
        dotenv_path: Path of the .env file passed to ``dotenv.load_dotenv``.
            Defaults to ``'security_keys.env'``.
        key_name: Name of the environment variable holding the key.
            Defaults to ``'OPENAIAPI_KEY'``.

    Returns:
        The value of ``key_name`` in the process environment, or ``None``
        when that variable is not set.
    """
    dotenv.load_dotenv(dotenv_path=dotenv_path)
    return os.getenv(key_name)

# Read the OpenAI key once; the value is None when the variable is not set.
openai_api_key = load_env_vars()

# Load the Hugging Face LinkedIn job postings dataset.
# NOTE(review): `ds` is not referenced again in the visible cells; only `df`
# (read straight from postings.csv) is used downstream. Confirm whether the
# load_dataset call is still needed, since it fetches the data a second time.
ds = load_dataset("datastax/linkedin_job_listings")
df = pd.read_csv("hf://datasets/datastax/linkedin_job_listings/postings.csv")

In [None]:
# Preview the first rows of the raw postings table to sanity-check the load.
df.head()

In [None]:
def preprocessing_dataframe(df):
  """Drop sparse columns and build the normalized text used for embedding.

  Args:
      df: Raw postings frame. It must contain the columns 'title',
          'company_name', 'description', 'location' and
          'formatted_experience_level'. The frame is not modified.

  Returns:
      A new DataFrame holding the columns of ``df`` whose share of missing
      values is at most 70%, plus a 'text_to_embed' column: the five text
      fields joined by single spaces, lower-cased, with every character that
      is not a letter, digit or whitespace removed.
  """
  text_columns = ['title', 'company_name', 'description', 'location',
                  'formatted_experience_level']

  # Calculate the percentage of missing values in each column
  missing_percent = df.isnull().mean() * 100

  # Drop columns where more than 70% of the values are NaN. The explicit copy
  # makes the result an independent frame, so adding a column below does not
  # write into a slice of the caller's data.
  df_cleaned = df.loc[:, missing_percent <= 70].copy()

  # Build the text from the input frame rather than the filtered one: a text
  # column that is itself mostly empty is dropped above, and must not turn
  # into a KeyError here. astype(str) keeps the join safe for non-string cells.
  combined = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

  # Clean the text (lowercasing and removing special characters)
  df_cleaned['text_to_embed'] = combined.str.lower().str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
  return df_cleaned

# Build the cleaned frame (sparse columns dropped, 'text_to_embed' added) from the raw postings.
df_cleaned = preprocessing_dataframe(df)

In [None]:
# Work on the first 1000 postings only. `.copy()` detaches the subset from the
# full frame so that columns added in later cells are not written into a slice.
df_cleaned = df_cleaned.iloc[:1000].copy()

In [None]:
def generate_embeddings(df_cleaned, model_name='all-MiniLM-L6-v2', milvus_uri="jobposting_demo.db"):
  """Embed the prepared posting text and open a Milvus client.

  Args:
      df_cleaned: Frame with a 'text_to_embed' column of strings. It is not
          modified; the embeddings are added to a copy.
      model_name: Name handed to ``SentenceTransformer``.
      milvus_uri: URI handed to ``MilvusClient``.

  Returns:
      A tuple ``(model, milvus_client, df_embedded)`` where ``df_embedded`` is
      a copy of ``df_cleaned`` with an extra 'embeddings' column holding one
      list of floats per row.
  """
  # Load embedding model
  model = SentenceTransformer(model_name)

  # Copy first so the caller's frame (possibly a slice of a larger one) is
  # never written to.
  df_embedded = df_cleaned.copy()

  # Encode all texts in a single call so the model can batch them, instead of
  # invoking it once per row.
  vectors = model.encode(df_embedded['text_to_embed'].tolist())
  df_embedded['embeddings'] = pd.Series(vectors.tolist(), index=df_embedded.index)

  # Connect to Milvus client given URI
  milvus_client = MilvusClient(uri=milvus_uri)
  return model, milvus_client, df_embedded

# Rebind df_cleaned to the returned frame, which carries the 'embeddings' column;
# keep the model and the Milvus client for the cells below.
model, milvus_client, df_cleaned = generate_embeddings(df_cleaned)

In [None]:
def create_milvus_schema_and_collection(milvus_client, collection_name = "job_postings_linkedin", dim=384, metric_type="COSINE"):
  """(Re)create the job-postings collection with an explicit schema and index.

  Any existing collection called ``collection_name`` is dropped first, so
  calling this function discards previously inserted data.

  Args:
      milvus_client: Client exposing ``has_collection``, ``drop_collection``,
          ``prepare_index_params``, ``create_collection`` and
          ``list_collections``.
      collection_name: Name of the collection to create.
      dim: Dimension of the 'embedding' vector field.
      metric_type: Metric registered on the vector index.

  Returns:
      A tuple ``(schema, collection_check)``: the schema that was used and the
      result of ``has_collection`` for ``collection_name`` after creation.
  """
  # Define the schema for the job postings collection. The vector dimension
  # comes from the `dim` argument, not from a notebook-level DataFrame, so the
  # function does not depend on hidden global state.
  fields = [
      FieldSchema(name="job_id", dtype=DataType.INT64, is_primary=True),
      FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=255),
      FieldSchema(name="location", dtype=DataType.VARCHAR, max_length=255),
      FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
  ]

  schema = CollectionSchema(fields, description="Job Postings Embeddings")

  # Start from a clean collection on every run.
  if milvus_client.has_collection(collection_name=collection_name):
      milvus_client.drop_collection(collection_name=collection_name)

  milvus_params = milvus_client.prepare_index_params()
  milvus_params.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type=metric_type)

  # The explicit schema already names the primary key (supplied by the caller
  # at insert time), the vector field and its dimension, so only the schema
  # and the index parameters are passed here.
  milvus_client.create_collection(
      collection_name=collection_name,
      schema=schema,
      index_params=milvus_params,
  )

  print(milvus_client.list_collections())

  # Report whether the collection is now visible to the client.
  collection_check = milvus_client.has_collection(collection_name=collection_name)
  return schema, collection_check

# (Re)create the collection; the arguments repeat the function's defaults.
# Any existing collection with this name is dropped first.
create_milvus_schema_and_collection(milvus_client, collection_name = "job_postings_linkedin", dim=384, metric_type="COSINE")

In [None]:
def insert_data_into_milvus(milvus_client, df_cleaned, collection_name="job_postings_linkedin"):
  """Insert the embedded postings into a Milvus collection.

  Args:
      milvus_client: Client exposing ``insert(collection_name=..., data=...)``.
      df_cleaned: Frame with 'job_id', 'title', 'location' and 'embeddings'
          columns. 'job_id' is cast to int; 'title' and 'location' to str.
      collection_name: Target collection. Previously this was read from an
          undefined global, so every call failed with a hidden NameError.

  Returns:
      ``True`` on success. On failure the error is printed and the string
      "Error in pushing data to milvus collection" is returned (kept for
      backward compatibility; note that this string is truthy).
  """
  insert_data = [
      {"job_id": jid, "title": title, "location": loc, "embedding": emb}
      for jid, title, loc, emb in zip(
          df_cleaned['job_id'].astype(int).tolist(),
          df_cleaned['title'].astype(str).tolist(),
          df_cleaned['location'].astype(str).tolist(),
          df_cleaned['embeddings'].tolist(),
      )
  ]

  # Insert data into Milvus
  try:
    milvus_client.insert(collection_name=collection_name, data=insert_data)
  except Exception as exc:
    # Surface the real cause instead of swallowing it; a bare `except:` would
    # also trap KeyboardInterrupt and hide programming errors.
    print(f"Milvus insert failed: {exc!r}")
    return "Error in pushing data to milvus collection"

  print(f"Inserted {len(df_cleaned)} records into Milvus.")
  return True

# Push the embedded postings into the collection. The function reports failure
# through its return value rather than by raising, so check this cell's output.
insert_data_into_milvus(milvus_client, df_cleaned)

In [None]:
def search_jobs(milvus_client, resume_text, k=5, collection_name="job_postings_linkedin", embedding_model=None, metric_type=None):
  """Return the postings whose embeddings are closest to a resume.

  Args:
      milvus_client: Client exposing ``search``.
      resume_text: Resume text to embed and use as the query vector.
      k: Maximum number of hits to return.
      collection_name: Collection to query. Previously this was read from an
          undefined global, which raised NameError.
      embedding_model: Object with ``encode(text)`` returning something with
          ``.tolist()``. When ``None`` the notebook-level ``model`` is used.
      metric_type: Optional metric to send with the search. When omitted no
          metric is sent; this is expected to let Milvus fall back to the
          metric of the collection's index (confirm against the Milvus docs).

  Returns:
      Whatever ``milvus_client.search`` returns for the single query vector;
      'job_id', 'title' and 'location' are requested as output fields.
  """
  encoder = embedding_model if embedding_model is not None else model
  resume_embedding = encoder.encode(resume_text).tolist()

  # Only send a metric when the caller asked for one; an empty string is not
  # a valid metric name and a wrong one would conflict with the index.
  search_params = {"params": {}}
  if metric_type:
    search_params["metric_type"] = metric_type

  return milvus_client.search(
      collection_name=collection_name,
      data=[resume_embedding],
      search_params=search_params,
      limit=k,
      output_fields=["job_id", "title", "location"],
  )

### LLM Response

In [None]:
def generate_job_explanations(milvus_client, resume_text, k=5):
    """Print an LLM-written rationale for each posting matched to a resume.

    Looks up matches with ``search_jobs``, then asks a GPT-4 chat model (key
    taken from the ``OPENAIAPI_KEY`` environment variable) why each matched
    role suits the candidate. For every hit of the first query result, two
    lines are printed: the job id / title / location, and the explanation.

    Args:
        milvus_client: Client forwarded to ``search_jobs``.
        resume_text: Resume text used both for the search and in the prompt.
        k: Number of postings requested from ``search_jobs``.

    Returns:
        None. The results are only printed.
    """
    search_hits = search_jobs(milvus_client, resume_text, k)

    # Arguments are evaluated left to right, so the chat model is still built
    # before the prompt template.
    chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", api_key=os.getenv('OPENAIAPI_KEY')),
        prompt=PromptTemplate(
            input_variables=["resume", "title", "location"],
            template=(
                "Given the candidate's resume: {resume}, "
                "explain why the role {title} in {location} "
                "is a good fit for them."
            ),
        ),
    )

    # One query vector was sent, so only the first result list is relevant.
    for hit in search_hits[0]:
        entity = hit['entity']
        title, location = entity['title'], entity['location']
        explanation = chain.run(resume=resume_text, title=title, location=location)
        print(f"Job ID: {entity['job_id']}, Title: {title}, Location: {location}")
        print(f"Explanation: {explanation}\n")

# Input resume text and run the pipeline
# Prompt for the resume text (this blocks until the user answers), then print
# explanations for up to 5 matching postings.
resume_text = input("Enter your resume text: ")
generate_job_explanations(milvus_client, resume_text, k=5)