In [1]:
import pandas as pd
import re

# Read the sampled job postings into the working DataFrame
df = pd.read_csv(filepath_or_buffer='sampled_jobs.csv')

# Function to clean text
# Compiled once instead of on every call. DOTALL lets `.` match newlines so a
# tag whose attributes wrap onto the next line is still removed.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tags(text):
    """Strip HTML tags from ``text``.

    Args:
        text: The raw value of a cell. Anything that is not a ``str``
            (e.g. ``None`` or the float NaN pandas uses for missing values)
            is treated as missing.

    Returns:
        ``text`` with every ``<...>`` span removed, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError here; an empty string lets callers
        # filter the row out as "no description".
        return ''
    return _HTML_TAG_RE.sub('', text)

# Clean description column. Missing values are turned into '' first so the
# cleaner only ever receives strings and those rows are removed by the filter
# below instead of raising inside re.sub.
df['description'] = df['description'].fillna('').astype(str).apply(remove_html_tags)

# Drop rows with empty descriptions. reset_index returns a fresh frame (so the
# column assignment below does not hit SettingWithCopyWarning) and makes row
# labels equal row positions, which keeps them aligned with any array that is
# later built from this frame in row order.
has_text = df['description'].str.strip().astype(bool)
df = df[has_text].reset_index(drop=True)

# Concatenate job_title with cleaned description. A missing/blank title would
# otherwise turn the whole 'Docs' value into NaN, so fall back to the
# description alone for those rows.
job_title = df['job_title'].fillna('').astype(str)
df['Docs'] = (job_title + ': ' + df['description']).where(
    job_title.str.strip().astype(bool), df['description']
)
df = df.drop(columns=['job_title', 'description', 'career_level'])

# Example of saving cleaned data to a new CSV
df.to_csv('cleaned_sampled_jobs.csv', index=False)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the job documents
model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(model_name)

# Encode every 'Docs' entry; the result is kept as a tensor
embeddings = model.encode(
    sentences=df['Docs'].tolist(),
    convert_to_tensor=True,
)


  from tqdm.autonotebook import tqdm, trange





In [None]:
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import DenseVector, Document, Float, Index, Keyword, Object, Text

# Example Elasticsearch setup.
# The URL is passed explicitly: the 8.x Python client refuses to construct a
# client without `hosts`/`cloud_id`, while 7.x accepts the same URL string.
es = Elasticsearch('http://localhost:9200')

# Create an Elasticsearch index
index_name = 'jobs_index'
# Output size of sentence-transformers/all-mpnet-base-v2
EMBEDDING_DIMS = 768
if not es.indices.exists(index=index_name):
    # Without an explicit mapping Elasticsearch would dynamically map the
    # vectors as plain float arrays, which cannot be searched with kNN.
    # NOTE(review): 'index'/'similarity' on dense_vector assume an
    # Elasticsearch 8.x server - confirm against the deployed version.
    es.indices.create(
        index=index_name,
        body={
            'mappings': {
                'properties': {
                    'title': {'type': 'text'},
                    'description': {'type': 'text'},
                    'embeddings': {
                        'type': 'dense_vector',
                        'dims': EMBEDDING_DIMS,
                        'index': True,
                        'similarity': 'cosine',
                    },
                }
            }
        },
    )

# Define Elasticsearch document mapping
class JobDocument(Document):
    """Elasticsearch document for one job posting.

    Fields:
        title: Job title, full-text searchable.
        description: Cleaned job description, full-text searchable.
        embeddings: Sentence embedding of the posting as a list of floats.
    """

    title = Text()
    description = Text()
    # A vector is a list of floats, not a nested JSON object: an Object field
    # tries to call .to_dict() on every element when the document is
    # serialised. 768 is the output size of all-mpnet-base-v2.
    embeddings = DenseVector(dims=768)

    class Index:
        name = index_name

# Function to bulk index documents into Elasticsearch
def bulk_index_documents(df, embeddings):
    """Index every row of ``df`` together with its embedding vector.

    Args:
        df: DataFrame with either ``job_title`` and ``description`` columns,
            or a combined ``Docs`` column formatted as
            ``"<title>: <description>"``.
        embeddings: Sequence of vectors in the same row order as ``df``;
            each item must provide ``.tolist()``.

    Raises:
        ValueError: If ``df`` and ``embeddings`` differ in length.
    """
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)} vectors"
        )

    has_source_columns = 'job_title' in df.columns and 'description' in df.columns

    actions = []
    # `position` (not the row label) selects the vector: labels stop matching
    # positions as soon as rows have been filtered out of the frame.
    for position, (label, row) in enumerate(df.iterrows()):
        if has_source_columns:
            title, description = row['job_title'], row['description']
        else:
            # Only the combined text is available; split it back at the first
            # ': '. A title that itself contains ': ' is cut at that point.
            title, _, description = str(row['Docs']).partition(': ')

        doc = JobDocument(
            # Use the row label as the document id so a search hit can be
            # mapped back to its DataFrame row, and so re-running the cell
            # overwrites documents instead of duplicating them.
            meta={'id': str(label)},
            title=title,
            description=description,
            embeddings=embeddings[position].tolist(),
        )
        actions.append(doc.to_dict(include_meta=True))

    bulk(es, actions)

# Push all job documents and their vectors to Elasticsearch
bulk_index_documents(df=df, embeddings=embeddings)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Phi-3 - Mini 4k Instruct model and tokenizer.
# Bound to `llm` / `llm_model_name` so the names `model` / `model_name` keep
# pointing at the sentence-embedding model, whose `.encode` is still needed
# for retrieval.
llm_model_name = 'microsoft/Phi-3-mini-4k-instruct'
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm = AutoModelForCausalLM.from_pretrained(llm_model_name)

# Function to generate advice
def generate_advice(query):
    """Generate text from ``query`` with the Phi-3 causal language model.

    Args:
        query: Prompt text. It is truncated to at most 512 tokens.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. For a causal LM the sequence returned by ``generate`` starts
        with the prompt tokens, so the prompt is part of the returned text.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is forwarded to generate() below.
    inputs = tokenizer(query, return_tensors='pt', max_length=512, truncation=True)
    inputs = inputs.to(llm.device)
    outputs = llm.generate(
        **inputs,
        # max_length counts the prompt too; with prompts of up to 512 tokens a
        # 500-token total would leave no room to generate anything.
        max_new_tokens=500,
        num_return_sequences=1,
        # temperature only has an effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
user_query = "How to become a Machine Learning Engineer?"
advice = generate_advice(user_query)
print(advice)


In [None]:
def _retrieved_text(hit):
    """Return the document text for one Elasticsearch hit.

    Prefers the 'Docs' entry of the DataFrame row whose label equals the hit
    id; falls back to the title/description stored in the hit itself when the
    id is not an integer row label.
    """
    try:
        label = int(hit['_id'])
    except (TypeError, ValueError):
        label = None  # e.g. an auto-generated Elasticsearch id

    if label is not None:
        matches = df.loc[df.index == label, 'Docs']
        if len(matches):
            return matches.values[0]

    source = hit.get('_source', {})
    return f"{source.get('title', '')}: {source.get('description', '')}"


def rag_system(user_query, k=3):
    """Retrieve the ``k`` most similar job documents and generate advice.

    Args:
        user_query: The user's question.
        k: Number of nearest documents to retrieve (default 3).

    Returns:
        A list with one generated advice string per retrieved document.

    Raises:
        RuntimeError: If the global ``model`` has no ``encode`` method, i.e.
            it is not the sentence-embedding model.
    """
    if not hasattr(model, 'encode'):
        raise RuntimeError(
            "rag_system needs `model` to be the sentence-embedding model "
            "(an object providing .encode)"
        )

    # Retrieve relevant documents from Elasticsearch.
    # Encoding a single string returns one 1-D array, so .tolist() is the flat
    # list of floats the kNN search expects.
    query_vector = model.encode(user_query).tolist()
    query = {
        # Elasticsearch kNN search option.
        # NOTE(review): assumes an 8.x server with 'embeddings' mapped as an
        # indexed dense_vector - confirm.
        "knn": {
            "field": "embeddings",
            "query_vector": query_vector,
            "k": k,
            "num_candidates": max(10 * k, 100),
        }
    }
    search_results = es.search(index=index_name, body=query)['hits']['hits']

    # Generate personalized advice
    recommendations = []
    for hit in search_results:
        # The question comes first so it survives prompt truncation, and the
        # retrieved posting is supplied as context for the answer.
        prompt = (
            f"Question: {user_query}\n\n"
            f"Relevant job posting:\n{_retrieved_text(hit)}\n\n"
            "Career advice based on this posting:"
        )
        recommendations.append(generate_advice(prompt))

    return recommendations

# Run the RAG pipeline once on a sample question and list what comes back
user_query = "How to become a Machine Learning Engineer?"
recommendations = rag_system(user_query)
for idx in range(len(recommendations)):
    advice = recommendations[idx]
    print(f"Recommendation {idx + 1}: {advice}")


In [None]:
def evaluate_rag_system(queries):
    """Run each query through ``rag_system`` and collect the outputs.

    Args:
        queries: Iterable of query strings.

    Returns:
        A list of dicts, one per query in input order, each with the keys
        ``'query'`` and ``'recommendations'``.
    """
    return [
        {'query': query, 'recommendations': rag_system(query)}
        for query in queries
    ]

# Sample queries used to exercise the RAG pipeline end to end
test_queries = [
    "How to become a Data Scientist?",
    "Skills needed for a Software Engineer role",
    "Career advice for aspiring Project Managers",
]

# Print every query followed by its numbered recommendations and a blank line
evaluation_results = evaluate_rag_system(test_queries)
for result in evaluation_results:
    print(f"Query: {result['query']}")
    idx = 0
    while idx < len(result['recommendations']):
        advice = result['recommendations'][idx]
        print(f"Recommendation {idx + 1}: {advice}")
        idx += 1
    print()
