# Installing Dependencies

In [1]:
!pip install chromadb
!pip install sentence-transformers
!pip install google-generativeai
!pip install google-generativeai --upgrade
!pip install python-docx
!pip install fpdf2
!pip install pandas

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.14.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.30.0-py3

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import os

In [None]:
# Sanity check: print the installed version of each core dependency.
# Building a SentenceTransformer() with no model name does not verify anything
# useful, so the package version is reported instead.
import sentence_transformers  # needed only to read the package version

print(chromadb.__version__)
print(sentence_transformers.__version__)
print(genai.__version__)

In [None]:
import getpass
# Prompt for the API key and store it in this process's environment under
# GOOGLE_API_KEY, so the key itself is not hard-coded in the notebook.
os.environ['GOOGLE_API_KEY'] = getpass.getpass('Enter your API key: ')

In [None]:
# Configure the Gemini API client with the key stored in GOOGLE_API_KEY
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY'))

# gemini_model is the shared handle used for every generate_content call below.
gemini_model = genai.GenerativeModel(model_name='gemini-1.5-flash')
# Smoke test: send a trivial prompt to confirm the key and model name work.
response = gemini_model.generate_content('Hi Gemini')

print(response.text)

# Setting Up the Vector Database from Google Sheets

In [None]:
import pandas as pd
import json

In [None]:
sheet_id = "1-s4G6vPs5gh5GjCHSNC8Ds4fEoTEyaLP718U5yuyPaw"  # Replace with your actual Sheet ID
csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv"

# Read the sheet's CSV export into a pandas DataFrame.
df = pd.read_csv(csv_url)

# Preview the first rows. `head` has to be *called*: `print(df.head)` prints the
# bound method object rather than a short preview of the data.
df.head()

In [None]:
# Serialize the DataFrame to a JSON string with one object per row
# (orient='records'), then print the full string for inspection.
# NOTE(review): this prints every row of the sheet — consider clearing the
# cell output before sharing the notebook.

json_data = df.to_json(orient='records')
print(json_data)

In [None]:
# Parse the JSON string built from the sheet back into Python objects.
# NOTE(review): presumably this round trip is what turns empty cells into None,
# which the metadata clean-up in the population cell then replaces — confirm.
data = json.loads(json_data)  # list of per-row dictionaries (orient='records')

In [None]:
# Open (or create) the on-disk ChromaDB store in ./my_chroma_db.
client = chromadb.PersistentClient(path="./my_chroma_db")

# Start from a clean slate: drop any previous "work_history" collection.
try:
    client.delete_collection(name="work_history")
except ValueError:
    # Nothing to delete: the collection does not exist yet.
    print("Collection doesn't exist, creating a new one.")
else:
    print("Existing collection deleted.")

# (Re)create the collection that will hold the embedded work-history rows.
collection = client.get_or_create_collection(name="work_history")

In [None]:
# Embedding model; its encode() method is used for both the stored
# descriptions and the search queries in the cells that follow.
model = SentenceTransformer('all-mpnet-base-v2')  # Initialize Sentence Transformer model

In [None]:
# Generate embeddings and add data to ChromaDB.
#
# Every sheet row becomes one vector: its 'Description' text is embedded and
# the remaining columns are stored as metadata, so query hits can be displayed
# and sorted later on.
texts = []
metadatas = []
ids = []

for i, entry in enumerate(data):
    description = entry.get('Description')
    # A row without description text cannot be embedded; skip it instead of
    # letting model.encode() fail on a None entry for the whole batch.
    if not isinstance(description, str) or not description.strip():
        print(f"Skipping row {i}: no description to embed.")
        continue

    metadata = {
        'section': entry['Section'],
        'start_date': entry['Start Date'],
        'end_date': entry['End Date'],
        'organization': entry['Organization'],
        'position': entry['Position'],
        'project_name': entry.get('Project Name'),  # Get Project Name or None
        'description': description,  # Keep the text so hits can show it
        'id': str(i)  # Row index as a string; doubles as the record id
    }

    # Replace None values with a placeholder so every metadata field has a value.
    cleaned_metadata = {
        key: ("N/A" if value is None else value)
        for key, value in metadata.items()
    }

    texts.append(description)
    metadatas.append(cleaned_metadata)
    ids.append(str(i))  # ids keep the original row index even when rows are skipped

if ids:
    embeddings = model.encode(texts)
    collection.add(
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )
    print("Vector database created and populated successfully!")
else:
    # Nothing to embed (empty sheet or no descriptions): avoid an empty add().
    print("No rows with a description were found; nothing was added to the collection.")

In [None]:
# Verify that the collection was populated by looking at a few stored records.
# NOTE(review): the printed result may include the raw embedding vectors,
# which makes for a long output — confirm and trim if needed.
results = collection.peek(5)  # Peek at the first 5 entries
print(results)

# Querying the Database

In [None]:
# Embed a sample question and fetch the three closest stored experiences.
query = "Work experience related to Data Analytics"
query_embedding = model.encode(query)

results = collection.query(query_embeddings=[query_embedding], n_results=3)

# Only one query was sent, so its hits are the first metadata list.
relevant_experiences = results['metadatas'][0]

separator = "-" * 20  # visual divider printed around each hit
print("Top Results for Data Analytics:")
for experience in relevant_experiences:
    print(separator)
    for field, field_value in experience.items():
        print(f"{field.capitalize()}: {field_value}")
    print(separator)

# Job Description Refinement

In [None]:
# Ask for the job posting text; it is interpolated into the extraction prompts below.
job_description = input("Enter the job description: ")

In [None]:
# Prompt 1: ask Gemini to read the job description and return the company,
# position, responsibilities, team and culture as a fenced JSON object.
# The doubled braces in the template are f-string escapes for literal { and }.

prompt = f"""
Carefully analyze the following job description and extract the key information related to:

    1. Key Responsibilities: List the main responsibilities of the role.
    2. Team Name: Identify the name of the team or department (if specified).
    3. Cultural Requirements: Describe any cultural values, requirements, or expectations mentioned in the job. Ignore any information related to recruitment practices like equity and inclusion, only look for company-specific cultural requirements.
    4. Name of the Company: Identify the name of the company or organization.

    Job Description:
    {job_description}

    Provide the extracted information in a structured JSON format like this:

    ```json
    {{
      "company name": "Company Name",
      "position": "Position Title",
      "responsibilities": "List of responsibilities",
      "team": "Team name",
      "culture": "Description of cultural requirements"
    }}
    ```
"""

# Send the prompt and show the raw reply text.
response = gemini_model.generate_content(prompt)
print(response.text)

In [None]:
def extract_job_info_with_gemini(job_description, llm=None):
    """Ask the language model to summarise a job posting as a JSON object.

    The prompt requests a fenced ```json block containing the keys
    "company name", "position", "responsibilities", "team" and "culture",
    which is the shape the JSON-extraction step of this notebook looks for.
    (The previous template had no braces and no ``json`` tag, so a reply that
    followed it literally could not be parsed.)

    Args:
        job_description: Full text of the job posting.
        llm: Optional object exposing ``generate_content(prompt)`` whose result
            has a ``.text`` attribute. Defaults to the notebook-level
            ``gemini_model``.

    Returns:
        The raw text of the model's reply.
    """
    if llm is None:
        llm = gemini_model

    # Doubled braces are f-string escapes that render as literal { and }.
    prompt = f"""
    Carefully analyze the following job description and extract the key information related to:

    1. Key Responsibilities: List the main responsibilities of the role.
    2. Team Name: Identify the name of the team or department (if specified).
    3. Cultural Requirements: Describe any cultural values, requirements, or expectations mentioned in the job. Ignore any information related to recruitment practices like equity and inclusion, only look for company-specific cultural requirements.
    4. Name of the Company: Identify the name of the company or organization.
    5. Position Title: Identify the title of the role being advertised.

    Job Description:
    {job_description}

    Provide the extracted information in a structured JSON format like this:

    ```json
    {{
      "company name": "Company Name",
      "position": "Position Title",
      "responsibilities": "List of responsibilities",
      "team": "Team name",
      "culture": "Description of cultural requirements"
    }}
    ```
    """

    response = llm.generate_content(prompt)

    return response.text

In [None]:
import re
import json

def extract_json_block(text):
    """Extract the JSON payload embedded in a model reply.

    Strategies, tried in order:
      1. the contents of the first ```json fenced block;
      2. the first substring that parses as a complete JSON object, so that
         nested braces are kept intact;
      3. as a last resort, everything from the first '{' to the last '}'.

    Args:
        text: Raw reply text; may be None or empty.

    Returns:
        The extracted JSON text as a string, or None when nothing JSON-like
        is present (a warning and the text are printed in that case).
    """
    if text:
        # Non-greedy match: with several fenced blocks in the reply, only the
        # first one is returned instead of everything up to the last fence.
        fenced = re.search(r"```json\s*\n(.*?)\n\s*```", text, re.DOTALL)
        if fenced:
            return fenced.group(1)

        # Let the JSON decoder find where an object really ends; a regex that
        # stops at the first '}' truncates objects containing nested braces.
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                _, end = decoder.raw_decode(text, start)
            except ValueError:  # not valid JSON from this brace; try the next
                start = text.find("{", start + 1)
            else:
                return text[start:end]

        # Nothing parsed cleanly: fall back to the widest brace-delimited span
        # so the caller still gets something to inspect.
        first, last = text.find("{"), text.rfind("}")
        if first != -1 and last > first:
            return text[first:last + 1]

    print("Warning: No JSON block found in the text.")
    print("Text:", text)  # Print for debugging
    return None

# Run the extraction on the entered job description and print the JSON text
# pulled out of the reply. text_with_json is reused by the parsing cell below.
text_with_json = extract_job_info_with_gemini(job_description)
print(extract_json_block(text_with_json))

# Finding Relevant Experiences to the Job Description

In [None]:
# Parse the model's reply into a dictionary describing the job.
job_info_text = extract_json_block(text_with_json)
if job_info_text is None:
    # extract_json_block returns None when the reply held no JSON; fail here
    # with a clear message rather than an opaque TypeError from json.loads(None).
    raise ValueError("No JSON block could be extracted from the model response.")
job_info = json.loads(job_info_text)
print(job_info)

In [None]:
# Show the extracted responsibilities; they drive the retrieval queries below.
print(job_info['responsibilities'])

In [None]:
# Turn the extracted responsibilities into a list of individual tasks.
responsibilities = job_info['responsibilities']

if isinstance(responsibilities, list):
    # Already one entry per responsibility.
    tasks = responsibilities
elif isinstance(responsibilities, str):
    # A single block of text: treat each line as one task.
    tasks = responsibilities.split('\n')
else:
    # Any other type (e.g. None) yields no tasks.
    tasks = []

In [None]:
# Collect the closest stored experiences for every responsibility.
relevant_experiences = []
for task in tasks:
    cleaned_task = task.strip()
    if not cleaned_task:
        continue  # blank entry: nothing to search for
    query_embedding = model.encode(cleaned_task)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=3,  # top 3 matches per task (adjust as needed)
    )
    # One query per call, so the hits are the first metadata list.
    relevant_experiences.extend(results['metadatas'][0])

# De-duplicate on the stored 'id'; a later hit with the same id replaces the earlier one.
relevant_experiences = list({exp['id']: exp for exp in relevant_experiences}.values())

# Order by the 'start_date' value, descending.
relevant_experiences = sorted(
    relevant_experiences,
    key=lambda exp: exp['start_date'],
    reverse=True,
)

print("Relevant Experiences:", relevant_experiences)

# Writing a Cover Letter

In [None]:
import datetime

In [None]:
def _ordinal_day(day):
    """Return the day of the month with its English ordinal suffix (1st, 22nd, 13th)."""
    if 11 <= day % 100 <= 13:
        suffix = "th"  # 11th, 12th, 13th are irregular
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")
    return f"{day}{suffix}"


def generate_cover_letter(job_info, relevant_experiences, llm=None, today=None,
                          applicant_name="Rohan Srivastava"):
    """Build the cover-letter prompt and return the model's letter text.

    Args:
        job_info: Extracted job details; interpolated into the prompt as-is.
        relevant_experiences: Iterable of dicts providing the keys 'position',
            'organization', 'start_date', 'end_date' and 'description'.
        llm: Optional object exposing ``generate_content(prompt)`` whose result
            has a ``.text`` attribute. Defaults to the notebook-level
            ``gemini_model``.
        today: Optional ``datetime.date`` used for the letter date. Defaults
            to the current date.
        applicant_name: Name used in the sign-off.

    Returns:
        The text of the generated cover letter.

    Raises:
        KeyError: If an experience dict lacks one of the expected keys.
    """
    if llm is None:
        llm = gemini_model
    if today is None:
        today = datetime.date.today()

    # "%dth" produced dates such as "March 01th" or "March 22th"; build the
    # day with the correct ordinal suffix and without zero padding instead.
    date_formatted = f"{today.strftime('%B')} {_ordinal_day(today.day)}, {today.year}"

    # Note: everything inside the f-string is sent to the model, so it must not
    # contain code comments.
    prompt = f"""
    Write a cover letter with the following structure and format:

    Dear Hiring Manager,

    Paragraph 1: Express strong interest in the position. Give an overview of my background (70 words).

    Paragraph 2: Highlight relevant experience (150 words).

    Paragraph 3: Relevant experience, different project (150 words).

    Paragraph 4: Cultural Fit. Conclusion and gratitude for consideration (50 words).

    Best Regards,
    {applicant_name}

    Date: {date_formatted}

    Job Description:
    {job_info}

    Relevant Work Experience:
    """

    # One bullet per retrieved experience.
    for experience in relevant_experiences:
        prompt += f"""
        * {experience['position']} at {experience['organization']} ({experience['start_date']} - {experience['end_date']}): {experience['description']}
        """

    prompt += """

    Ensure the letter is professional, tailored to the specific job, and highlights the applicant's relevant skills and experience. Adhere to the specified paragraph structure and word counts as closely as possible.
    """

    response = llm.generate_content(prompt)

    return response.text

In [None]:
# Generate the letter from the parsed job info and the retrieved experiences,
# then print it for review before it is written to a file.
cover_letter = generate_cover_letter(job_info, relevant_experiences)
print(cover_letter)

# Output Text and PDF Files

In [None]:
import re
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, Inches

# Build a Word document from the generated cover letter and save it as
# "<company>-<position>-cover-letter.docx" in the working directory.
# Layout assumption (unchanged): first line = date, second line = salutation,
# last two lines = closing and signature, everything in between = body.

def _add_letter_paragraph(document, text, alignment):
    """Append `text` as one paragraph in 11 pt Times New Roman with the given alignment."""
    paragraph = document.add_paragraph()
    run = paragraph.add_run(text)
    run.font.name = 'Times New Roman'
    run.font.size = Pt(11)
    paragraph.alignment = alignment
    return paragraph


def _filename_part(value):
    """Turn a job_info value into a filesystem-safe, hyphen-separated token."""
    # Anything that is not a word character, dot or hyphen (spaces, slashes,
    # colons, ...) becomes a hyphen so the result is a valid file name.
    token = re.sub(r'[^\w.-]+', '-', str(value or 'unknown')).strip('-')
    return token or 'unknown'


# Collapse runs of blank lines (kept as a reassignment, as before).
cover_letter = re.sub(r'\n\s*\n', '\n', cover_letter)

# Drop empty lines and surrounding whitespace so a leading or trailing newline
# in the model output cannot shift which line is read as date/salutation/closing.
parts = [line.strip() for line in cover_letter.split("\n") if line.strip()]
if not parts:
    raise ValueError("The cover letter is empty; nothing to write.")

document = Document()

# Date (left-aligned)
date_paragraph = _add_letter_paragraph(document, parts[0], WD_ALIGN_PARAGRAPH.LEFT)

# Salutation (left-aligned), when present
if len(parts) > 1:
    salutation_paragraph = _add_letter_paragraph(document, parts[1], WD_ALIGN_PARAGRAPH.LEFT)

# Only treat the last two lines as the closing when they do not overlap the
# date/salutation lines, i.e. when there are at least four lines in total.
has_closing = len(parts) >= 4
main_body_paragraphs = parts[2:-2] if has_closing else parts[2:]

# Main body (justified)
for paragraph_text in main_body_paragraphs:
    _add_letter_paragraph(document, paragraph_text, WD_ALIGN_PARAGRAPH.JUSTIFY)

# Closing and signature (left-aligned, on two lines of one paragraph)
if has_closing:
    closing_text = "\n".join(parts[-2:])
    closing_paragraph = _add_letter_paragraph(document, closing_text, WD_ALIGN_PARAGRAPH.LEFT)

# The name parts come from model output, so sanitise them before using them
# in a path; a missing key falls back to "unknown" instead of raising.
company_name = _filename_part(job_info.get('company name'))
position = _filename_part(job_info.get('position'))
filename = f"{company_name}-{position}-cover-letter.docx"
document.save(filename)