In [13]:
!pip install -q transformers
!pip install -q pymupdf
!pip install -q streamlit


In [14]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
import streamlit as st
# Replace this path with the actual path to your file in Drive
file_path = "/content/drive/MyDrive/jd_optimization_dataset.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Show the first few rows to confirm
df.head()

Unnamed: 0,Job_Title,JD_Before,JD_After
0,Marketing Specialist,We are looking for a Marketing Specialist skil...,We are looking for a Marketing Specialist skil...
1,Full Stack Engineer,We are looking for a Full Stack Engineer skill...,We are looking for a Full Stack Engineer skill...
2,HR Manager,We are looking for a HR Manager skilled in Emp...,We are looking for a HR Manager skilled in Emp...
3,AI Engineer,We are looking for a AI Engineer skilled in ML...,We are looking for a AI Engineer skilled in ML...
4,HR Manager,We are looking for a HR Manager skilled in Onb...,We are looking for a HR Manager skilled in Onb...


In [17]:
!pip install -q transformers

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
import torch

In [18]:
# Step 2: Build Prompts from Dataset
def build_prompt(row):
    return f"Improve and expand the following job description:\n\n{row['JD_Before']}"

# Create a new column for prompts
df["Prompt"] = df.apply(build_prompt, axis=1)

# Show a few prompts to verify
df[["Job_Title", "Prompt"]].head()


Unnamed: 0,Job_Title,Prompt
0,Marketing Specialist,Improve and expand the following job descripti...
1,Full Stack Engineer,Improve and expand the following job descripti...
2,HR Manager,Improve and expand the following job descripti...
3,AI Engineer,Improve and expand the following job descripti...
4,HR Manager,Improve and expand the following job descripti...


In [19]:
# Step 3: Load Primary and Fallback Models

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM

# Primary: Flan-T5
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Fallback: Falcon
falcon_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-rw-1b")
falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [20]:
# Step 4: Generate Job Descriptions using Flan-T5 with Falcon Fallback
def generate_description(prompt):
    try:
        # Try with Flan-T5
        inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        outputs = flan_model.generate(**inputs, max_new_tokens=300)
        return flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Flan-T5 failed: {e}")
        try:
            # Fallback to Falcon
            inputs = falcon_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            outputs = falcon_model.generate(**inputs, max_new_tokens=300)
            return falcon_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Falcon also failed: {e}")
            return "Generation Failed"

# Apply generation on first few rows for now (to test)
df["Generated_JD"] = df["Prompt"].apply(generate_description)

# Show output
df[["Job_Title", "Generated_JD"]].head()


Unnamed: 0,Job_Title,Generated_JD
0,Marketing Specialist,We are looking for a Marketing Specialist skil...
1,Full Stack Engineer,We are looking for a Full Stack Engineer skill...
2,HR Manager,We are looking for a HR Manager skilled in Emp...
3,AI Engineer,We are looking for an AI Engineer skilled in M...
4,HR Manager,We are looking for a HR Manager skilled in Onb...


In [21]:
# Step 5: Save the Generated Job Descriptions to a New CSV File

output_path = "/content/drive/MyDrive/final_generated_job_descriptions.csv"
df[["Job_Title", "Generated_JD"]].to_csv(output_path, index=False)

print(f"✅ Final output saved to: {output_path}")


✅ Final output saved to: /content/drive/MyDrive/final_generated_job_descriptions.csv


In [22]:
# Final batch generation for all rows using the same function
df["Generated_JD"] = df["Prompt"].apply(generate_description)

# Save full output to a new CSV file
final_output_path = "/content/drive/MyDrive/full_generated_job_descriptions.csv"
df[["Job_Title", "Generated_JD"]].to_csv(final_output_path, index=False)

print(f"✅ Full dataset job descriptions saved to: {final_output_path}")


✅ Full dataset job descriptions saved to: /content/drive/MyDrive/full_generated_job_descriptions.csv


In [23]:
# Step 6: Template Formatting + Save to CSV

def format_jd(job_title, generated_text):
    formatted = (
        f"Job Title: {job_title}\n\n"
        f"{generated_text.strip()}\n\n"
        f"Apply Now: Interested candidates are encouraged to apply with their updated resume."
    )
    return formatted

# Apply formatting
df["Formatted_JD"] = df.apply(lambda row: format_jd(row["Job_Title"], row["Generated_JD"]), axis=1)

# Save the formatted descriptions to CSV
formatted_output_path = "/content/drive/MyDrive/formatted_generated_job_descriptions.csv"
df[["Job_Title", "Formatted_JD"]].to_csv(formatted_output_path, index=False)

print(f"✅ Formatted job descriptions saved to: {formatted_output_path}")


✅ Formatted job descriptions saved to: /content/drive/MyDrive/formatted_generated_job_descriptions.csv


In [24]:
import os
import fitz  # PyMuPDF

def extract_resume_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def generate_jd_from_resume(pdf_path):
    resume_text = extract_resume_text(pdf_path)
    prompt = f"Based on the following resume, generate a suitable job description:\n\n{resume_text}"
    return generate_description(prompt)


In [25]:
resume_folder = "/content/drive/MyDrive/resumes/it_resumes"
results = []

for file in os.listdir(resume_folder):
    if file.endswith(".pdf"):
        full_path = os.path.join(resume_folder, file)
        jd = generate_jd_from_resume(full_path)
        results.append({"Resume_File": file, "Generated_JD": jd})
import pandas as pd

results_df = pd.DataFrame(results)
results_path = "/content/drive/MyDrive/resume_based_generated_jds.csv"
results_df.to_csv(results_path, index=False)

print(f"✅ JD generation for resumes completed and saved to: {results_path}")

results_df.head(15)  # Show top 15 resume JDs


✅ JD generation for resumes completed and saved to: /content/drive/MyDrive/resume_based_generated_jds.csv


Unnamed: 0,Resume_File,Generated_JD
0,15118506.pdf,a b c d e f i m a b c d e f i m a b c d e f i...
1,13405733.pdf,"Director of Information Technology , 11/2012 t..."
2,14789139.pdf,DIRECTOR OF INFORMATION TECHNOLOGY
3,13477922.pdf,Information Technology Specialist
4,13836471.pdf,"Microsoft Exchange Server 2010 Administration,..."
5,13385306.pdf,- a developer and a project manager
6,12763627.pdf,DEBTS and LEDGER are the projects that UBS wil...
7,12334140.pdf,Write a letter to the employer. Include a desc...
8,12045067.pdf,Project Engineer
9,12635195.pdf,Educational Technology


In [28]:
import pandas as pd
import streamlit as st

st.set_page_config(layout="wide")
st.title("📄 Resume-based Job Description Generator")

# Load CSV
df = pd.read_csv("/content/drive/MyDrive/resume_based_generated_jds.csv")

# Sidebar search
search_term = st.sidebar.text_input("🔍 Search by keyword in JD")

# Filter results
if search_term:
    filtered_df = df[df["Generated_JD"].str.contains(search_term, case=False, na=False)]
else:
    filtered_df = df

# Show results
for i, row in filtered_df.iterrows():
    st.subheader(f"📎 Resume File: {row['Resume_File']}")
    st.code(row['Generated_JD'])
    st.markdown("---")


2025-05-28 11:48:38.319 Session state does not function when running a script without `streamlit run`


In [37]:
%%writefile streamlit_app.py
import pandas as pd
import streamlit as st

st.set_page_config(layout="wide")
st.title("📄 Resume-based Job Description Generator")

# Load CSV
df = pd.read_csv("/content/drive/MyDrive/resume_based_generated_jds.csv")


# Sidebar search
search_term = st.sidebar.text_input("🔍 Search by keyword in JD")

# Filter results
if search_term:
    filtered_df = df[df["Generated_JD"].str.contains(search_term, case=False, na=False)]
else:
    filtered_df = df

# Show results
for i, row in filtered_df.iterrows():
    st.subheader(f"📎 Resume File: {row['Resume_File']}")
    st.code(row['Generated_JD'])
    st.markdown("---")


Overwriting streamlit_app.py


In [38]:
!npm install -g localtunnel
!streamlit run streamlit_app.py &>/content/logs.txt &
!npx localtunnel --port 8501 --subdomain resumejdgen > /dev/null 2>&1 &



[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K
changed 22 packages in 936ms
[1G[0K⠦[1G[0K
[1G[0K⠦[1G[0K3 packages are looking for funding
[1G[0K⠦[1G[0K  run `npm fund` for details
[1G[0K⠦[1G[0K

In [39]:
!echo "Your Streamlit app is live at: https://resumejdgen.loca.lt"


Your Streamlit app is live at: https://resumejdgen.loca.lt


In [40]:
!curl https://loca.lt/mytunnelpassword


34.125.30.134