In [None]:
import pandas as pd

In [None]:
df1 = pd.read_csv("linkedin_jobs.csv")
df1.head(2)


In [None]:
df2 = pd.read_csv("glassdoor_jobs.csv")
df2.head(2)

In [None]:
df = pd.concat((df1, df2), ignore_index=True)
df.head(2)

In [None]:
len(df)

In [None]:
len(df[df['Company'].isin(["eClerx", "Turing", "Webboost Solutions IT Consultants", "WEBBOOST SOLUTION IT SERVICES", "Skillfied Mentor Jobs", "Pixeltalent Lab", "UM IT PRIVATE LIMITED", "SkillFied Mentor", "TELUS Digital AI Data Solutions", "MedTourEasy", "MedTourEasy Gurugram", "MedTourEasy Dwarka", "UM IT Solutions", "TELUS Digital"])])

In [None]:
df = df[~df['Company'].isin(["eClerx", "Turing", "Webboost Solutions IT Consultants", "MedTourEasy Gurugram", "WEBBOOST SOLUTION IT SERVICES", "Skillfied Mentor Jobs", "Pixeltalent Lab", "UM IT PRIVATE LIMITED", "SkillFied Mentor", "TELUS Digital AI Data Solutions", "MedTourEasy", "MedTourEasy Dwarka", "UM IT Solutions", "TELUS Digital"])]
len(df)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
len(df)

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
len(df)

In [None]:
def prompt_builder(resume_text, skills_text, additional_detail, job_description,
                   experience_years=1.3, target_role="Data Analyst",
                   career_transition="Developer to Data"):
    """Build the LLM prompt that rates one job description against the candidate.

    Parameters
    ----------
    resume_text : str
        Text placed under "Candidate Resume:".
    skills_text : str
        Text placed under "Candidate Skills:".
    additional_detail : str
        Text placed under "Additional Candidate Details:".
    job_description : str
        Text placed under "Job Description:".
    experience_years : int | float | str, optional
        Years of professional experience stated in the instructions.
        Defaults to 1.3, which reproduces the original hard-coded wording.
    target_role : str, optional
        Role named in the "Target role" instruction. Defaults to "Data Analyst".
    career_transition : str, optional
        Completes "transitioned from ... roles". Defaults to "Developer to Data".

    Returns
    -------
    str
        The full prompt. It asks the model to answer with a single JSON object
        with the keys "Relevance", "Reason", "Skills Demanded", "Expected Work",
        "Experience Required" and "Experience Year".

    Notes
    -----
    The relevance thresholds in the instructions (<= 1 year, <= 2 years) are
    fixed text and are not derived from ``experience_years``.
    """
    # Literal braces of the JSON template are doubled ({{ }}) because this is an f-string.
    prompt = f"""
You are an expert career advisor. Analyze the candidate's profile ONLY in relation to the Job Description. 
**Do not add, assume, infer, or generalize beyond what is explicitly stated.** 
If information is not found in the Job Description, return "Not mentioned" or -1 where applicable. 

Candidate Resume:
{resume_text}

Candidate Skills:
{skills_text}

Additional Candidate Details:
{additional_detail}

Job Description:
{job_description}

Instructions:
- The candidate has {experience_years} years of professional experience and transitioned from {career_transition} roles.
- Target role: {target_role}.
- Determine job relevance based strictly on candidate skills, experience, and Job Description.
- Classify the job into one of four relevance levels:
    1. High – Strong skill match AND required experience <= 1 year.
    2. Medium – Strong skill match AND required experience > 1 and <= 2 years.
    3. Low – Partial skill match OR required experience > 2 years.
    4. Not Relevant – Very little/no skill match OR requirement completely mismatched.
- Extract **only the key skills explicitly demanded** in the Job Description.
- Summarize **main responsibilities strictly from the Job Description**.
- Identify required experience (in years) strictly from the Job Description.
- If the Job Description does not state required experience, return:
    "Experience Required": "Not mentioned"
    "Experience Year": -1
- Respond ONLY in valid JSON with this exact structure (no extra text, no explanations outside JSON):

{{
  "Relevance": "High/Medium/Low/Not Relevant",
  "Reason": "1–2 sentences explaining relevance based strictly on candidate skills, experience, and JD.",
  "Skills Demanded": ["skill1", "skill2", "..."],
  "Expected Work": "Main work/responsibilities from JD only.",
  "Experience Required": "Exact wording from JD if present, otherwise 'Not mentioned'.",
  "Experience Year": "Numeric years required, or -1 if not mentioned"
}}
"""
    return prompt


In [None]:
import requests
import time
import json
import re

def extract_info(prompt, model="qwen3:8b",
                 url="http://localhost:11434/api/generate", timeout=600):
    """Send a prompt to the local Ollama generate endpoint and return the text reply.

    Parameters
    ----------
    prompt : str
        Prompt text to send.
    model : str, optional
        Model name placed in the request payload. Defaults to "qwen3:8b"
        ("llama3.1:8b" was used previously).
    url : str, optional
        Endpoint that receives the POST request.
    timeout : float or tuple, optional
        Passed to ``requests.post``. Without a timeout a stalled server would
        block the calling loop forever; on expiry the call is reported like any
        other request failure.

    Returns
    -------
    str
        The value of the 'response' key of the JSON reply ('' if the key is absent).
        Failures are NOT raised: they are returned as a string starting with
        "Request error: " or "Other error: ", so callers must be prepared for
        non-JSON text.
    """
    time.sleep(0.5)  # small delay to avoid overwhelming the server
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # ask for a single complete reply instead of a token stream
    }
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        # Ollama returns 'response' key with the text output
        return response.json().get('response', '')
    except requests.exceptions.RequestException as e:
        # Covers connection failures, HTTP error statuses and timeouts.
        return f"Request error: {e}"
    except Exception as e:
        return f"Other error: {e}"


In [None]:
import time
import json
import re

def _empty_result():
    """Return a fresh all-defaults result (a new list each call, so rows never share one)."""
    return {
        "Relevance": "",
        "Reason": "",
        "Skills Demanded": [],
        "Expected Work": "",
        "Experience Required": "",
        "Experience Year": ""
    }


def _loads_lenient(candidate):
    """Decode a JSON string, retrying once with trailing commas before '}' / ']' removed."""
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        repaired = re.sub(r',\s*}', '}', candidate)
        repaired = re.sub(r',\s*]', ']', repaired)
        return json.loads(repaired)


def _scan_for_result_object(text):
    """Return the first decodable JSON object in text that has an expected key, else None."""
    decoder = json.JSONDecoder()
    expected_keys = _empty_result().keys()
    for brace in re.finditer(r'\{', text):
        try:
            candidate, _ = decoder.raw_decode(text, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict) and any(key in candidate for key in expected_keys):
            return candidate
    return None


def parse_response(text):
    """Extract the structured answer from raw model output.

    Parameters
    ----------
    text : str
        Raw text returned by the model. It may contain extra prose or a
        ``<think>...</think>`` section around the JSON object.

    Returns
    -------
    dict
        Always a dict with the keys "Relevance", "Reason", "Skills Demanded",
        "Expected Work", "Experience Required" and "Experience Year".
        Keys missing from the model's JSON, and every key when no JSON can be
        recovered, get their default ("" or [] for "Skills Demanded").

    Notes
    -----
    Never raises: problems are printed and the all-defaults dict is returned.
    Parsing is pure string work, so no delay is applied here; request throttling
    belongs to the code that talks to the server.
    """
    try:
        # Clean the text first
        text = text.strip()

        # Drop any <think>...</think> section so braces inside it cannot be
        # mistaken for the start of the JSON answer.
        body = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

        # Widest span from the first '{' to the last '}'.
        match = re.search(r'\{.*\}', body, re.DOTALL)
        if not match:
            print("❌ No JSON found in model output.")
            return _empty_result()

        try:
            response = _loads_lenient(match.group())
        except json.JSONDecodeError:
            # The widest span was not valid JSON (e.g. extra braces in surrounding
            # prose); look for a decodable object inside the text instead.
            response = _scan_for_result_object(body)
            if response is None:
                raise

        return {key: response.get(key, default) for key, default in _empty_result().items()}

    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}\nRaw text:\n{text}")
        return _empty_result()
    except Exception as e:
        print(f"❌ Other error: {e}")
        return _empty_result()


In [None]:
# Filtering and drop_duplicates above leave gaps in the row labels; rebuild a
# contiguous 0..n-1 index so that label-based access by row number works.
df = df.reset_index(drop=True)

In [None]:
# Candidate resume; passed to prompt_builder as `resume_text` and inserted
# under the "Candidate Resume:" heading of every prompt.
resume_text = """
Professional Development
GeeksforGeeks Data Science/Analysis Course
HackerRank & LeetCode practice
Hands-on projects in Python, SQL, Power BI, DAX, and Data Visualization. Gained experience in data cleaning, analysis, and interpretation across e-commerce, healthcare, automotive, transportation, and finance.

Work Experience
Tata Consultancy Services | Associate System Engineer | 12/2021 - 03/2023
- Designed and maintained PostgreSQL databases with precise schema definitions, constraints, and dynamic schema updates via alter scripts to support evolving requirements.
- Performed comprehensive end-to-end data validation and preprocessing on datasets, collaborating with 5+ stakeholders to achieve 99.9% data accuracy and resolve discrepancies within 24 hours.
- Transitioned data transmission from JSON payloads to file-based formats, enabling batch transfers of 50,000+ records per file and increasing data throughput by 40%, eliminating JSON payload size constraints.
- Enhanced PostgreSQL system by implementing an automated workflow to resend flagged data, reducing manual effort and resend latency by 50%.
- Supported analytics teams by delivering clean, structured datasets that enabled reliable reporting and insights generation.


Education
College | 2016 - 2020
Bachelor of Technology (B.Tech) | Mechanical Engineering | CGPA: 8.52
"""

# Candidate skill list; inserted under the "Candidate Skills:" heading of the prompt.
skills_text = """
Python (Pandas, NumPy, Seaborn, Matplotlib, Plotly), SQL (MySQL, PostgreSQL), Web Scraping (BeautifulSoup, Scrapy, Selenium),
Power BI (Desktop, Service, M-Query, DAX), Tableau, Google Sheets, Advanced Excel,
Data Cleaning, Data Storytelling, Data Modelling, Data Analysis, Data Visualization
"""

# Free-form highlights and achievements; inserted under the
# "Additional Candidate Details:" heading of the prompt.
additional_detail = """
Candidate Highlights:
- 1.3 Years of Work Experience.
- Strong experience as a Data Analyst and Associate System Engineer with end-to-end data handling.
- Expertise in SQL, Python, data cleaning, validation, workflow automation, and structured reporting.
- Experienced in data visualization using Power BI, Tableau, and advanced Excel.
- Worked on projects involving NLP, dashboarding, and large-scale analytics (Amazon Reviews, IPL Data, EV Market Analysis).
- Familiar with web scraping, data pipelines, and API integrations.
- Target Roles: Data Analyst, Business Intelligence, Analytics, or Reporting-focused positions.
- Preferred domains: E-commerce, Finance, Healthcare, Automotive, Transportation, Market Analysis.
- Key strengths: Structured problem-solving, handling large datasets, delivering actionable insights, and automating repetitive tasks.

Gold Badge in Python and SQL on HackerRank
Demonstrated proficiency in Python and SQL programming languages by earning the Gold Badge, indicating a strong understanding of data manipulation, analysis, and problem-solving skills. Expertise in utilizing these languages for data-driven projects, including data cleaning, transformation, and visualization.
Solved all Pandas Problem on LeetCode
Demonstrated proficiency in data manipulation and analysis using Pandas library. Successfully resolved complex data issues, optimized code for efficiency, and generated insightful visualizations to support decision-making. Expertise in data cleaning, wrangling, and feature engineering. Strong analytical and problem-solving skills.
"""


In [None]:
# Per-row storage for the raw model text and the parsed dict.
df["llm_raw"] = None
df["llm_parsed"] = None

# Iterate over the frame's own row labels so the loop stays correct even when
# the index is not a contiguous 0..n-1 range.
for idx in df.index:
    job_description = df.at[idx, "Job Desc."]  # Full text

    # A missing (NaN) or blank description would otherwise be sent to the model
    # as the literal text "nan"; record an empty result and skip the request.
    if not (isinstance(job_description, str) and job_description.strip()):
        df.at[idx, "llm_raw"] = ""
        df.at[idx, "llm_parsed"] = {}
        print(f"⚠️ Job {idx} | skipped: job description is missing")
        continue

    prompt = prompt_builder(resume_text, skills_text, additional_detail, job_description)
    raw = extract_info(prompt)
    parsed = parse_response(raw) if raw else {}

    df.at[idx, "llm_raw"] = raw
    df.at[idx, "llm_parsed"] = parsed

    relevance = parsed.get("Relevance", None)
    skills_demanded = parsed.get("Skills Demanded", None)
    experience_required = parsed.get("Experience Required", None)
    experience_year = parsed.get("Experience Year", None)

    print(f"✅ Job {idx} | Relevance: {relevance} | Exp Required: {experience_required} | Skills: {skills_demanded} | Exp Year: {experience_year}")

    time.sleep(0.5)

# Flatten parsed column into separate columns.
# A plain list is passed so the result is always positionally indexed 0..n-1,
# matching df.reset_index(drop=True) in the column-wise concat below.
df_extracted = pd.json_normalize(df["llm_parsed"].tolist()).reset_index(drop=True)
final_df = pd.concat([df.reset_index(drop=True), df_extracted], axis=1)

final_df.head()

In [None]:
# Expand the dicts stored in "llm_parsed" into one column per key.
df_extracted = pd.json_normalize(df["llm_parsed"])
df_extracted = df_extracted.reset_index(drop=True)

# Place the expanded columns next to the original ones, aligned by row position.
final_df = pd.concat(
    [df.reset_index(drop=True), df_extracted],
    axis="columns",
)

final_df.head(5)

In [None]:
# Write the combined frame (original columns plus the flattened LLM fields)
# to disk; index=False leaves the row index out of the file.
final_df.to_csv('llm_response.csv', index=False)

In [None]:
# (rows, columns) of the combined frame.
final_df.shape

In [None]:
# Missing values per column in the combined frame.
final_df.isnull().sum()