# LoRA Fine-tuning : Roberta-base

In [1]:
!pip install datasets
!pip install evaluate



## Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords

import torch
from collections import Counter

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import DatasetDict, Dataset, load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate


import warnings
warnings.filterwarnings("ignore")


In [3]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"

## Reading the data

In [5]:
dataset = load_dataset("saideep-arikontham/jd_resume_dataset")
dataset

README.md:   0%|          | 0.00/612 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.97M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/637k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
    test: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
})

In [6]:
train_df = dataset['train'].to_pandas()
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1600
1,1600


In [7]:
val_df = dataset['validation'].to_pandas()
val_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


In [8]:
test_df = dataset['test'].to_pandas()
test_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


In [9]:
# Defining label2id
label2id = {'Bad Fit': 0, 'Good Fit': 1}

## Loading the model

In [10]:
# -------------------------------
# Load tokenizer and base model
# -------------------------------

model_name = "FacebookAI/roberta-base"  # change to your desired pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # adjust num_labels as needed

print(model)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Configure LoRA

In [11]:
# -------------------------------
# Configure LoRA fine-tuning
# -------------------------------

# Define a LoRA configuration. Adjust parameters (r, lora_alpha, etc.) as needed.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # for sequence classification
    r=8,                        # low rank parameter; experiment with this value
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.1,           # dropout probability for LoRA layers
    target_modules=["query", "value", "out_proj"]  # adjust based on your model architecture
)

# Wrap your model with LoRA. This freezes most of the model and inserts trainable LoRA layers.
model = get_peft_model(model, lora_config)

## Define Preprocessing function

In [12]:
# -------------------------------
# Improved tokenization approach for job and resume matching
# -------------------------------

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add("overqualified")
stop_words.add("underqualified")
stop_words.add("mismatch")
stop_words.add("good")

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text


def preprocess_function(examples):

    # Preprocess the text first
    examples["job_data"] = [preprocess_text(job) for job in examples["job_data"]]
    examples["resume_data"] = [preprocess_text(resume) for resume in examples["resume_data"]]

    # Process job descriptions and resumes separately with appropriate max lengths
    job_max_length = 256
    resume_max_length = 256

    # Tokenize job descriptions
    job_inputs = tokenizer(
        examples["job_data"],
        truncation=True,
        max_length=job_max_length,
        padding="max_length"
    )

    # Tokenize resumes
    resume_inputs = tokenizer(
        examples["resume_data"],
        truncation=True,
        max_length=resume_max_length,
        padding="max_length"
    )

    # Combine the tokenized inputs
    combined_inputs = {
        "input_ids": [],
        "attention_mask": []
    }

    for job_ids, job_mask, resume_ids, resume_mask in zip(
        job_inputs["input_ids"], job_inputs["attention_mask"],
        resume_inputs["input_ids"], resume_inputs["attention_mask"]
    ):
        # Get separator token ID
        separator_id = tokenizer.sep_token_id
        if separator_id is None:
            separator_id = tokenizer.eos_token_id

        # Combine tokens with separator
        combined_ids = job_ids + [separator_id] + resume_ids
        combined_mask = job_mask + [1] + resume_mask

        # Ensure we don't exceed the model's max input length
        max_model_length = 512
        if len(combined_ids) > max_model_length:
            combined_ids = combined_ids[:max_model_length]
            combined_mask = combined_mask[:max_model_length]

        combined_inputs["input_ids"].append(combined_ids)
        combined_inputs["attention_mask"].append(combined_mask)

    # Add labels
    combined_inputs["labels"] = examples["label"]

    return combined_inputs

# Apply the tokenization to all splits
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

## Setting Training parameters

In [13]:

# -------------------------------
# Setup training parameters
# -------------------------------

training_args = TrainingArguments(
    output_dir=f"/content/test1",
    evaluation_strategy="epoch",
    save_strategy="epoch",             # Set save strategy to epoch to match evaluation_strategy
    num_train_epochs=5,                # Adjust number of epochs as desired
    per_device_train_batch_size=16,    # Adjust based on your GPU memory
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,                # Keep only the latest checkpoint
    load_best_model_at_end=True,       # Load the best model when finished training (if metric provided)
    metric_for_best_model="accuracy",  # Choose your metric
)


## Defining Evaluation metrics

In [14]:
# -------------------------------
# Define a metric function for evaluation
# -------------------------------

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

## Training the model

In [15]:
# -------------------------------
# Create the Trainer and start training
# -------------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

# Train the model with LoRA fine-tuning
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaideepreddy1818[0m ([33msaideepreddy18[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.620277,0.58,0.771739,0.58,0.490044
2,No log,0.11539,0.9625,0.962512,0.9625,0.9625
3,0.413600,0.156723,0.9525,0.956621,0.9525,0.952393
4,0.413600,0.078071,0.975,0.975048,0.975,0.974999
5,0.098900,0.074243,0.975,0.975048,0.975,0.974999


TrainOutput(global_step=1000, training_loss=0.25621293640136716, metrics={'train_runtime': 1415.333, 'train_samples_per_second': 11.305, 'train_steps_per_second': 0.707, 'total_flos': 4253982326784000.0, 'train_loss': 0.25621293640136716, 'epoch': 5.0})

## Test Results

In [16]:
# -------------------------------
# Evaluate the final model on the test set
# -------------------------------

test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_results

{'eval_loss': 0.15572448074817657,
 'eval_accuracy': 0.9425,
 'eval_precision': 0.9443774949160202,
 'eval_recall': 0.9425,
 'eval_f1': 0.9424392014064855,
 'eval_runtime': 13.0595,
 'eval_samples_per_second': 30.629,
 'eval_steps_per_second': 1.914,
 'epoch': 5.0}

## Predicition Function

In [105]:
def preprocess_inference(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text


def tokenize_new_data(job_description, resume):
    # Preprocess input texts
    job_description = preprocess_inference(job_description)
    resume = preprocess_inference(resume)

    # Define the tokenizer settings
    job_max_length = 256
    resume_max_length = 256
    max_model_length = 512  # Ensure this matches what was used in training

    # Tokenize job description
    job_inputs = tokenizer(
        job_description,
        truncation=True,
        max_length=job_max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Tokenize resume
    resume_inputs = tokenizer(
        resume,
        truncation=True,
        max_length=resume_max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Get separator token ID
    separator_id = tokenizer.sep_token_id
    if separator_id is None:
        separator_id = tokenizer.eos_token_id

    # Combine tokens with separator
    combined_ids = torch.cat((job_inputs["input_ids"], torch.tensor([[separator_id]]), resume_inputs["input_ids"]), dim=1)
    combined_mask = torch.cat((job_inputs["attention_mask"], torch.tensor([[1]]), resume_inputs["attention_mask"]), dim=1)

    # Ensure we don't exceed the max length
    combined_ids = combined_ids[:, :max_model_length]
    combined_mask = combined_mask[:, :max_model_length]

    return {
        "input_ids": combined_ids,
        "attention_mask": combined_mask
    }


import torch
import torch.nn.functional as F

def infer(job_description, resume):
    # Tokenize input
    inputs = tokenize_new_data(job_description, resume)

    # Ensure model is in evaluation mode
    model.eval()

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits
    logits = outputs.logits

    # Compute softmax probabilities
    probs = F.softmax(logits, dim=-1)

    # Get predicted class
    predicted_class = torch.argmax(probs, dim=-1).item()

    # Get probability of class 1
    class_1_prob = probs[:, 1].item() if probs.shape[1] > 1 else probs.item()

    return predicted_class, class_1_prob



In [106]:
test_data = dataset['test'].to_pandas()
test_data

Unnamed: 0,job_data,resume_data,label,__index_level_0__
0,"CNC Machinist\nLoc Performance Products, LLC p...",**DONNA JONES**\n\n*donnajones@email.com* · (1...,0,160
1,"CNC Machinist\nLoc Performance Products, LLC p...",**Ann Nelson**\n(555) 555-5555 | ann.nelson@em...,0,161
2,"CNC Machinist\nLoc Performance Products, LLC p...",**Adam Johnson**\n\n*+1 (123) 456-7890* *|* *a...,1,162
3,"CNC Machinist\nLoc Performance Products, LLC p...","# MARK MURRAY\n\n*Plymouth, MI* *|* *555-555-5...",1,163
4,CNC Machinist\nJC Ford - Tennessee is seeking ...,"# Tim Young\n\n*Nashville, TN* *|* *(615) 555-...",0,164
...,...,...,...,...
395,Retail Store Manager\nAt Nespresso we place pe...,# WILLIAM BROWN\n\n*Email*: william.brown@emai...,1,3075
396,Retail Store Manager\nThe Retail Store Manager...,**Cory Mullins**\n*555-555-5555* *cory.mullins...,0,3076
397,Retail Store Manager\nThe Retail Store Manager...,**Catherine Church**\n*+1 (xxx) xxx- xxxx* *|*...,0,3077
398,Retail Store Manager\nThe Retail Store Manager...,**Franklin Gardner**\n*Phone: (555) 555-5555* ...,1,3078


In [112]:
rd="""
A Data Scientist with 3 years of experience in developing machine learning pipelines and models to address business needs. Also
experienced in communicating insights to enhance strategic decision-making and improve performance outcomes.
EDUCATION
Master Of Science In Data Science
Northeastern University | GPA: 3.93 / 4.00 Portland, Maine | Sep 2023
- May 2025
Bachelor Of Technology In Computer Science
Vardhaman College of Engineering | GPA: 8.84 / 10.00 Hyderabad | May 2017
- Jun 2021
WORK EXPERIENCE
Data Engineer Co-Op | Full-time Jul 2024
- Present
EAI at Northeastern University
• Engineered an end-to-end claim complexity scoring ML pipeline for MEMIC, negating ~3 minutes of mannual classification per claim.
• Developed a robust claim complexity scoring model and reduced false negatives by 20%, streamlining the Claim Assignment Engine's efficiency within 6 months.
• Identified an issue with prediction validation, which led to employing advanced testing validated by statistical tests.
• Revised the existing claim assignment process to automatically assign claims classified by scoring engine to agents by prioritizing highly complex claims, achieving 100% automation.
• Communicated weekly insights to clients, enhancing strategic decision-making and improving performance outcomes through
experimental findings and suggesting actionable plans.
Azure Data Engineer | Full-time Aug 2021
- Jul 2023
LTIMindtree
• Resolved customer issues from Azure cloud services with an average solution acceptance rate of 50%, consistently for over one year.
• Utilized Azure Databricks data engineering environment to solve business problems for customers and improve data quality by 35%.
• Automated ETL pipelines for big data extraction and processing using Data factory and Dataflows, reducing processing time by 40%.
• Recognized as a highly motivated member of Azure Collective on Stack Overflow, contributing 404 answers and reaching 428k people.
• Developed a Question Moderator Model that improved question validation efficiency, saving 10% of time monthly by minimizing
invalid question engagement.
PROJECTS
Application Ranking System Sep 2024
- Apr 2025
• Built an AI-powered Application Ranking System using cosine similarity, FAISS, and a custom scoring formula for efficient ranking.
Aims to improve hiring by reducing bias, rewarding well-crafted resumes, and offering a fair ranking system for job applications.
Stance Detection Model Using Generative AI
• Developed a Huggingface space using fine-tuned text classification LLM to detect the stance using manually labeled and synthetic data.
Leveraged LoRA fine-tuning to train only ~1% of LLM's parameters in Roberta base model's architecture to achieve 0.80 accuracy.

SKILLS
Programming: Python, SQL Server, Java, Data Structures and Algorithms, Problem-
solving skills
Machine Learning: Machine Learning, Regression, Classification, Data Cleaning, Data preprocessing, Data transformation, Feature
Engineering, Data Visualization, XGBoost, Cross Validation, Deep learning, Neural Networks, Natural Language Processing, Transformers,
Sentiment Analysis, Huggingface, Large Language models, Fine tuning, Prompt Engineering, Generative AI, RAG, Agentic AI
Libraries: NumPy, Pandas, Matplotlib, Scikit-
Learn, xgboost, SciPy, spaCy, Pytorch, Tensorflow, Optuna, Langchain, ollama, transformers
Others: Jupyter, Azure, Git, SSMS, PowerBI, Microsoft Excel, Databricks, PySpark, Documentation, Statistical analysis
"""

jd1="""
Construction project engineer
JARDINE is a premier construction management and general contracting firm based in Centerville, Utah. We specialize in commercial construction with a focus on historic renovations, mountain and resort construction, light industrial projects, pharmaceutical and nutraceutical facilities, clean rooms, lab spaces, retail environments, food & beverage spaces, and tenant improvements. Our approach prioritizes collaborative project delivery methods such as Construction Management/General Contracting, Design-Build, Design Assist, and Cost-Plus to ensure successful project outcomes.

Role Description
JARDINE is seeking a full-time, on-site Construction Project Engineer to join our team in the Salt Lake City Metropolitan Area. This role is instrumental in supporting project execution, ensuring smooth coordination between field teams, subcontractors, and project management. The Construction Project Engineer will be actively involved in project coordination, construction engineering, RFI's, submittals, and communication to drive project success.

Key Responsibilities
Project Coordination: Support project managers and field teams in scheduling, logistics, and workflow efficiency.
Construction Engineering: Assist in design reviews, constructability analysis, and technical problem-solving.
Quality Control: Ensure compliance with project specifications, safety standards, and best practices.
Submittals & RFIs: Prepare, track, and manage submittals, RFIs, and change orders throughout the project lifecycle.
Communication: Maintain clear and effective communication between project stakeholders, including subcontractors, suppliers, and clients.
Documentation & Reporting: Keep detailed records of project progress, budgets, and schedules using construction management software.
Qualifications
Strong project coordination and construction engineering skills.
Experience in RFI and submittal processes.
Excellent written and verbal communication skills.
Proficiency in construction project management software (e.g., Procore, Bluebeam, MS Project).
Ability to read and interpret construction drawings and specifications.
Bachelor’s degree in Construction Management, Civil Engineering, or a related field.
Relevant certifications such as PE (Professional Engineer) or PMP (Project Management Professional) are a plus.
Why Join JARDINE?
Competitive salary based on experience.
Health benefits, 401(k), and paid time off.
Opportunities for career growth and professional development.
A collaborative, team-oriented work environment.

Project Coordination, Construction Engineering, RFI Process, Submittals Management, Quality Control, Scheduling, Logistics, Workflow Efficiency, Constructability Analysis, Technical Problem-Solving, Compliance with Safety Standards, Change Order Management, Stakeholder Communication, Documentation, Reporting, Construction Management Software (Procore, Bluebeam, MS Project), Construction Drawings Interpretation, Budget Tracking, Civil Engineering, Construction Management, Project Management, Professional Certifications (PE, PMP).
"""


jd2 = """
We are seeking a Data Scientist with strong Data Engineering expertise to join our AI-driven team. The ideal candidate will have experience building end-to-end ML pipelines, optimizing ETL processes, and leveraging cloud-based data platforms to support scalable AI solutions. You will work closely with cross-functional teams to develop and deploy ML models that enhance decision-making and improve business operations.

Key Responsibilities:
Develop and optimize machine learning pipelines for predictive modeling, classification, and NLP tasks.
Engineer claim complexity scoring models and automate classification processes to streamline operations.
Design and build ETL pipelines using Azure Databricks, Data Factory, and PySpark to process large datasets efficiently.
Apply feature engineering, cross-validation, and hyperparameter tuning (Optuna, XGBoost, Transformers) for model improvement.
Implement Hugging Face-based fine-tuning (LoRA, Prompt Engineering) for LLM-powered applications such as Stance Detection and Generative AI models.
Collaborate with data engineers to design scalable data warehouses and big data solutions for AI-driven insights.
Work with Azure Cloud Services to manage data pipelines, storage, and AI workloads.
Communicate insights through Power BI, statistical reports, and stakeholder presentations.
Maintain best practices in code documentation, model validation, and deployment workflows.

Education:

MS in Data Science, Computer Science, or related field (or equivalent experience).
Technical Skills:

Programming: Python, SQL, Java, Data Structures & Algorithms.
ML & AI: Regression, Classification, Feature Engineering, NLP, Hugging Face, Transformers, Deep Learning (Pytorch, TensorFlow), LLM Fine-Tuning.
Cloud & Data Engineering: Azure Databricks, Data Factory, PySpark, Data Warehousing, ETL Pipelines.
Data Science Libraries: NumPy, Pandas, Scikit-learn, XGBoost, LangChain, ollama.
Visualization & Reporting: Power BI, Matplotlib, Excel, Statistical Analysis.
Experience:

3+ years of hands-on experience in Data Science & Machine Learning (with a focus on AI-driven automation).
2+ years in Data Engineering, optimizing Azure-based ETL workflows and big data pipelines.
Strong problem-solving skills with a track record of AI-driven automation solutions.

Work on cutting-edge ML & AI projects that drive real-world business impact.
 Collaborate with top-tier AI researchers, engineers, and data scientists.
 Fully remote / flexible work culture with competitive salary & benefits.
 Career growth in AI, Data Engineering, and Cloud Computing.
"""


jd3 = """
wowbrands is a premiere small business solution provider based in Columbus, Ohio, offering affordable and customized digital marketing services. The agency specializes in website design, online and print marketing/branding, naming and logo design, as well as incorporation assistance. wowbrands caters to individuals starting small to medium businesses and existing companies looking to enhance their online presence through holistic digital marketing strategies.

 Role Description

This is a full-time remote role for a Data Engineer at wowbrands. The Data Engineer will be responsible for tasks such as data modeling, ETL processes, data warehousing, and data analytics. They will play a key role in managing and optimizing data processes to support the company's digital marketing strategies.

 Qualifications

Data Engineering and Data Modeling skills
Experience in Extract Transform Load (ETL) processes
Data Warehousing and Data Analytics capabilities
Strong problem-solving and analytical skills
Proficiency in SQL and other database technologies
Excellent communication and collaboration abilities
Experience with digital marketing data is a plus
Bachelor's or Master's degree in Computer Science, Data Science, or related field
"""

In [113]:
infer(jd1, rd)

(0, 0.02135917730629444)

In [114]:
infer(jd2, rd)

(1, 0.845534086227417)

In [115]:
infer(jd3, rd)

(1, 0.6511306762695312)