In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

In [3]:
import torch
import numpy as np
import pandas as pd
import string
import re
from pypdf import PdfReader
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Loading data

In [5]:
import pandas as pd
from bs4 import BeautifulSoup as bs

candidate_df = pd.read_csv("/kaggle/input/resume-dataset/Resume/Resume.csv")
candidate_df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
candidate_df.shape

(2484, 4)

In [7]:
candidate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB


In [8]:
len_classes = candidate_df.Category.nunique()
len_classes

24

## Preprocessing

In [17]:
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = "".join(page.extract_text() for page in reader.pages)
    return text

def preprocess_text(text):
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    sentences = sent_tokenize(text)
    features = {"features": ""}
    for sentence in sentences:
        if any(criteria in sentence for criteria in ["experience", "work", "position", "skills", "qualifications"]):
            words = word_tokenize(sentence)
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words)
            filtered_words = [w for w, t in tagged_words if t not in ['DT', 'IN', 'PRP', 'WP', 'TO']]
            features["features"] += " ".join(filtered_words)
    return features

def preprocess_resume_data(df):
    id, category = df["ID"], df["Category"]
    text = extract_text_from_pdf(f"/kaggle/input/resume-dataset/data/data/{category}/{id}.pdf")
    features = preprocess_text(text)
    df["Features"] = features["features"]
    return df


In [18]:
# Reading the dataset
resume_data = pd.read_csv("/kaggle/input/resume-dataset/Resume/Resume.csv")
resume_data = resume_data.drop(["Resume_html"], axis=1)
resume_data = resume_data.apply(preprocess_resume_data, axis=1)
resume_data = resume_data.drop(columns=['Resume_str'])
resume_data.to_csv("resume_data.csv", index=False)

KeyboardInterrupt: 

In [19]:
#solving error
df = pd.read_csv("/kaggle/working/resume_data.csv")
df.head()

Unnamed: 0,ID,Category,Features
0,16852973,HR,hr administrator marketing associate hr admini...
1,22323967,HR,hr specialist hr operations summary media prof...
2,33176873,HR,hr director summary years experience recruitin...
3,27018550,HR,hr specialist summary dedicated driven dynamic...
4,17812897,HR,hr manager skill highlights hr skills hr depar...


In [20]:
df.shape

(2484, 3)

In [21]:
isnulldf = df.isnull()
columns_containing_nulls = isnulldf.columns[isnulldf.any()]
rows_containing_nulls = df[isnulldf[columns_containing_nulls].any(axis='columns')].index
only_nulls_df = df[columns_containing_nulls].loc[rows_containing_nulls]
print(only_nulls_df)

    Features
656      NaN


In [22]:
df.iloc[656]

ID                      12632728
Category    BUSINESS-DEVELOPMENT
Features                     NaN
Name: 656, dtype: object

In [23]:
df = df.dropna()
df.head()

Unnamed: 0,ID,Category,Features
0,16852973,HR,hr administrator marketing associate hr admini...
1,22323967,HR,hr specialist hr operations summary media prof...
2,33176873,HR,hr director summary years experience recruitin...
3,27018550,HR,hr specialist summary dedicated driven dynamic...
4,17812897,HR,hr manager skill highlights hr skills hr depar...


In [24]:
df.shape

(2483, 3)

In [25]:
df.to_csv("/kaggle/working/clean_resume_data.csv", index= False)

In [26]:
from sklearn.model_selection import train_test_split

# Example loading and splitting the dataset
df = pd.read_csv('/kaggle/working/clean_resume_data.csv')
train_df, test_df = train_test_split(df, 
                                     test_size=0.2, 
                                     random_state=42,
                                     stratify= df["Category"])

print(f"Training dataset shape: {train_df.shape}")
print(f"Testing dataset shape: {test_df.shape}")

# Assuming 'text' is the column with resume text and 'label' is the target label
train_texts = train_df['Features'].tolist()
train_labels = train_df['Category'].tolist()
test_texts = test_df['Features'].tolist()
test_labels = test_df['Category'].tolist()

print(f"Length of train texts: {len(train_texts)}\nLength of train labels: {len(train_labels)}")
print(f"Length of text texts: {len(test_texts)}\nLength of test labels: {len(test_labels)}")

Training dataset shape: (1986, 3)
Testing dataset shape: (497, 3)
Length of train texts: 1986
Length of train labels: 1986
Length of text texts: 497
Length of test labels: 497


In [27]:
df.Category.value_counts(ascending=True)

Category
BPO                        22
AUTOMOBILE                 36
AGRICULTURE                63
DIGITAL-MEDIA              96
APPAREL                    97
TEACHER                   102
ARTS                      103
DESIGNER                  107
HR                        110
PUBLIC-RELATIONS          111
CONSTRUCTION              112
BANKING                   115
CONSULTANT                115
HEALTHCARE                115
SALES                     116
FITNESS                   117
AVIATION                  117
ADVOCATE                  118
FINANCE                   118
CHEF                      118
ACCOUNTANT                118
ENGINEERING               118
BUSINESS-DEVELOPMENT      119
INFORMATION-TECHNOLOGY    120
Name: count, dtype: int64

In [28]:
train_df.Category.value_counts(ascending=True)

Category
BPO                       18
AUTOMOBILE                29
AGRICULTURE               50
DIGITAL-MEDIA             77
APPAREL                   78
TEACHER                   82
ARTS                      82
DESIGNER                  86
HR                        88
PUBLIC-RELATIONS          89
CONSTRUCTION              90
HEALTHCARE                92
CONSULTANT                92
BANKING                   92
AVIATION                  93
SALES                     93
FINANCE                   94
CHEF                      94
FITNESS                   94
ADVOCATE                  94
ENGINEERING               94
ACCOUNTANT                94
BUSINESS-DEVELOPMENT      95
INFORMATION-TECHNOLOGY    96
Name: count, dtype: int64

In [29]:
test_df.Category.value_counts(ascending=True)

Category
BPO                        4
AUTOMOBILE                 7
AGRICULTURE               13
DIGITAL-MEDIA             19
APPAREL                   19
TEACHER                   20
DESIGNER                  21
ARTS                      21
PUBLIC-RELATIONS          22
HR                        22
CONSTRUCTION              22
SALES                     23
BANKING                   23
HEALTHCARE                23
FITNESS                   23
CONSULTANT                23
BUSINESS-DEVELOPMENT      24
ENGINEERING               24
FINANCE                   24
INFORMATION-TECHNOLOGY    24
ADVOCATE                  24
CHEF                      24
ACCOUNTANT                24
AVIATION                  24
Name: count, dtype: int64

## Creating tokenizer,dataset and training data

In [30]:
type(train_texts), len(train_texts)

(list, 1986)

In [31]:
type(train_texts[0]), len(train_texts[0])

(str, 4372)

In [32]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

In [33]:
import joblib
joblib.dump(label_encoder, "/kaggle/working/label_encoder.pkl")

['/kaggle/working/label_encoder.pkl']

In [183]:
train_labels.max(), train_labels.min()

(23, 0)

In [184]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [185]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the training and testing data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

In [186]:
import torch

class ResumeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ResumeDataset(train_encodings, train_labels)
test_dataset = ResumeDataset(test_encodings, test_labels)

In [187]:
train_dataset.__len__()

1986

In [188]:
train_dataset.__getitem__(0)

{'input_ids': tensor([  101, 17850, 10669, 12654,  2034,  3105,  7027,  4341, 27576,  6896,
          4105,  2458,  2500,  2529,  4219,  2658, 14293,  2551,  2047,  3095,
          2372,  2393,  3144,  3029,  2529,  4219,  3325, 11539,  2236,  2923,
         10198,  2583,  9002,  2488,  3672,  3029,  2377,  3145,  2535,  4852,
         20125, 11194, 11637, 17850,  6043,  8853, 11532,  7904, 14812,  2458,
          3095,  2731,  2458,  2047,  7904,  9405,  9405,  6107,  2375,  3716,
         26854, 11532,  6666,  8911,  4114,  9319, 18777,  3012,  7513,  2436,
          7621, 17571,  7065, 16613,  2098, 10296,  2832,  2047, 28208,  7528,
          2194,  2898,  3687,  2569,  5038, 12697,  7678,  2048,  2095, 18322,
          2622,  2086,  6951,  2636,  2363,  7904,  2095,  5151,  4056,  2326,
          2805,  2449,  6194,  2136,  7276,  2177, 17908,  2015,  2717,  6820,
         14890,  2015,  2449,  2832,  3029,  3325,  2194,  2171,  2103,  2110,
         17850, 10669,  2783,  2611, 10

In [189]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',                  # output directory
    run_name='bert_finetune_lr_5e-5_bs_32_epochs_5',    
    num_train_epochs=5,                      # total number of training epochs
    per_device_train_batch_size=32,          # batch size per device during training
    per_device_eval_batch_size=64,           # batch size for evaluation
    warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    logging_dir='./logs',                    # directory for storing logs
    logging_steps=50,                        # log every 50 steps
    eval_strategy="steps",              # evaluate every 'eval_steps'
    eval_steps=100,                          # evaluate every 100 steps
    save_strategy="steps",                    # save model every 'save_steps'
    save_steps=500,                          # save model every 500 steps
    learning_rate=5e-5,                      # set a learning rate
    gradient_accumulation_steps=2,           # accumulate gradients over 2 steps
    fp16=True,                                # enable mixed precision training
)

In [190]:
from transformers import BertForSequenceClassification, Trainer
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_labels)))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [191]:
type(trainer), trainer

(transformers.trainer.Trainer,
 <transformers.trainer.Trainer at 0x7f45e87ba4d0>)

In [192]:
train_dataset

<__main__.ResumeDataset at 0x7f45e873f940>

In [193]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [194]:
try:
    from torchinfo import summary
except:
    !pip install -q torchinfo
    from torchinfo import summary

# summary(model,
#         input_size= [32, 512],
#         col_names=["input_size", "output_size", "num_params", "trainable"])
summary(model)

Layer (type:depth-idx)                                       Param #
BertForSequenceClassification                                --
├─BertModel: 1-1                                             --
│    └─BertEmbeddings: 2-1                                   --
│    │    └─Embedding: 3-1                                   23,440,896
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─BertEncoder: 2-2                                      --
│    │    └─ModuleList: 3-6                                  85,054,464
│    └─BertPooler: 2-3                                       --
│    │    └─Linear: 3-7                                      590,592
│    │    └─Tanh: 3-8                                        --
├─Dropout: 1-2                                               --
├─L

In [195]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,3.0757,2.966376,0.158954


TrainOutput(global_step=155, training_loss=2.975350558373236, metrics={'train_runtime': 490.006, 'train_samples_per_second': 20.265, 'train_steps_per_second': 0.316, 'total_flos': 2578997669068800.0, 'train_loss': 2.975350558373236, 'epoch': 4.920634920634921})

In [196]:
trainer.evaluate()

{'eval_loss': 2.3013904094696045,
 'eval_accuracy': 0.5271629778672032,
 'eval_runtime': 8.1299,
 'eval_samples_per_second': 61.132,
 'eval_steps_per_second': 0.984,
 'epoch': 4.920634920634921}

In [203]:
model.save_pretrained('/kaggle/working/models/fine_tuned_model')
tokenizer.save_pretrained('/kaggle/working/models/fine_tuned_model')

('/kaggle/working/models/fine_tuned_model/tokenizer_config.json',
 '/kaggle/working/models/fine_tuned_model/special_tokens_map.json',
 '/kaggle/working/models/fine_tuned_model/vocab.txt',
 '/kaggle/working/models/fine_tuned_model/added_tokens.json')

## download the models directory on pc

In [204]:
import shutil

# Specify the directory you want to zip
output_dir = '/kaggle/working/models/fine_tuned_model'
shutil.make_archive(output_dir, 'zip', output_dir)

'/kaggle/working/models/fine_tuned_model.zip'

In [225]:
# Assuming you have a model named 'model'
torch.save(model.state_dict(), '/kaggle/working/model_state.pth')

In [228]:
%cd /kaggle/working

/kaggle/working


In [229]:
# from IPython.display import FileLink -> FileLink(r'*name of file*')
from IPython.display import FileLink
FileLink(r'model_state.pth')

In [1]:
# from IPython.display import FileLink -> FileLink(r'*name of file*')
from IPython.display import FileLink
FileLink(r'models.zip')

## RAG implementation

In [207]:
###link will be provided here

### Preprocess data for RAG

In [206]:
!pip install -q elasticsearch

  pid, fd = os.forkpty()


Collecting elasticsearch
  Downloading elasticsearch-8.14.0-py3-none-any.whl.metadata (7.2 kB)
Collecting elastic-transport<9,>=8.13 (from elasticsearch)
  Downloading elastic_transport-8.13.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.14.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.2/480.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading elastic_transport-8.13.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.5/64.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.13.1 elasticsearch-8.14.0


In [230]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd

# Connect to Elasticsearch
es = Elasticsearch("https://localhost:9200")

if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

# Load your candidate data
candidates_df = pd.read_csv('/kaggle/input/resume-sample-data-for-rag/rag_data.csv')

print(candidates_df.head())

Could not connect to Elasticsearch.
            Name            Contact Details       Location  \
0       John Doe       john.doe@example.com       New York   
1     Jane Smith     jane.smith@example.com  San Francisco   
2    Bob Johnson    bob.johnson@example.com       New York   
3    Alice Brown    alice.brown@example.com        Chicago   
4  Michael Green  michael.green@example.com         Boston   

                         Job Skills Experience  \
0            Java; Big Data; Hadoop    5 years   
1        JavaScript; React; Node.js    3 years   
2       Java; Spring; Microservices    6 years   
3  Python; Django; Machine Learning    4 years   
4           Ruby; Rails; PostgreSQL    7 years   

                                            Projects  \
0  Developed a big data processing system using H...   
1  Built a real-time chat application using React...   
2  Implemented a microservices architecture for a...   
3  Created a machine learning model for predictiv...   
4  Develop

In [221]:
candidates_df["ID"] = range(len(candidates_df))
candidates_df.head()

Unnamed: 0,Name,Contact Details,Location,Job Skills,Experience,Projects,Comments,ID
0,John Doe,john.doe@example.com,New York,Java; Big Data; Hadoop,5 years,Developed a big data processing system using H...,Strong problem-solving skills.,0
1,Jane Smith,jane.smith@example.com,San Francisco,JavaScript; React; Node.js,3 years,Built a real-time chat application using React...,Excellent in team collaboration.,1
2,Bob Johnson,bob.johnson@example.com,New York,Java; Spring; Microservices,6 years,Implemented a microservices architecture for a...,Proactive and detail-oriented.,2
3,Alice Brown,alice.brown@example.com,Chicago,Python; Django; Machine Learning,4 years,Created a machine learning model for predictiv...,Strong analytical skills.,3
4,Michael Green,michael.green@example.com,Boston,Ruby; Rails; PostgreSQL,7 years,Developed a scalable web application using Rub...,Great leadership qualities.,4


In [223]:
# Prepare the data for indexing
def generate_candidates_data(candidates_df):
    for _, row in candidates_df.iterrows():
        yield {
            "_index": "candidates",
            "_id": row['ID'],  # Assuming there's a unique ID for each candidate
            "_source": {
                "name": row['Name'],
                "contact": row['Contact Details'],
                "location": row['Location'],
                "skills": row['Job Skills'],
                "experience": row['Experience'],
                "projects": row["Projects"],
                "comments": row["Comments"],
            }
        }


In [None]:
# Index the candidate profiles
helpers.bulk(es, generate_candidates_data(candidates_df))

In [None]:
def retrieve_candidates(job_description, top_k=10):
    query = {
    "query": {
        "multi_match": {
            "query": job_description,
            "fields": ["skills^2", "experience", "projects"],
            "fuzziness": "AUTO"
                    }
                }
            }

    response = es.search(index="candidates", body=query, size=top_k)
    return response['hits']['hits']  # Returns the top K candidate profiles


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer
# model = AutoModelForSequenceClassification.from_pretrained('./fine_tuned_model')
# tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_model')

def generate_response(job_description, candidates):
    candidate_profiles = [hit['_source']['profile'] for hit in candidates]
    input_text = f"Job Description: {job_description}\nCandidates:\n" + "\n".join(candidate_profiles)
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    
    # Generate outputs
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Process the outputs (e.g., apply softmax to get probabilities)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    
    return probabilities


In [None]:
def match_candidates(job_description):
    # Step 1: Retrieve candidates
    retrieved_candidates = retrieve_candidates(job_description)
    
    # Step 2: Generate response
    response = generate_response(job_description, retrieved_candidates)
    
    return response


In [4]:
# if __name__ == "__main__":
#     job_description = input("Enter the job description: ")
#     matched_candidates = match_candidates(job_description)
#     print("Matched Candidates Probabilities:\n", matched_candidates)

job_description = "Provide 10 candidate from agriculture department"
matched_candidates = match_candidates(job_description)
print("Matched Candidates Probabilities:\n", matched_candidates)


NameError: name 'match_candidates' is not defined