In [3]:
import os

%pwd




'/Users/satwiksahoo/Desktop/CodeBasics/machine learning/krish naik/NLP project/ATSresume/RESUME_ATS/research'

In [6]:
# os.chdir('../')

In [49]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings bundle for the data-transformation stage.

    ``ConfigurationManager.get_data_transformation_config`` fills each field
    from the attribute of the same name in the ``data_transformation`` section
    of the loaded YAML config.
    """

    # Locations on disk (presumably relative to the project root -- confirm).
    root_dir: Path
    train_path: Path
    test_path: Path
    transformed_train_path: Path
    transformed_test_path: Path
    # NOTE(review): annotated as Path, but these look like model / tokenizer
    # identifiers rather than filesystem paths -- confirm against config.yaml.
    model_name: Path
    tokenizer_name: Path

In [50]:
from src.ATS_RESUME.constants import *
from src.ATS_RESUME.utils.common import read_yaml , create_directories

In [51]:
class ConfigurationManager:
    """Load the YAML config / params files and build per-stage config objects."""

    def __init__(self , config_file_path = CONFIG_FILE_PATH , params_file_path = PARAMS_FILE_PATH):
        """Read both YAML files and hand the artifact root to ``create_directories``.

        Args:
            config_file_path: Location of the main config YAML.
            params_file_path: Location of the params YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifact_root = self.config.artifact_root
        create_directories([artifact_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build a ``DataTransformationConfig`` from the ``data_transformation`` section.

        Returns:
            A config whose fields are copied one-to-one from the attributes of
            the same name on ``self.config.data_transformation``.
        """
        section = self.config.data_transformation
        field_names = (
            "root_dir",
            "train_path",
            "test_path",
            "transformed_train_path",
            "transformed_test_path",
            "model_name",
            "tokenizer_name",
        )
        return DataTransformationConfig(
            **{field: getattr(section, field) for field in field_names}
        )
        

        

In [56]:
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer , pipeline

from datasets import Dataset
import numpy as np

from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_dataset
import pickle


class DataTransformation:
    """Turn the ingested resume / job-description CSVs into training artifacts.

    Two pipelines are provided:

    * ``data_transformation_`` cleans and tokenizes each (resume, job
      description) pair and saves Hugging Face ``Dataset`` objects with an
      integer ``labels`` column.
    * ``data_transformation`` builds ``InputExample`` text pairs with a float
      label and pickles the resulting lists.

    Both read the CSVs named by ``config.train_path`` / ``config.test_path``
    and expect the columns ``resume_text``, ``job_description_text`` and
    ``label``.
    """

    # Integer class id per label, used by the tokenizing pipeline.
    LABEL_TO_ID = {'No Fit': 0, 'Potential Fit': 1, 'Good Fit': 2}
    # Float label per class, used by the InputExample pipeline.
    LABEL_TO_SCORE = {"No Fit": 0.0, "Potential Fit": 0.5, "Good Fit": 1.0}

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``train_path``, ``test_path``,
                ``transformed_train_path`` and ``transformed_test_path``.
        """
        self.config = config

    def preprocess_text(self, text):
        """Normalize one free-text field.

        Removes ``<...>`` tags, collapses every whitespace run (including line
        breaks) to a single space, trims the ends and lower-cases the result.

        Args:
            text: Raw cell value. ``None`` / NaN (how pandas represents an
                empty CSV cell) is treated as an empty string; any other
                non-string value is converted with ``str``.

        Returns:
            The cleaned, lower-cased string.
        """
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
        # Strip tags first so the blanks they leave behind are collapsed by
        # the whitespace pass; DOTALL lets a tag span a line break.
        text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    def _read_splits(self):
        """Read the train and test CSVs named by the config."""
        return pd.read_csv(self.config.train_path), pd.read_csv(self.config.test_path)

    def _tokenize_split(self, df, tokenizer, max_length):
        """Clean and tokenize one split, returning a ``Dataset`` with ``labels``."""
        resumes = df['resume_text'].apply(self.preprocess_text)
        jobs = df['job_description_text'].apply(self.preprocess_text)
        encodings = [
            tokenizer(resume, job, padding='max_length', truncation=True, max_length=max_length)
            for resume, job in zip(resumes, jobs)
        ]
        # One column per tokenizer output key, one row per example.
        features = pd.DataFrame(encodings)
        # Labels missing from LABEL_TO_ID become NaN rather than raising.
        features['labels'] = df['label'].map(self.LABEL_TO_ID).to_numpy()
        return Dataset.from_pandas(features, preserve_index=False)

    def _build_examples(self, df):
        """Build one ``InputExample`` per row; raises ``KeyError`` on an unknown label."""
        return [
            InputExample(texts=[resume, job], label=self.LABEL_TO_SCORE[label])
            for resume, job, label in zip(df['resume_text'], df['job_description_text'], df['label'])
        ]

    def data_transformation_(self, tokenizer_name="distilbert-base-uncased", max_length=256):
        """Tokenize both splits and save them with ``Dataset.save_to_disk``.

        Args:
            tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
                The default is the value that used to be hard-coded here.
            max_length: Padding / truncation length for every encoded pair.

        Only the tokenizer is loaded; no model is needed to encode the text.
        """
        df_train, df_test = self._read_splits()
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        train_dataset = self._tokenize_split(df_train, tokenizer, max_length)
        test_dataset = self._tokenize_split(df_test, tokenizer, max_length)

        targets = (
            (train_dataset, self.config.transformed_train_path),
            (test_dataset, self.config.transformed_test_path),
        )
        for dataset, out_dir in targets:
            parent = os.path.dirname(out_dir)
            # dirname is '' for a bare directory name; makedirs('') would raise.
            if parent:
                os.makedirs(parent, exist_ok=True)
            dataset.save_to_disk(out_dir)

    def data_transformation(self):
        """Build ``InputExample`` pairs for both splits and pickle them.

        Writes ``train.pkl`` inside ``config.transformed_train_path`` and
        ``test.pkl`` inside ``config.transformed_test_path``, creating those
        directories when they do not exist yet.

        Raises:
            KeyError: If a row's ``label`` is not in ``LABEL_TO_SCORE``.
        """
        df_train, df_test = self._read_splits()

        # Build both lists before touching the disk so that a bad label does
        # not leave a half-written output behind.
        train_examples = self._build_examples(df_train)
        test_examples = self._build_examples(df_test)

        targets = (
            (train_examples, self.config.transformed_train_path, "train.pkl"),
            (test_examples, self.config.transformed_test_path, "test.pkl"),
        )
        for examples, out_dir, file_name in targets:
            # The pickle lives inside out_dir, so out_dir itself must exist,
            # not just its parent.
            os.makedirs(out_dir, exist_ok=True)
            with open(os.path.join(out_dir, file_name), "wb") as f:
                pickle.dump(examples, f)

        
    
        
    
        
        
        
        
        


        
        
        
        

In [None]:
# import pandas as pd

# df_train  = pd.read_csv('./artifacts/data_ingestion/train.csv')


# df_train.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [57]:
# Driver cell: load the YAML-backed configuration, extract the
# data-transformation settings, and run the InputExample pipeline
# (`data_transformation`, which pickles the train / test example lists).
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
data_transformation.data_transformation()

[2025-08-19 23:48:25,354: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-08-19 23:48:25,357: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-19 23:48:25,358: INFO: common: created directory at: artifacts]
