Environment Setup

In [1]:
!pip3 install transformers datasets torch scikit-learn pandas

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Using cached multiprocess-0.70.16-py312-none-any.whl (146 kB)
Using cached pyarrow-18.1.0-cp312-cp312-win_amd64.whl (25.1 MB)
Using cached xxhash-3.5.0-cp3

Code for Fine-Tuning DistilBERT

In [5]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import pandas as pd

# Load dataset
csv_path = "D:/Lusak.tech/combined_resumes.csv"  # Replace with a relative path if needed
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"Dataset file not found at: {csv_path}")

df = pd.read_csv(csv_path)

# Ensure dataset contains required columns
if "text" not in df.columns or "label" not in df.columns:
    raise ValueError("The dataset must contain 'text' and 'label' columns.")

# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Prepare datasets for PyTorch
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = CustomDataset(train_encodings, train_labels.tolist())
val_dataset = CustomDataset(val_encodings, val_labels.tolist())

# Load pre-trained DistilBERT model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(df['label'].unique())
)
model.to(device)

# Use weighted loss to handle class imbalance
class_weights = torch.tensor(
    [len(df) / (df['label'].value_counts()[i] * 2) for i in sorted(df['label'].unique())],
    dtype=torch.float
).to(device)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Define optimizer and dataloaders
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Train the model for fine-tuning
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss  # Use the model's built-in loss for classification tasks
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}: Train Loss = {total_loss / len(train_loader)}")

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == batch['labels']).sum().item()
            total += batch['labels'].size(0)

    print(f"Validation Accuracy = {correct / total * 100:.2f}%, Validation Loss = {val_loss / len(val_loader):.4f}")


    # Validation phase
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average="weighted", zero_division=1)
    print(f"Validation F1-Score = {f1:.2f}")

# Save the trained model and tokenizer
model.save_pretrained("./fine_tuned_distilbert")
tokenizer.save_pretrained("./fine_tuned_distilbert")

# Classification report
print("Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["Human-written", "AI-generated"], zero_division=1))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss = 0.22096836803531325
Validation Accuracy = 100.00%, Validation Loss = 0.0556
Validation F1-Score = 1.00
Epoch 2: Train Loss = 0.030748086364788783
Validation Accuracy = 100.00%, Validation Loss = 0.0099
Validation F1-Score = 1.00
Epoch 3: Train Loss = 0.00847085122950375
Validation Accuracy = 100.00%, Validation Loss = 0.0040
Validation F1-Score = 1.00
Classification Report:
               precision    recall  f1-score   support

Human-written       1.00      1.00      1.00         5
 AI-generated       1.00      1.00      1.00        68

     accuracy                           1.00        73
    macro avg       1.00      1.00      1.00        73
 weighted avg       1.00      1.00      1.00        73



Testing

In [5]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import json
import chardet

# Function to detect file encoding using chardet
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilbert")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilbert")

# Function to predict AI or human-written resume
def detect_resume_type(resume_text):
    inputs = tokenizer(resume_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()
    return "AI-generated" if prediction == 1 else "Human-written"

# Recursive function to extract text from JSON
def extract_text_from_json(data, text=""):
    if isinstance(data, dict):
        for key, value in data.items():
            text = extract_text_from_json(value, text)
    elif isinstance(data, list):
        for item in data:
            text = extract_text_from_json(item, text)
    elif isinstance(data, (str, int, float)):
        text += str(data) + "\n"
    return text

# Function to load resumes from a .json file and detect type
def detect_from_json(file_path):
    # Detect encoding of the file
    file_encoding = detect_encoding(file_path)
    print(f"The detected encoding of the JSON file is: {file_encoding}")

    # Open the file with the detected encoding
    with open(file_path, 'r', encoding=file_encoding) as f:
        data = json.load(f)

    # Extract all text from JSON
    resume_text = extract_text_from_json(data)

    # Detect the type of resume
    result = detect_resume_type(resume_text)
    print(f"Resume Type: {result}")

# Example usage
json_file_path = "D:/Lusak.tech/Dataset/Mehar.json"  # Replace with your .json file path
detect_from_json(json_file_path)


The detected encoding of the JSON file is: utf-8
Resume Type: Human-written


In [8]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import json
import os
import chardet

# Function to detect file encoding using chardet
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_distilbert")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_distilbert")

# Function to predict resume type
def detect_resume_type(resume_text):
    inputs = tokenizer(resume_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()

    # Map prediction to label
    labels = {0: "Human-written", 1: "AI-generated", 2: "Faulty"}
    return labels.get(prediction, "Unknown")

# Recursive function to extract text from JSON
def extract_text_from_json(data, text=""):
    if isinstance(data, dict):
        for key, value in data.items():
            text = extract_text_from_json(value, text)
    elif isinstance(data, list):
        for item in data:
            text = extract_text_from_json(item, text)
    elif isinstance(data, (str, int, float)):
        text += str(data) + "\n"
    return text

# Function to process a single resume file
def process_resume(file_path):
    try:
        # Detect encoding of the file
        file_encoding = detect_encoding(file_path)
        print(f"Detected encoding for {file_path}: {file_encoding}")

        # Open and parse the JSON file
        with open(file_path, 'r', encoding=file_encoding) as f:
            data = json.load(f)

        # Extract all text from JSON
        resume_text = extract_text_from_json(data)

        # Detect the type of resume
        result = detect_resume_type(resume_text)
        return {"file_path": file_path, "result": result}
    except Exception as e:
        return {"file_path": file_path, "error": str(e)}

# Function to process multiple resumes in a directory
def process_resumes_in_directory(directory_path, output_file="results.json"):
    results = []

    for file_name in os.listdir(directory_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(directory_path, file_name)
            result = process_resume(file_path)
            results.append(result)

    # Save results to output file
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print(f"Results saved to {output_file}")

# Example usage
directory_path = "D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes"  # Replace with your directory containing JSON files
process_resumes_in_directory(directory_path)


Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\aaron-visser.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\abel.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\abhishek-clark.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\abraham-mokhtari.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\abram-c.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\adina-melchor.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\aditya-soni.json: ascii
Detected encoding for D:/Lusak.tech/jsonresume-fake-master/jsonresume-fake-master/resumes\adrian-si