In [1]:
pip install transformers==4.51.3 datasets evaluate scikit-learn


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import transformers
print(transformers.__version__)

4.51.3


In [3]:
from transformers import Trainer
print("✅ Trainer imported successfully")

✅ Trainer imported successfully


In [4]:
from google.colab import files
uploaded = files.upload()  # upload `train.jsonl` and `val.jsonl`


Saving train.jsonl to train (1).jsonl
Saving val.jsonl to val (1).jsonl


In [5]:
import pandas as pd

train_df = pd.read_json("train.jsonl", lines=True)
val_df = pd.read_json("val.jsonl", lines=True)

print(f"✅ Train: {len(train_df)}, Validation: {len(val_df)}")
train_df.head()


✅ Train: 3024, Validation: 757


Unnamed: 0,text,label
0,Jawaharlal Nehru Technological University (JNT...,education
1,"BANK OF INDIA, Mumbai, India Business Analyst ...",experience
2,VFNL Sr. Java Developer Jul 08 Jun 11,experience
3,"Platforms: Windows, Sun Solaris, UNIX",skills
4,"CIGNA Healthcare, Sr. Business Analyst Sep’201...",experience


In [6]:
label_list = [
    "header", "summary", "experience", "education", "skills",
    "projects", "certifications", "achievements", "organizations",
    "hobbies", "miscellaneous"
]


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel

features = Features({
    "text": Value("string"),
    "label": ClassLabel(names=label_list)
})

train_ds = Dataset.from_pandas(train_df).cast(features)
val_ds = Dataset.from_pandas(val_df).cast(features)

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds
})


Casting the dataset:   0%|          | 0/3024 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/757 [00:00<?, ? examples/s]

In [9]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/3024 [00:00<?, ? examples/s]

Map:   0%|          | 0/757 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_list),
    id2label={i: l for i, l in enumerate(label_list)},
    label2id={l: i for i, l in enumerate(label_list)}
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./resume-segmenter",
    do_train=True,
    do_eval=True,
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=False  # disable to avoid eval/save strategy conflict
)


In [22]:
import evaluate
from transformers import Trainer

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"]
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

  trainer = Trainer(


In [23]:
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshashgupth[0m ([33mshashgupth-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.7625
200,0.2498
300,0.1756
400,0.2143
500,0.1346
600,0.1111
700,0.1153
800,0.0987
900,0.056
1000,0.1078


TrainOutput(global_step=1890, training_loss=0.1255736299923488, metrics={'train_runtime': 22108.8798, 'train_samples_per_second': 0.684, 'train_steps_per_second': 0.085, 'total_flos': 1001614269358080.0, 'train_loss': 0.1255736299923488, 'epoch': 5.0})

In [24]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.12727349996566772, 'eval_accuracy': 0.9801849405548216, 'eval_f1': 0.978892852826302, 'eval_runtime': 328.4062, 'eval_samples_per_second': 2.305, 'eval_steps_per_second': 0.289, 'epoch': 5.0}


In [25]:
model.save_pretrained("resume-segmenter")
tokenizer.save_pretrained("resume-segmenter")


('resume-segmenter/tokenizer_config.json',
 'resume-segmenter/special_tokens_map.json',
 'resume-segmenter/vocab.txt',
 'resume-segmenter/added_tokens.json',
 'resume-segmenter/tokenizer.json')

In [29]:
from transformers import pipeline
import json

pipe = pipeline("text-classification", model="resume-segmenter", tokenizer="resume-segmenter")

with open("hl_resume.txt", "r", encoding="utf-8") as f:
    resume_text = f.read()

lines = resume_text.splitlines()
chunks = ["\n".join(lines[i:i+5]) for i in range(0, len(lines), 5)]

section_map = {}

for chunk in chunks:
    if len(chunk.strip()) < 10:
        continue
    pred = pipe(chunk[:512])[0]
    label = pred["label"]
    section_map.setdefault(label, []).append(chunk.strip())

print(json.dumps(section_map, indent=2))

Device set to use cpu


{
  "summary": [
    "Harika Lankalapally  1008   W Mitchell St, Apt 232, Arlington, Texas 76013  +1-512-284-4533   harikalankalapally@gmail.com  Professional Summary  Graduate student in Computer Science passionate about leveraging data science to optimize user engagement and long-term  value (LTV) modeling. Experienced in Python, SQL, and machine learning techniques for predictive modeling, deep learn-  ing, and A/B testing. Proficient in cleaning large-scale datasets, designing experiments, and building interactive dashboards  in Power BI and Tableau. Strong collaborator with experience supporting data-driven decision-making in cross-functional  teams.  Education  Campbellsville University   2023 - 2025  Master of Science, Computer Science  \u2022   Coursework:   Data Visualization, Database Management, Statistical Analysis, Machine Learning  CVR College of Engineering   2019 - 2023  Bachelor of Technology, Computer Science  \u2022   Coursework:   Data Structures and Algorithms, Dat