## **1. Setup and Initialization**

In [None]:
# Install necessary libraries (if not already installed)
!pip install transformers evaluate scikit-learn

# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

## **2. Data Preparation**

In [None]:
# Load the train and test CSV files
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Make sure the 'text' column holds plain strings.
# Missing values have to be replaced BEFORE the cast: `astype(str)` turns NaN
# into the literal string 'nan', after which `fillna('')` has nothing to fill.
train_df['text'] = train_df['text'].fillna('').astype(str)
test_df['text'] = test_df['text'].fillna('').astype(str)

# Map categorical labels to integer ids.
# Ids follow the order in which labels first appear in the training data.
label_map = {label: idx for idx, label in enumerate(train_df['label'].unique())}
train_df['label'] = train_df['label'].map(label_map)
test_label_ids = test_df['label'].map(label_map)

# Fail fast if the test set contains a label the training set never showed:
# `.map` would silently turn it into NaN and make the label column float.
unknown_test_labels = test_df.loc[test_label_ids.isna(), 'label'].unique()
if len(unknown_test_labels) > 0:
    raise ValueError(
        f"test.csv contains labels that are missing from train.csv: {list(unknown_test_labels)}"
    )
test_df['label'] = test_label_ids

# Convert DataFrame to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


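Provide your Hugging Face access token (`HF_TOKEN`) before running the cell that pushes the model to the Hub. Do not save the token itself in the notebook.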

## **3. Tokenization**

In [None]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' entry of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry; this function is passed to
            `Dataset.map(..., batched=True)` in the next step.

    Returns:
        The tokenizer output for those texts, produced with truncation
        enabled and the 'max_length' padding strategy.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Run the tokenization function over both splits, batch by batch
train_encodings, test_encodings = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
]

# Expose only these columns, formatted as PyTorch tensors
torch_columns = ('input_ids', 'attention_mask', 'label')
for encoded_split in (train_encodings, test_encodings):
    encoded_split.set_format(type='torch', columns=list(torch_columns))


## **4. Model Initialization**

In [None]:
# Build the classifier from the pretrained checkpoint, with one output
# per entry in label_map
num_classes = len(label_map)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)


## **5. Define Training Arguments**

In [None]:
# Gather the hyperparameters in one place, then build TrainingArguments from them
training_config = dict(
    # Where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)


## **6. Training the Model**

In [None]:
# Wire the model, data and configuration into a Trainer.
# NOTE(review): the test split is passed as eval_dataset, so per-epoch
# evaluation runs on the test data; confirm that this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=test_encodings,
    train_dataset=train_encodings,
)

# Run fine-tuning; the returned training summary is displayed as the cell output
trainer.train()


## **7. Evaluation**

In [None]:
# Run evaluation on the Trainer's eval_dataset and display the resulting metrics
eval_metrics = trainer.evaluate()
eval_metrics


## **8. Save the Trained Model**

In [None]:
# Write the fine-tuned model to disk so it can be reloaded later
trainer.save_model(output_dir="./final_model")


## **9. Pushing the Model to the Hugging Face Hub**

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import torch
import os
from huggingface_hub import login, upload_file

# Location of the fine-tuned checkpoint. Override with the FINAL_MODEL_DIR
# environment variable if the model lives somewhere else.
MODEL_DIR = os.environ.get("FINAL_MODEL_DIR", '/content/drive/MyDrive/final_model')
REPO_ID = "RAKSHITHA7/bert_yc"

# Load your fine-tuned model.
# `ignore_mismatched_sizes=True` is deliberately NOT passed: with it, a
# checkpoint whose classifier shape differs from the instantiated model is
# loaded with a freshly initialised (untrained) classification head, and that
# untrained head would then be pushed. Without the flag a shape mismatch raises
# an error instead, so it can be fixed (e.g. by correcting the saved config).
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Log in to Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so it never has to
# be written into the notebook; when it is unset, `login` asks for it interactively.
login(token=os.environ.get("HF_TOKEN"))

# Push to Hugging Face Hub
model.push_to_hub(REPO_ID)
tokenizer.push_to_hub(REPO_ID)

# Optionally, you can upload a README or additional files.
# NOTE(review): the README states 6 emotion classes; confirm this matches the
# number of labels in the checkpoint being pushed.
readme = """
# Fine-tuned BERT Model for Emotion Classification

This is a fine-tuned BERT model that classifies text into 6 emotions: sadness, joy, love, anger, fear, and surprise.
"""

# Explicit encoding keeps the written file independent of the platform default
with open("README.md", "w", encoding="utf-8") as readme_file:
    readme_file.write(readme)

upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=REPO_ID
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/final_model and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([10]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
- empty or missing yaml metadata in repo card


CommitInfo(commit_url='https://huggingface.co/RAKSHITHA7/bert_yc/commit/d5ee50d675e182bc266e4a15c900f8e312e556a6', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='d5ee50d675e182bc266e4a15c900f8e312e556a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/RAKSHITHA7/bert_yc', endpoint='https://huggingface.co', repo_type='model', repo_id='RAKSHITHA7/bert_yc'), pr_revision=None, pr_num=None)