In [None]:
# Make Google Drive visible to this Colab runtime; the dataset and the model
# output directory used by later cells both live under this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
import torch
import os

# Single seed shared by the train/test split and by torch so that a
# Restart & Run All reproduces the same split and the same initial weights.
SEED = 42

# Load the labelled sentences from Drive.
file_path = "/content/drive/MyDrive/IMM/dataset.csv"
df = pd.read_csv(file_path)

# Fail early, with a readable message, if the columns this notebook reads
# ('sentence' for the text, 'types' for the class name) are not present.
missing_columns = {'sentence', 'types'} - set(df.columns)
if missing_columns:
    raise KeyError(f"dataset is missing required column(s): {sorted(missing_columns)}")

# Encode the class names in `types` as integer ids stored in `labels`.
# The fitted encoder is reused later to map predictions back to class names.
label_encoder = LabelEncoder()
df['labels'] = label_encoder.fit_transform(df['types'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['labels'],
)

# The classification head is newly (randomly) initialised when the checkpoint
# is loaded, so seed torch before creating the model.
torch.manual_seed(SEED)

# Define the BERT tokenizer and model. The number of output classes is taken
# from the fitted encoder so the head always matches the encoded labels.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of input texts (list, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing an ``encode_plus`` method that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as lists so that ``idx`` in __getitem__ is always a
        # position. Indexing a pandas Series with ``[idx]`` is label-based and
        # breaks (or returns the wrong row) when the index was not reset.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its label for position ``idx``.

        Returns:
            dict with keys ``'text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'label'`` (0-dim ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; explicit truncation guarantees a fixed
            # length and silences the "Truncation was not explicitly
            # activated" warning.
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch dimension of 1; flatten it
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Create data loaders
MAX_LEN = 64      # every sentence is padded/truncated to this many tokens
BATCH_SIZE = 8

# Reset the index once per split so that row labels are 0..n-1, which is what
# CustomDataset uses when it indexes the text and label columns.
train_frame = train_df.reset_index(drop=True)
test_frame = test_df.reset_index(drop=True)

train_dataset = CustomDataset(train_frame['sentence'], train_frame['labels'], tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_frame['sentence'], test_frame['labels'], tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define training parameters.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values the transformers optimizer
# used by default, so the update rule is unchanged.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Not used by the loop below: the model returns its own loss when `labels`
# are passed. Kept so the name stays available to other cells.
criterion = nn.CrossEntropyLoss()

# Training loop
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError if the training split is empty.
    average_loss = total_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {average_loss:.4f}')


# Save the trained model and tokenizer so a later cell can reload them.
output_dir = "/content/drive/MyDrive/Text classification bert"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


# Evaluation on the held-out split
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Highest-scoring class per example.
        predictions = torch.argmax(outputs.logits, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Decode the integer ids back to the original class names
decoded_predictions = label_encoder.inverse_transform(all_predictions)
decoded_labels = label_encoder.inverse_transform(all_labels)

# Print classification report
print(classification_report(decoded_labels, decoded_predictions))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/3, Loss: 0.0487
Epoch 2/3, Loss: 0.0035
Epoch 3/3, Loss: 0.0030
              precision    recall  f1-score   support

        math       1.00      1.00      1.00       260
    non math       1.00      1.00      1.00       192

    accuracy                           1.00       452
   macro avg       1.00      1.00      1.00       452
weighted avg       1.00      1.00      1.00       452



In [None]:
# Load the saved model and tokenizer from the directory written after training.
# NOTE: this cell relies on `output_dir`, `MAX_LEN` and `label_encoder` defined
# in the training cell; the label encoder itself is not persisted to disk.
loaded_model = BertForSequenceClassification.from_pretrained(output_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(output_dir)
loaded_model.eval()  # make inference mode explicit (disables dropout)

# Example of using the loaded model on a single sentence typed by the user
text = input("Enter input: ")
encoding = loaded_tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=MAX_LEN,
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # explicit truncation keeps long inputs within MAX_LEN and silences the
    # "Truncation was not explicitly activated" warning.
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

with torch.no_grad():
    outputs = loaded_model(input_ids, attention_mask=attention_mask)
    # Highest-scoring class for the single input.
    predictions = torch.argmax(outputs.logits, dim=1)

# Map the predicted class id back to its original class name.
decoded_prediction = label_encoder.inverse_transform(predictions.cpu().numpy())
print(f"Predicted label: {decoded_prediction[0]}")

Enter input: look there are 75 cars 25 trucks tell me how much is total now


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted label: math


