In [1]:
# general imports
import os
import re
import time
import numpy as np
import pandas as pd
import string
from tqdm.notebook import tqdm

# pytorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import TensorDataset
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification, AdamW

# transformer imports
import datasets
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
from transformers import set_seed
from transformers import BertTokenizer
from transformers import BertModel

# scikit learn imports
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

2024-04-21 16:05:17.276176: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 16:05:17.276298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 16:05:17.399953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
data=pd.read_csv("../input/physics-vs-chemistry-vs-biology/dataset/train.csv")
data

Unnamed: 0,Id,Comment,Topic
0,0x840,A few things. You might have negative- frequen...,Biology
1,0xbf0,Is it so hard to believe that there exist part...,Physics
2,0x1dfc,There are bees,Biology
3,0xc7e,I'm a medication technician. And that's alot o...,Biology
4,0xbba,Cesium is such a pretty metal.,Chemistry
...,...,...,...
8690,0x1e02,I make similar observations over the last week...,Biology
8691,0xc8d,You would know.,Biology
8692,0x723,Also use the correct number of sig figs,Chemistry
8693,0x667,"What about the ethical delimmas, groundbreaki...",Biology


In [3]:
data.drop('Id',axis=1 , inplace=True)

In [4]:
print(data.shape)

(8695, 2)


In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8695 entries, 0 to 8694
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  8695 non-null   object
 1   Topic    8695 non-null   object
dtypes: object(2)
memory usage: 136.0+ KB
None


In [6]:
print(data.isnull().sum())

Comment    0
Topic      0
dtype: int64


In [7]:
data.head()

Unnamed: 0,Comment,Topic
0,A few things. You might have negative- frequen...,Biology
1,Is it so hard to believe that there exist part...,Physics
2,There are bees,Biology
3,I'm a medication technician. And that's alot o...,Biology
4,Cesium is such a pretty metal.,Chemistry


In [8]:
X=data['Comment']
y=data['Topic']

In [9]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y

array([0, 2, 0, ..., 1, 0, 0])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model_name = 'bert-base-uncased'  # Change this to the desired BERT model
tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
tokenized_texts = [tokenizer.tokenize(text) for text in X_train]

In [13]:
tokenized_texts_y = [tokenizer.tokenize(text) for text in X_test]

In [14]:
max_seq_length = 128

In [15]:
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
max_length = max(len(seq) for seq in input_ids)
input_ids = pad_sequence([torch.tensor(seq) for seq in input_ids], batch_first=True, padding_value=tokenizer.pad_token_id)
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

In [16]:
# Truncate or pad input sequences to the maximum sequence length
input_ids = [ids[:max_seq_length].tolist() + [0] * (max_seq_length - len(ids)) for ids in input_ids]
attention_masks = [[1] * min(len(ids), max_seq_length) + [0] * (max_seq_length - len(ids)) for ids in input_ids]


In [17]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(y_train)

In [18]:
num_labels = len(set(y_train))  # Number of unique labels in y_train
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)



In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [21]:
min_length = min(len(input_ids), len(attention_masks), len(labels))
input_ids = input_ids[:min_length]
attention_masks = attention_masks[:min_length]
labels = labels[:min_length]


In [22]:
# Assuming you have already created `input_ids`, `attention_masks`, and `labels` tensors

batch_size = 32  # You can adjust the batch size according to your memory constraints
train_data = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


In [23]:
num_epochs = 15
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}")

Epoch 1/15, Loss: 0.9421135310186158
Epoch 2/15, Loss: 0.6099897815820274
Epoch 3/15, Loss: 0.454652012351456
Epoch 4/15, Loss: 0.31555411740795736
Epoch 5/15, Loss: 0.2025928111627288
Epoch 6/15, Loss: 0.14395506642417077
Epoch 7/15, Loss: 0.1283498858528878
Epoch 8/15, Loss: 0.08130837826553834
Epoch 9/15, Loss: 0.07514573625126562
Epoch 10/15, Loss: 0.07354102511206694
Epoch 11/15, Loss: 0.07152358167420286
Epoch 12/15, Loss: 0.0586601057957106
Epoch 13/15, Loss: 0.05193502999230185
Epoch 14/15, Loss: 0.057404308805750565
Epoch 15/15, Loss: 0.05036164773200903


In [24]:
max_seq_length = 128  # Maximum sequence length supported by the model

In [25]:
input_ids_test = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts_y]
max_length = max(len(seq) for seq in input_ids_test)
input_ids_test = pad_sequence([torch.tensor(seq) for seq in input_ids_test], batch_first=True, padding_value=tokenizer.pad_token_id)
attention_masks_test = [[float(i > 0) for i in seq] for seq in input_ids_test]

In [26]:
input_ids_test = [ids[:max_seq_length].tolist() + [0] * (max_seq_length - len(ids)) for ids in input_ids_test]
attention_masks_test = [[1] * min(len(ids), max_seq_length) + [0] * (max_seq_length - len(ids)) for ids in input_ids_test]


In [27]:
input_ids_test = torch.tensor(input_ids_test)
attention_masks_test = torch.tensor(attention_masks_test)
labels_test = torch.tensor(y_test)

In [28]:
min_length = min(len(input_ids_test), len(attention_masks_test), len(labels_test))
input_ids_test = input_ids_test[:min_length]
attention_masks_test = attention_masks_test[:min_length]
labels_test = labels_test[:min_length]


In [29]:
from torch.utils.data import SequentialSampler
batch_size=32
test_data = TensorDataset(input_ids_test, attention_masks_test,labels_test)
test_sampler = SequentialSampler(test_data)  # Use SequentialSampler for evaluation
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [30]:
model.eval()

# Initialize lists to store predictions and true labels
predictions = []
true_labels = []

In [31]:
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_masks, labels = batch  # Unpack input_ids, attention_masks, and labels
    with torch.no_grad():  # Disable gradient calculation during inference
        outputs = model(input_ids, attention_mask=attention_masks)
    logits = outputs.logits
    predictions.extend(torch.argmax(logits, dim=1).tolist())
    true_labels.extend(labels.tolist())
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')


In [32]:
# Print performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.7745830937320299
Precision: 0.7754040671178606
Recall: 0.7745830937320299


In [33]:
torch.save(model.state_dict(), 'bert_model.pth')