In [1]:
# installing missing libraries and updating existing ones
!pip install transformers
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.8.3 requires cubinlinker, which is not installed.
cudf 24.8.3 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.3 requires ptxcom

In [2]:
# importing all required libraries
import pandas as pd
import nltk
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, 
        TrainingArguments, Trainer, DataCollatorWithPadding)
import evaluate
import warnings

In [3]:
# parameters used in the code
data_path = "/kaggle/input/method-singlelbl-subclass-csv/method_singlelbl_subclass.csv" # path to data
text_column_name = "Combined_text" # column containing input text
label_column_name = "label" # column containing output labels
model_name = "distilbert-base-uncased" # model used for classification
test_size = 0.2 # for training testing split
num_labels = 6 # total output classes

In [4]:
# loading the data sample
def load_data(data_path):
    df = pd.read_csv(data_path)
    print("Loaded Data:")
    print(df.head())
    return df

In [5]:
# functions to perform some preprocessing and combine prompt-reply (called later)
nltk.download('punkt')
def join_and_tokenize(prompt, reply):
    prompt_tokens = nltk.word_tokenize(prompt)
    reply_tokens = nltk.word_tokenize(reply)
    combined_text = ' '.join(prompt_tokens + reply_tokens)
    return combined_text

def preprocess_data(df):
    df['Combined_text'] = df.apply(lambda row: join_and_tokenize(str(row['Prompt']), str(row['Reply'])), axis=1)
    le = preprocessing.LabelEncoder()
    df[label_column_name] = le.fit_transform(df['Subclass'].tolist())
    print("Processed Data Sample:")
    print(df.head(10))
    return df

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# train test split
def create_datasets(df, test_size=0.2):
    df_train, df_test = train_test_split(df, test_size=test_size)
    train_dataset = Dataset.from_pandas(df_train)
    test_dataset = Dataset.from_pandas(df_test)
    return train_dataset, test_dataset

In [7]:
# combining prompt and reply and doing label encoding on the output by calling previously written functions
df = load_data(data_path)
df = preprocess_data(df)
train_dataset, test_dataset = create_datasets(df, test_size)

Loaded Data:
                                              Prompt  \
0  ok  historical composite bow effective range o...   
1  you are a translator from normal english to de...   
2  which seclist would be best for scanning an ht...   
3  i am a software developer at a medium sized co...   
4  role  professional it translator tasks        ...   

                                               Reply Subclass  
0  while the effective range of a composite bow i...      get  
1  i understand  this translation style requires ...      put  
2  scanning a web server running on a tv may requ...      get  
3  sure  i d be happy to assist you with that  le...      get  
4                                  lama          ...      get  
Processed Data Sample:
                                              Prompt  \
0  ok  historical composite bow effective range o...   
1  you are a translator from normal english to de...   
2  which seclist would be best for scanning an ht...   
3  i am a software 

In [8]:
# tokenization for the transformer model
def tokenize_data(dataset, tokenizer):
    def preprocess_function(examples):
        return tokenizer(examples[text_column_name], padding=True, truncation=True)
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    return tokenized_dataset

In [9]:
# tokenization for hugging face models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenized_train = tokenize_data(train_dataset, tokenizer)
tokenized_test = tokenize_data(test_dataset, tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [10]:
# converting label field to tensors
tokenized_train = tokenized_train.map(lambda x: {"label": torch.tensor(x["label"]).long()})
tokenized_test = tokenized_test.map(lambda x: {"label": torch.tensor(x["label"]).long()})

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [11]:
# setting up evaluation metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
# training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    report_to=[]
)



In [13]:
# trainer setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [14]:
# training the model on the train data
warnings.filterwarnings('ignore')
trainer.train()
trainer.save_model('distilbert_base_uncased')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7974,0.394279,0.876528
2,0.4793,0.42042,0.877751
3,0.2612,0.308877,0.91687
4,0.233,0.350081,0.919315
5,0.1607,0.282318,0.94132
6,0.1636,0.390388,0.938875
7,0.1206,0.288429,0.949878
8,0.0851,0.279589,0.953545
9,0.0767,0.239599,0.959658
10,0.0679,0.245582,0.96088


In [15]:
# evaluating the model on test data
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
truth = test_dataset['label']
print(classification_report(truth,preds))

              precision    recall  f1-score   support

           0       0.43      0.38      0.40         8
           1       0.97      0.98      0.98        60
           2       0.98      0.97      0.98       334
           3       0.81      0.72      0.76        18
           4       0.99      0.94      0.97       144
           5       0.94      0.98      0.96       254

    accuracy                           0.96       818
   macro avg       0.85      0.83      0.84       818
weighted avg       0.96      0.96      0.96       818

