In [42]:
import pandas as pd

# Load the dataset
file_path = "/content/reply_classification_dataset - reply_classification_dataset.csv.csv"
df = pd.read_csv(file_path)

# Display basic information and the first few rows
df.info()
display(df.head())

# Handle missing values (if any) - in this dataset, there are no missing values
df.dropna(inplace=True)

# Clean the text data by converting to lowercase
df['reply'] = df['reply'].str.lower()
df['label'] = df['label'].str.lower()

# Display basic information and the first few rows after preprocessing
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB


Unnamed: 0,reply,label
0,Can we discuss pricing??,NEUTRAL
1,"Im excited to explore this further, plz send c...",POSITIVE
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB


Unnamed: 0,reply,label
0,can we discuss pricing??,neutral
1,"im excited to explore this further, plz send c...",positive
2,we not looking for new solutions.,negative
3,could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [43]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import re
from transformers import AutoTokenizer
import pandas as pd # Import pandas as it's used in this cell

# Define a function to clean text data.
# It converts text to lowercase, removes non-alphabetic characters, and standardizes whitespace.
def clean_text(text):
    text = text.lower() # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove characters that are not letters or whitespace
    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with a single space and remove leading/trailing whitespace
    return text

# Apply the clean_text function to the 'reply' column and store the result in a new 'cleaned_reply' column.
# This check prevents errors if the column already exists from a previous run.
if 'cleaned_reply' not in df.columns:
    df['cleaned_reply'] = df['reply'].apply(clean_text)


# Convert the 'label' column from categorical strings to numerical category codes.
# This is necessary for training a classification model.
# The codes are explicitly cast to int64 to ensure compatibility with the model.
# This check prevents errors if the label column is already of the correct dtype.
if not pd.api.types.is_int64_dtype(df['label']):
    df['label'] = df['label'].astype('category').cat.codes.astype('int64')


# Convert the pandas DataFrame into a Hugging Face Dataset format.
# This is required for using the Hugging Face transformers library's data processing and training tools.
hf_dataset = Dataset.from_pandas(df)

# Load a pre-trained tokenizer suitable for the DistilBERT model.
# The tokenizer is responsible for converting text into numerical token IDs that the model can understand.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Define a function to apply the tokenizer to the cleaned text data.
# truncation=True ensures that long texts are cut off to a maximum length.
# padding='max_length' pads shorter texts to the maximum length defined by the tokenizer.
# return_tensors='pt' formats the output as PyTorch tensors.
def tokenize_function(examples):
    return tokenizer(examples['cleaned_reply'], truncation=True, padding='max_length')

# Apply the tokenization function to the entire dataset.
# The map method processes the dataset in batches for efficiency.
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)


# Split the tokenized dataset into training and evaluation sets.
# test_size=0.2 allocates 20% of the data for evaluation.
# seed=42 ensures reproducibility of the split.
train_test_split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Organize the training and evaluation datasets into a Hugging Face DatasetDict.
# This structure is convenient for managing different splits of the dataset.
hf_datasets = DatasetDict({
    'train': train_test_split_dataset['train'],
    'eval': train_test_split_dataset['test']
})

# Print the structure of the DatasetDict to verify the splits and features.
print(hf_datasets)

  if not pd.api.types.is_int64_dtype(df['label']):


Map:   0%|          | 0/2129 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['reply', 'label', 'cleaned_reply', 'input_ids', 'attention_mask'],
        num_rows: 1703
    })
    eval: Dataset({
        features: ['reply', 'label', 'cleaned_reply', 'input_ids', 'attention_mask'],
        num_rows: 426
    })
})


In [44]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained DistilBERT model for sequence classification.
# 'distilbert-base-uncased' specifies the base model architecture.
# num_labels=3 configures the model for a classification task with 3 output classes (for negative, neutral, positive sentiment).
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",  # Directory to save model checkpoints and evaluation results
    eval_strategy="epoch", # Evaluate the model at the end of each epoch
    learning_rate=2e-5, # The learning rate for the optimizer
    per_device_train_batch_size=16, # Batch size per device during training
    per_device_eval_batch_size=16, # Batch size per device during evaluation
    num_train_epochs=3, # Total number of training epochs to perform
    weight_decay=0.01, # The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW optimizer
    push_to_hub=False, # Whether or not to push the model to the Hugging Face model hub
)

In [46]:
from transformers import Trainer

# Initialize the Trainer object
trainer = Trainer(
    model=model,  # The Hugging Face model to be trained
    args=training_args,  # The training arguments defined earlier
    train_dataset=hf_datasets['train'],  # The training dataset
    eval_dataset=hf_datasets['eval']  # The evaluation dataset
)

# Start the training process
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.022746
2,No log,0.01531
3,No log,0.013581


TrainOutput(global_step=321, training_loss=0.11971216409748589, metrics={'train_runtime': 269.3565, 'train_samples_per_second': 18.967, 'train_steps_per_second': 1.192, 'total_flos': 676788009071616.0, 'train_loss': 0.11971216409748589, 'epoch': 3.0})

In [47]:
# Save the trained model
trainer.save_model("./results")
print("Model saved successfully to ./results")

Model saved successfully to ./results


In [48]:
# Evaluate the model on the evaluation dataset and print the results
eval_results = trainer.evaluate(hf_datasets['eval'])
print(eval_results)

{'eval_loss': 0.01358080841600895, 'eval_runtime': 5.8836, 'eval_samples_per_second': 72.405, 'eval_steps_per_second': 4.589, 'epoch': 3.0}


In [50]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Get predictions on the evaluation set. This includes logits, predicted labels, and true labels.
predictions = trainer.predict(hf_datasets['eval'])

# Extract the predicted labels by finding the index of the maximum logit for each prediction.
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Extract the true labels from the predictions object.
true_labels = predictions.label_ids

# Calculate accuracy by comparing true and predicted labels.
# Accuracy is the ratio of correctly predicted observations to the total observations.
# It is calculated as (True Positives + True Negatives) / (True Positives + True Negatives + False Positives + False Negatives).
accuracy = accuracy_score(true_labels, predicted_labels)

# Calculate F1 score, a measure of a model's accuracy on a dataset.
# The F1 score is the harmonic mean of precision and recall.
# Precision is the ratio of correctly predicted positive observations to the total predicted positive observations.
# Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
# F1 Score = 2 * (Precision * Recall) / (Precision + Recall).
# 'weighted' average is used because the dataset might have imbalanced classes.
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Print the calculated accuracy and F1 score.
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.9976525821596244
F1 Score: 0.9976519145640288
