In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd


In [2]:
# Load the dataset from CSV file
df = pd.read_csv('train_data.csv')

# Check the columns
print(df.columns)

Index(['Comment', 'Sentiment'], dtype='object')


In [3]:
# Select only the 'Comment' and 'Sentiment' columns
df = df[['Comment', 'Sentiment']]

# Check the first few rows of the dataframe
print(df.head())

                                             Comment  Sentiment
0  Lets not forget that Apple Pay in 2014 require...          1
1  Here in NZ 50 of retailers dont even have cont...          0
2  I will forever acknowledge this channel with t...          2
3  Whenever I go to a place that doesnt take Appl...          0
4  Apple Pay is so convenient secure and easy to ...          2


In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd

In [5]:
# Check if MPS (Apple's GPU) is available
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load your preprocessed dataset
df = pd.read_csv('train_data.csv')

# Ensure the 'Comment' column is in string format and drop any rows with NaN values
df['Comment'] = df['Comment'].astype(str).fillna('')

# Select the required columns
df = df[['Comment', 'Sentiment']]

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Comment'].tolist(), df['Sentiment'].tolist(), test_size=0.2)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts for both training and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Convert the data into Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)  # Move model to MPS if available

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    eval_strategy="epoch",     # evaluation strategy
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    dataloader_num_workers=4,        # Enable multi-threading for data loading
)

# Define the Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
trainer.evaluate()

Using device: mps


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5505 [00:00<?, ?it/s]

{'loss': 0.9766, 'grad_norm': 4.844658374786377, 'learning_rate': 4.990917347865577e-05, 'epoch': 0.01}
{'loss': 0.7371, 'grad_norm': 7.717729091644287, 'learning_rate': 4.981834695731154e-05, 'epoch': 0.01}
{'loss': 0.8395, 'grad_norm': 6.978791236877441, 'learning_rate': 4.9727520435967305e-05, 'epoch': 0.02}
{'loss': 0.8592, 'grad_norm': 24.968318939208984, 'learning_rate': 4.963669391462307e-05, 'epoch': 0.02}
{'loss': 0.699, 'grad_norm': 8.126932144165039, 'learning_rate': 4.954586739327884e-05, 'epoch': 0.03}
{'loss': 0.7347, 'grad_norm': 18.910205841064453, 'learning_rate': 4.945504087193461e-05, 'epoch': 0.03}
{'loss': 0.8477, 'grad_norm': 8.022194862365723, 'learning_rate': 4.9364214350590375e-05, 'epoch': 0.04}
{'loss': 0.47, 'grad_norm': 15.116703987121582, 'learning_rate': 4.927338782924614e-05, 'epoch': 0.04}
{'loss': 0.6732, 'grad_norm': 5.749303340911865, 'learning_rate': 4.918256130790191e-05, 'epoch': 0.05}
{'loss': 0.5101, 'grad_norm': 6.7499918937683105, 'learning_ra

  0%|          | 0/230 [00:00<?, ?it/s]

{'eval_loss': 0.4671715795993805, 'eval_runtime': 100.0594, 'eval_samples_per_second': 36.678, 'eval_steps_per_second': 2.299, 'epoch': 1.0}
{'loss': 0.4183, 'grad_norm': 10.312958717346191, 'learning_rate': 3.3287920072661215e-05, 'epoch': 1.0}
{'loss': 0.3641, 'grad_norm': 4.581229209899902, 'learning_rate': 3.319709355131699e-05, 'epoch': 1.01}
{'loss': 0.252, 'grad_norm': 2.162550449371338, 'learning_rate': 3.310626702997276e-05, 'epoch': 1.01}
{'loss': 0.4211, 'grad_norm': 17.339492797851562, 'learning_rate': 3.301544050862852e-05, 'epoch': 1.02}
{'loss': 0.187, 'grad_norm': 15.62761402130127, 'learning_rate': 3.2924613987284285e-05, 'epoch': 1.02}
{'loss': 0.2787, 'grad_norm': 11.497817993164062, 'learning_rate': 3.283378746594006e-05, 'epoch': 1.03}
{'loss': 0.278, 'grad_norm': 10.624368667602539, 'learning_rate': 3.274296094459583e-05, 'epoch': 1.04}
{'loss': 0.3473, 'grad_norm': 23.724443435668945, 'learning_rate': 3.265213442325159e-05, 'epoch': 1.04}
{'loss': 0.1907, 'grad_n

RuntimeError: Shared memory manager connection has timed out

In [10]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

# Print the evaluation results
print(f"Evaluation results: {eval_results}")

NameError: name 'trainer' is not defined

In [8]:
# Get predictions for the validation dataset
predictions, labels, metrics = trainer.predict(val_dataset)

# Print the predicted labels and metrics
print(f"Predictions: {predictions.argmax(-1)}")
print(f"Metrics: {metrics}")

NameError: name 'trainer' is not defined

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Convert predictions and labels to lists
pred_labels = predictions.argmax(-1)
true_labels = labels

# Calculate accuracy
accuracy = accuracy_score(true_labels, pred_labels)

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.8483606557377049
Precision: 0.846326746846525
Recall: 0.8483606557377049
F1-Score: 0.8471487726138608


In [3]:
def predict_sentiment(text):
    # Move the inputs to the MPS device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Move model to the correct device
    model.to(device)
    
    # Use torch.no_grad() to disable gradient tracking
    with torch.no_grad():
        # Ensure outputs are also computed on the MPS device
        outputs = model(**inputs)

    # Move outputs back to CPU for processing and get the predicted sentiment
    prediction = torch.argmax(outputs.logits.cpu(), dim=-1).item()

    # Define the sentiment mapping
    sentiments = {0: "negative", 1: "neutral", 2: "positive"}

    # Return the predicted sentiment
    return sentiments[prediction]

# Test with a sample input
sample_text = "I am not liking this airline, it's service is slow & representatives aren't responsible"
sentiment = predict_sentiment(sample_text)
print(f"Predicted Sentiment: {sentiment}")

NameError: name 'tokenizer' is not defined

In [11]:
# Save the model and tokenizer
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

NameError: name 'model' is not defined

In [7]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

In [2]:
import numpy

In [3]:
numpy.__version__

'2.0.2'

In [6]:
pip install numpy==1.26.4

[0m[31mERROR: Could not find a version that satisfies the requirement numpy==1.26.4 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for numpy==1.26.4[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
