In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.3 MB/s[0m eta [36m0:00:0

In [3]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load the dataset
dataset_path = '/content/drive/MyDrive/Personal data/topical_chat.csv'
df = pd.read_csv(dataset_path)

# Data Cleaning and Preprocessing
df.dropna(inplace=True)

# Encode the sentiment column (Label Encoding)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Text preprocessing: Remove stop words and normalize text
stop_words = set(stopwords.words('english'))
#normalization
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and special characters
    text = ''.join([char for char in text if char not in string.punctuation])

    # Remove extra whitespaces
    text = ' '.join(text.split())

    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

df['message'] = df['message'].apply(preprocess_text)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Display a sample of the preprocessed data
train_df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,conversation_id,message,sentiment,sentiment_encoded
120868,5535,thats funny would like see older car wooden trunk,Curious to dive deeper,1
58549,2680,would peg character voice done man remember or...,Surprised,7
70882,3244,james earl jones iconic darth vaders voice pai...,Curious to dive deeper,1
182126,8343,would definitely like,Happy,4
81391,3726,guess cleanshaven men women look young inexper...,Neutral,5
...,...,...,...,...
119879,5490,aware knew appears spherical sure uniform actu...,Curious to dive deeper,1
103694,4748,love music videos,Happy,4
131932,6042,definitely seems hed angry type haha speaking ...,Curious to dive deeper,1
146867,6725,yes companies turned largest threat iceland de...,Surprised,7


In [4]:
unique_classes = df['sentiment'].unique()
num_classes = len(unique_classes)

print("Unique classes:", unique_classes)
print("Number of classes:", num_classes)


Unique classes: [' Curious to dive deeper' ' Happy' ' Neutral' ' Surprised' ' Disgusted'
 ' Sad' ' Fearful' ' Angry']
Number of classes: 8


In [5]:
pip install transformers torch scikit-learn



In [8]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import torch.nn as nn
import pandas as pd

# Load your preprocessed data as shown in the previous example
# Define tokenizer and model
num_classes = 8
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Define optimizer and loss function
optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Convert your preprocessed data to DataLoader
batch_size = 64  # Adjust as needed
train_inputs = train_df['message'].tolist()  # Extract the message column as a list
train_labels = train_df['sentiment_encoded'].tolist()  # Assuming you have encoded sentiment labels

# Tokenize and pad the text data to a uniform length
input_ids = []
attention_masks = []

# Define a maximum sequence length for padding
max_seq_length = 128  # Adjust as needed

for text in train_inputs:
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_length, return_tensors="pt")
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])

# Create tensors for labels and inputs
input_ids = torch.cat(input_ids, dim=0)  # Concatenate the list of tensors
attention_masks = torch.cat(attention_masks, dim=0)  # Concatenate the list of tensors
train_labels = torch.tensor(train_labels)

# Create TensorDataset
train_dataset = TensorDataset(input_ids, attention_masks, train_labels)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop
num_epochs = 5  # Adjust as needed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# ...

for epoch in range(num_epochs):
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Calculate accuracy
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(predicted_labels == labels).item()
        total_predictions += len(labels)

        # Print the loss and accuracy every 100 batches
        if batch_idx % 1000 == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}/{num_batches}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}%")

    average_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_predictions * 100
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}, Accuracy: {accuracy}%")

torch.save(model.state_dict(), '/content/drive/sentiment_model.pth')



Epoch 1/5, Batch 0/2355, Loss: 2.05610728263855, Accuracy: 9.375%
Epoch 1/5, Batch 1000/2355, Loss: 1.638406753540039, Accuracy: 41.09172077922078%


In [1]:
torch.save(model.state_dict(), '/content/drive/sentiment_model.pth')

NameError: ignored