In [None]:
!pip install transformers
!pip install datasets
!pip install sacremoses
!pip install sentencepiece
!pip install importlib_metadata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from transformers import AutoConfig

# Imported here because this cell's import block does not bring AutoModel in.
from transformers import AutoModel

# Checkpoint to load and the width of the classification head.
model_name = 'bert-base-uncased'
num_labels = 5

# Load the bare encoder from the installed `transformers` package. The former
# torch.hub entry point ('huggingface/pytorch-transformers', 'model') wraps this
# same call, but first downloads and executes the repository's `main` branch
# from GitHub, which is slower and not reproducible.
model = AutoModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
last_state_dim = config.hidden_size

# Attach a freshly initialised linear head; it is untrained, so predictions
# made with it are not meaningful until the model is fine-tuned.
model.classifier = torch.nn.Linear(last_state_dim, num_labels)

# Inference mode: disables dropout inside the encoder.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to /root/.cache/torch/hub/main.zip


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    The tokenizer is invoked with padding and truncation switched on and is
    asked to return PyTorch tensors.

    Args:
        text: Input forwarded unchanged to the tokenizer.

    Returns:
        An ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [None]:
def predict_sentiment(texts):
    """Predict a sentiment class for one text or a batch of texts.

    Uses the notebook-level ``model`` (an encoder exposing
    ``last_hidden_state`` with a ``classifier`` head attached) together with
    ``preprocess``.

    Args:
        texts: Input forwarded to ``preprocess``.

    Returns:
        A ``(sentiments, probabilities)`` tuple: ``probabilities`` is the
        softmax over the classifier logits (one row per input) and
        ``sentiments`` is the arg-max class index per row, squeezed so that a
        single input yields a 0-d tensor.
    """
    input_ids, attention_mask = preprocess(texts)

    # Run on whichever device the model's weights live on, so the call keeps
    # working if the model has been moved to a GPU.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state
        # Classify from the first position ([CLS]). The last position is not a
        # stable summary: in a padded batch it is a [PAD] slot for every
        # sequence shorter than the longest one, so the prediction for a text
        # would depend on which other texts share its batch.
        cls_features = last_hidden_states[:, 0, :]
        logits = model.classifier(cls_features)
        probabilities = F.softmax(logits, dim=1)
        sentiments = torch.argmax(probabilities, dim=1).squeeze()
    return sentiments, probabilities

In [None]:
# Smoke-test the prediction helper on a single sentence and show both results.
text = "This is a great movie!"
sentiment, probabilities = predict_sentiment(text)
print(
    f"Sentiment: {sentiment}",
    f"Probabilities: {probabilities}",
    sep="\n",
)

Sentiment: 0
Probabilities: tensor([[0.2481, 0.2202, 0.1887, 0.2406, 0.1025]])


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel
from torch.utils.data import DataLoader
import datasets as datasets

class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes (width of the logits).
        pretrained_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled encoder output
            before the linear head. Defaults to ``0.2``.
    """

    def __init__(self, num_labels, pretrained_name='bert-base-uncased', dropout=0.2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Head width follows the encoder's hidden size, so any checkpoint size works.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear head applied to the dropped-out
            ``pooler_output`` of the encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        return self.classifier(self.dropout(pooled_output))

In [None]:

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the classifier and move it to the chosen device.
model = SentimentClassifier(num_labels).to(device)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Optimisation: AdamW over every model parameter, cross-entropy on the logits.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 5

# The "test" split of the dataset is used as the validation set in this notebook.
train_dataset = datasets.load_dataset("yelp_review_full", split="train")
val_dataset = datasets.load_dataset("yelp_review_full", split="test")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.




In [None]:

# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Upper bound on tokens per review. Without it, `truncation=True` only cuts at
# the tokenizer's model maximum, and batches of 32 sequences padded to that
# length exhausted GPU memory when this cell was run (OutOfMemoryError).
# Raise it if the GPU has room.
max_seq_len = 128

for epoch in range(num_epochs):
    # `from_pretrained` hands back the encoder in eval mode, so switch the
    # whole model (encoder included) to training mode to enable its dropout.
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        inputs = batch['text']
        labels = batch['label'].to(device)

        # Tokenize the raw texts into padded id / mask tensors.
        inputs = tokenizer(
            inputs,
            padding=True,
            truncation=True,
            max_length=max_seq_len,
            return_tensors='pt',
        )
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        optimizer.zero_grad()
        # The tokenizer output is already (batch, seq); squeezing dim 1 would
        # silently drop the sequence axis whenever the padded length is 1.
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f}')



OutOfMemoryError: ignored

In [None]:
def evaluate_model(model, dataloader):
    """Evaluate ``model`` on every batch of ``dataloader``.

    Relies on the notebook-level ``device``, ``criterion`` and ``preprocess``.
    The model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Iterable of batches exposing ``'text'`` and ``'label'``.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where ``avg_loss`` is
        the mean of the per-batch losses and precision/recall/F1 use weighted
        averaging across classes.
    """
    # Imported locally: nothing else in the notebook brings these metric
    # functions into scope, so the function used to fail with NameError.
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['text']
            labels = batch['label'].to(device)
            input_ids, attention_mask = preprocess(inputs)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            _, predicted_labels = torch.max(logits, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='weighted')
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    return avg_loss, accuracy, precision, recall, f1

# Fine-tune for `num_epochs` passes, validating after each one.
for epoch in range(num_epochs):
    model.train()  # evaluate_model leaves the model in eval mode
    total_loss = 0

    for batch in train_dataloader:
        inputs = batch['text']
        labels = batch['label'].to(device)
        input_ids, attention_mask = preprocess(inputs)
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)

    (
        val_loss,
        val_accuracy,
        val_precision,
        val_recall,
        val_f1,
    ) = evaluate_model(model, val_dataloader)

    # One report per epoch, followed by a blank separator line.
    print(
        f"Epoch {epoch+1}/{num_epochs}",
        f"Train Loss: {avg_loss:.4f}",
        f"Validation Loss: {val_loss:.4f}",
        f"Validation Accuracy: {val_accuracy:.4f}",
        f"Validation Precision: {val_precision:.4f}",
        f"Validation Recall: {val_recall:.4f}",
        f"Validation F1-Score: {val_f1:.4f}",
        "",
        sep="\n",
    )

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The metric functions used below are not imported anywhere else in the
# notebook; without this line the evaluation raises NameError.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Pull the raw texts and labels out of the datasets
train_texts = train_dataset['text']
train_labels = train_dataset['label']
val_texts = val_dataset['text']
val_labels = val_dataset['label']

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Convert text data to TF-IDF features; the vocabulary is fitted on the
# training texts only and then reused for the validation texts.
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

# Train logistic regression model
lr_model = LogisticRegression()
lr_model.fit(train_features, train_labels)

# Evaluate logistic regression model (weighted averages, matching evaluate_model)
lr_predictions = lr_model.predict(val_features)
lr_accuracy = accuracy_score(val_labels, lr_predictions)
lr_precision = precision_score(val_labels, lr_predictions, average='weighted')
lr_recall = recall_score(val_labels, lr_predictions, average='weighted')
lr_f1 = f1_score(val_labels, lr_predictions, average='weighted')

# Report the baseline
print("Logistic Regression Performance:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1-Score: {lr_f1:.4f}")

# Compare with the deep learning model results obtained previously.
# NOTE: val_accuracy / val_precision / val_recall / val_f1 come from the last
# epoch of the training cell above, so that cell must have been run first.
print("Deep Learning Model Performance:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1-Score: {val_f1:.4f}")