In [14]:
# %pip install transformers

In [15]:
# import necessary
import os
import tarfile
import urllib.request
import hashlib

import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [16]:
# Report the CUDA version string exposed by the installed PyTorch build
print(torch.version.cuda)

11.8


In [17]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [18]:
def download_extract_imdb(base_dir='./data'):
    """
    Download and extract the IMDB dataset

    The archive is fetched only when no local copy with the expected SHA-1
    exists, and it is unpacked only when the ``aclImdb`` directory is missing.

    Parameters:
    base_dir (str): Base directory to store the dataset

    Returns:
    str: Path to the extracted dataset

    Raises:
    ValueError: If the freshly downloaded archive does not match the expected SHA-1
    Exception: If an archive member would be written outside ``base_dir``
    """
    def file_sha1(path, chunk_size=1 << 20):
        # Hash in chunks so the archive is never held in memory in one piece,
        # and close the file handle deterministically.
        digest = hashlib.sha1()
        with open(path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()

    def is_within_directory(directory, target):
        # Compare whole path components: a plain character prefix would accept
        # a sibling such as "data_evil" when the directory is "data".
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # commonpath refuses to compare e.g. paths on different drives;
            # such a target is certainly not inside the directory.
            return False

    def safe_extract(tar, path):
        # Validate every member before anything is written to disk
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not is_within_directory(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")

        # Use tarfile's own "data" extraction filter when this Python provides it
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path, filter='data')
        else:
            tar.extractall(path)

    # Create data directory if it doesn't exist
    os.makedirs(base_dir, exist_ok=True)

    # Dataset URL and expected SHA-1 hash
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    filename = os.path.join(base_dir, "aclImdb_v1.tar.gz")
    expected_sha1 = '01ada507287d82875905620988597833ad4e0903'

    # Download the file if it doesn't exist or has wrong hash
    if not os.path.exists(filename) or file_sha1(filename) != expected_sha1:
        print(f"Downloading IMDB dataset from {url}...")
        urllib.request.urlretrieve(url, filename)

        # Verify downloaded file
        sha1 = file_sha1(filename)
        if sha1 != expected_sha1:
            raise ValueError(f"Downloaded file has incorrect SHA-1 hash. Expected {expected_sha1}, got {sha1}")

    # Extract the dataset if not already extracted
    extract_path = os.path.join(base_dir, 'aclImdb')
    if not os.path.exists(extract_path):
        print("Extracting dataset...")
        with tarfile.open(filename, 'r:gz') as tar:
            safe_extract(tar, base_dir)
        print("Extraction complete!")

    return extract_path

if __name__ == "__main__":
    try:
        data_dir = download_extract_imdb()
        print(f"\nDataset downloaded and extracted to: {data_dir}")

        # Walk the extracted tree and print an indented outline of it
        print("\nDataset structure:")
        for current, _subdirs, filenames in os.walk(data_dir):
            depth = current.replace(data_dir, '').count(os.sep)
            print(f"{' ' * 4 * depth}{os.path.basename(current)}/")
            if depth >= 2:
                # Files are listed for the top two levels only
                continue
            file_pad = ' ' * 4 * (depth + 1)
            for name in filenames[:5]:  # at most five files per directory
                print(f"{file_pad}{name}")
            if len(filenames) > 5:
                print(f"{file_pad}...")
    except Exception as exc:
        print(f"Error: {str(exc)}")


Dataset downloaded and extracted to: ./data\aclImdb

Dataset structure:
aclImdb/
    imdb.vocab
    imdbEr.txt
    README
    test/
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
        neg/
        pos/
    train/
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        urls_pos.txt
        urls_unsup.txt
        neg/
        pos/
        unsup/


In [46]:
def load_data_from_directory(base_dir):
    """
    Load labelled reviews from a directory with ``pos`` and ``neg`` subfolders.

    Parameters:
    base_dir (str): Directory containing the ``pos`` and ``neg`` subdirectories

    Returns:
    pandas.DataFrame: Columns ``text`` (stripped file contents) and ``label``
        (1 for ``pos``, 0 for ``neg``). Positive rows come first; within each
        group rows are ordered by file name.
    """
    data = []
    for label, sentiment in [("pos", 1), ("neg", 0)]:  # assign 1 to positive, 0 to negative
        dir_path = os.path.join(base_dir, label)
        # os.listdir returns entries in arbitrary order; sort for a reproducible row order
        for file in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, file)
            if not os.path.isfile(file_path):
                continue  # skip stray subdirectories instead of failing in open()
            with open(file_path, encoding='utf-8') as f:
                data.append({"text": f.read().strip(), "label": sentiment})
    # Explicit columns keep the schema stable even when no files were found
    return pd.DataFrame(data, columns=["text", "label"])

# The labelled splits live in sibling folders of the extracted archive
train_dir, test_dir = "./data/aclImdb/train", "./data/aclImdb/test"

# Read the train split first, then the test split
train_data, test_data = (
    load_data_from_directory(split_dir) for split_dir in (train_dir, test_dir)
)

print(train_data.head())

                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1


In [47]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig
from transformers import TrainingArguments

class CNNRoBERTa(nn.Module):
    """
    Sequence classifier: RoBERTa token states -> two Conv1d stages -> global
    average pooling -> small MLP head.

    Parameters:
    num_labels (int): Number of output classes
    dropout_rate (float): Dropout probability used after each CNN stage and
        inside the classification head
    """

    def __init__(self, num_labels=2, dropout_rate=0.2):
        super().__init__()

        # Load RoBERTa
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.hidden_size = self.roberta.config.hidden_size

        # CNN layers
        self.conv1 = nn.Conv1d(self.hidden_size, 256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(256, 128, kernel_size=3, padding=1)

        # Pooling
        self.max_pool = nn.MaxPool1d(kernel_size=2, stride=2)

        # Activation
        self.relu = nn.ReLU()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

        # Global Average Pooling
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # Final classification layers
        self.classifier = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, num_labels)
        )

        # Loss function
        self.loss_fn = nn.CrossEntropyLoss()

    def _downsample(self, x):
        """Max-pool the last axis, leaving x unchanged when it is shorter than the pooling kernel."""
        # Pooling a too-short axis would produce an empty output and raise, so
        # very short inputs (a handful of tokens) are passed through as-is.
        if x.size(-1) < self.max_pool.kernel_size:
            return x
        return self.max_pool(x)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Run the encoder, the CNN stages and the classification head.

        Parameters:
        input_ids: Token ids, passed to RoBERTa
        attention_mask: Optional mask, passed to RoBERTa only (the CNN and
            pooling stages operate on every position)
        labels: Optional class indices; when given, a cross-entropy loss is computed

        Returns:
        dict: ``{"loss", "logits"}`` when ``labels`` is provided, otherwise ``{"logits"}``
        """
        # Get RoBERTa embeddings
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)

        # Prepare for CNN (transpose to get channels first)
        x = hidden_states.transpose(1, 2)  # Shape: (batch_size, hidden_size, seq_length)

        # Apply CNN layers: conv -> ReLU -> (guarded) max-pool -> dropout, twice
        for conv in (self.conv1, self.conv2):
            x = self.relu(conv(x))
            x = self._downsample(x)
            x = self.dropout(x)

        # Global average pooling
        x = self.global_avg_pool(x)
        x = x.squeeze(-1)  # Remove the last dimension

        # Classification
        logits = self.classifier(x)

        # Calculate loss if labels provided
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}

        return {"logits": logits}

# Function to create and initialize the model
def create_cnn_roberta_model(num_labels=2, device='cuda'):
    """
    Build a CNNRoBERTa classifier and place it on the requested device.

    Parameters:
    num_labels (int): Number of output classes
    device: Target passed to ``.to()`` (defaults to ``'cuda'``)

    Returns:
    CNNRoBERTa: The newly constructed model, already moved to ``device``
    """
    return CNNRoBERTa(num_labels=num_labels).to(device)

# Training configuration
training_args = TrainingArguments(
    # Output and logging directories
    output_dir='./cnn_roberta_results',
    logging_dir='./cnn_roberta_logs',
    # Optimisation hyper-parameters
    learning_rate=2e-5,
    weight_decay=0.01,
    # Training length and per-device batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [48]:
from transformers import RobertaTokenizer
# Tokenizer for the same 'roberta-base' checkpoint that CNNRoBERTa loads
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create the CNN-RoBERTa model (RoBERTa encoder + CNN head) on the selected device
model = create_cnn_roberta_model(num_labels=2, device=device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
def tokenize_data(data, max_length=512, padding="max_length", return_tensors="pt"):
    """
    Tokenize the ``text`` column of a DataFrame with the notebook's ``tokenizer``.

    Parameters:
    data (pandas.DataFrame): Frame with a ``text`` column
    max_length (int): Truncation length handed to the tokenizer
    padding: Padding strategy handed to the tokenizer
    return_tensors: Tensor format handed to the tokenizer; ``None`` leaves the
        tokenizer's default output format

    Returns:
    The tokenizer's output for all texts in ``data``
    """
    return tokenizer(
        data['text'].tolist(),
        padding=padding,
        truncation=True,
        max_length=max_length,
        return_tensors=return_tensors,
    )

# Both splits are encoded with identical settings: truncate at 128 tokens,
# padding=True, and no tensor conversion (SentimentDataset converts per item).
ENCODING_OPTIONS = dict(max_length=128, padding=True, return_tensors=None)

train_encodings = tokenize_data(train_data, **ENCODING_OPTIONS)
test_encodings = tokenize_data(test_data, **ENCODING_OPTIONS)


In [50]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """
    Map-style dataset pairing tokenizer encodings with their labels.

    Each item is a dict holding one tensor per encoding field plus a
    ``labels`` tensor for the same index.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [51]:
# Labels as plain Python lists, in the same row order as the encodings
train_labels, test_labels = train_data['label'].tolist(), test_data['label'].tolist()

# Pair each split's encodings with its labels
train_dataset, test_dataset = (
    SentimentDataset(train_encodings, train_labels),
    SentimentDataset(test_encodings, test_labels),
)


In [52]:
import torch
import os
from transformers import TrainingArguments, Trainer

# Check for CUDA availability
# NOTE: this rebinds `device` to a plain string; an earlier cell bound it to a torch.device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# If using CUDA, you can also print some additional info
if device == "cuda":
    print(f"CUDA Device Name: {torch.cuda.get_device_name(0)}")
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}")

# Create the model
# (a fresh CNNRoBERTa; it replaces the `model` instance built in an earlier cell)
model = create_cnn_roberta_model(num_labels=2, device=device)

# Create trainer with the new model
# The IMDB test split is passed as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# disable W&B
# NOTE(review): this runs after both `training_args` (earlier cell) and `trainer`
# have been constructed — confirm it still takes effect on a fresh kernel. The
# deprecation warning printed when `training_args` was built recommends
# `report_to` instead of this environment variable.
os.environ["WANDB_DISABLED"] = "true"

# Explicitly move model to device if you want to do operations outside the trainer
# (create_cnn_roberta_model already called .to(device), so this repeats that move)
model = model.to(device)
# Train the model
trainer.train()



Using device: cuda
CUDA Device Name: NVIDIA GeForce RTX 3060 Laptop GPU
Number of CUDA Devices: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4689 [00:00<?, ?it/s]

{'loss': 0.3947, 'grad_norm': 8.35985279083252, 'learning_rate': 1.7867349114949884e-05, 'epoch': 0.32}
{'loss': 0.3238, 'grad_norm': 6.20191764831543, 'learning_rate': 1.5734698229899766e-05, 'epoch': 0.64}
{'loss': 0.2843, 'grad_norm': 29.239761352539062, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.96}
{'loss': 0.2469, 'grad_norm': 13.697500228881836, 'learning_rate': 1.1469396459799531e-05, 'epoch': 1.28}
{'loss': 0.2286, 'grad_norm': 11.133810043334961, 'learning_rate': 9.336745574749414e-06, 'epoch': 1.6}
{'loss': 0.2199, 'grad_norm': 13.390135765075684, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.92}
{'loss': 0.1802, 'grad_norm': 42.12541580200195, 'learning_rate': 5.07144380464918e-06, 'epoch': 2.24}
{'loss': 0.1573, 'grad_norm': 2.9238812923431396, 'learning_rate': 2.9387929195990615e-06, 'epoch': 2.56}
{'loss': 0.1586, 'grad_norm': 36.58708953857422, 'learning_rate': 8.061420345489445e-07, 'epoch': 2.88}
{'train_runtime': 1166.5984, 'train_samples_per_second': 6

TrainOutput(global_step=4689, training_loss=0.2411911578594329, metrics={'train_runtime': 1166.5984, 'train_samples_per_second': 64.289, 'train_steps_per_second': 4.019, 'total_flos': 0.0, 'train_loss': 0.2411911578594329, 'epoch': 3.0})

In [32]:
# trainer.train()

In [53]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Run the trained model over the IMDB test split and take the arg-max class
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=1)

# Score the predictions against the labels stored in the test frame
y_true = test_data['label']
precision, recall, f1, _ = precision_recall_fscore_support(y_true, preds, average='binary')
accuracy = accuracy_score(y_true, preds)

for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Accuracy", accuracy),
):
    print(f"{metric_name}: {metric_value:.4f}")


  0%|          | 0/1563 [00:00<?, ?it/s]

Precision: 0.9134
Recall: 0.9066
F1 Score: 0.9100
Accuracy: 0.9103


In [54]:
import json
import os

# Single definition of the export directory used by every save step below
MODEL_DIR = './bert3_sentiment_model'

# Save predictions
output = pd.DataFrame({
    "text": test_data['text'],
    "true_label": test_data['label'],
    "predicted_label": preds
})
output.to_csv("bert_test_predictions.csv", index=False)

# Save the custom model
# Create directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the model state dict
torch.save(model.state_dict(), os.path.join(MODEL_DIR, 'model.pt'))

# Save the tokenizer
tokenizer.save_pretrained(MODEL_DIR)

# Save model configuration
# Values are read back from the live model so the file cannot drift from the
# instance whose weights were just saved: the last classifier layer's output
# width is the label count, and `model.dropout` holds the dropout probability.
model_config = {
    'num_labels': model.classifier[-1].out_features,
    'dropout_rate': model.dropout.p,
    'hidden_size': model.hidden_size
}
with open(os.path.join(MODEL_DIR, 'model_config.json'), 'w') as f:
    json.dump(model_config, f)


### Test Data

In [55]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [56]:
# Load the dataset
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
# This rebinds `test_data`, which an earlier cell assigned from the IMDB test directory.
csv_path = r'C:\Users\Parthasarathy.Harini\Downloads\NLP\data\test_data_movie.csv'
test_data = pd.read_csv(csv_path)


In [57]:
# Extract text and labels
# (relies on the CSV providing 'text' and 'label' columns)
test_texts = test_data['text'].tolist()
test_labels = test_data['label'].tolist()
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text data
# Same arguments as used when the IMDB splits were tokenized
# (truncation=True, padding=True, max_length=128).
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


In [58]:
# Convert to PyTorch tensors
# `test_labels` is rebound here from a Python list to a tensor.
test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Create a DataLoader
# Each batch holds up to 16 examples and unpacks as (input_ids, attention_mask, labels),
# following the argument order given to TensorDataset.
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)


In [60]:
# Load the trained model
from transformers import RobertaTokenizer
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
import torch

def load_cnn_distilbert_model(model_path, device='cuda'):
    """
    Rebuild a saved CNNRoBERTa model from a directory.

    Despite the function name, the model that is constructed is CNNRoBERTa.

    Parameters:
    model_path (str): Directory containing ``model_config.json`` and ``model.pt``
    device: Device the weights are mapped to and the model is moved to

    Returns:
    CNNRoBERTa: Model with the saved weights loaded, placed on ``device``
    """
    # Local import so the function does not depend on another cell having imported json
    import json

    # Load configuration
    with open(os.path.join(model_path, 'model_config.json'), 'r') as f:
        config = json.load(f)

    # Create model with same configuration
    model = CNNRoBERTa(
        num_labels=config['num_labels'],
        dropout_rate=config['dropout_rate']
    )

    # Load state dict
    # map_location lets a checkpoint saved from a GPU load on a CPU-only host;
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(
        os.path.join(model_path, 'model.pt'),
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model = model.to(device)

    return model

# Load tokenizer and model
# Both are read from the directory the save cell wrote to; `device` is whatever
# an earlier cell last bound it to.
tokenizer = RobertaTokenizer.from_pretrained('./bert3_sentiment_model')
model = load_cnn_distilbert_model('./bert3_sentiment_model', device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(os.path.join(model_path, 'model.pt')))


In [61]:
# Move the model to GPU if available
# `model.to(device)` is the cell's last expression, so the returned module is
# displayed as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CNNRoBERTa(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [62]:
# Evaluate the model: inference mode, no gradient tracking
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for inputs, masks, labels in test_loader:
        # Move the whole batch to the model's device
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        # The model returns a dict; the class scores are under "logits"
        logits = model(input_ids=inputs, attention_mask=masks)["logits"]

        # Predicted class = index of the largest logit
        preds = torch.argmax(logits, axis=1)

        # Collect results on the CPU as numpy values
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert to numpy arrays
predictions, true_labels = np.array(predictions), np.array(true_labels)

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1 = (
    scorer(true_labels, predictions, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.9415
Precision: 0.9443
Recall: 0.9383
F1 Score: 0.9413


In [63]:
import numpy as np
import pandas as pd
import os
import json

# Output directory for every prediction artifact (created on demand)
predictions_dir = 'model_predictions1'
os.makedirs(predictions_dir, exist_ok=True)

# Re-read the original test data and attach the predictions as a new column
csv_path = r'C:\Users\Parthasarathy.Harini\Downloads\NLP\data\test_data_movie.csv'
test_data = pd.read_csv(csv_path)
test_data['predicted_label'] = predictions

# Complete dataset with predictions, as CSV
full_csv_path = os.path.join(predictions_dir, 'test_data_with_predictions.csv')
test_data.to_csv(full_csv_path, index=False)

# Complete dataset with predictions, as JSON records
test_data_dict = test_data.to_dict(orient='records')
full_json_path = os.path.join(predictions_dir, 'test_data_with_predictions.json')
with open(full_json_path, 'w') as f:
    json.dump(test_data_dict, f)

# Raw arrays
for array_file, array_values in (
    ('predictions.npy', predictions),
    ('true_labels.npy', true_labels),
):
    np.save(os.path.join(predictions_dir, array_file), array_values)

# Predictions next to true labels only
results_df = pd.DataFrame({
    'predictions': predictions,
    'true_labels': true_labels
})
results_df.to_csv(os.path.join(predictions_dir, 'model_results.csv'), index=False)

# Print summary
summary_lines = [
    f"Saved {len(predictions)} predictions in directory '{predictions_dir}':",
    f"- {predictions_dir}/test_data_with_predictions.csv (Complete dataset with predictions)",
    f"- {predictions_dir}/test_data_with_predictions.json (Complete dataset in JSON format)",
    f"- {predictions_dir}/predictions.npy and true_labels.npy (NumPy arrays)",
    f"- {predictions_dir}/model_results.csv (Predictions and true labels only)",
]
for line in summary_lines:
    print(line)

Saved 40000 predictions in directory 'model_predictions1':
- model_predictions1/test_data_with_predictions.csv (Complete dataset with predictions)
- model_predictions1/test_data_with_predictions.json (Complete dataset in JSON format)
- model_predictions1/predictions.npy and true_labels.npy (NumPy arrays)
- model_predictions1/model_results.csv (Predictions and true labels only)


In [64]:
# Recompute the evaluation metrics from the collected arrays
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1 = (
    scorer(true_labels, predictions, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


Accuracy: 0.9415
Precision: 0.9443
Recall: 0.9383
F1 Score: 0.9413
