## Read the dataset csv file

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive

/content/drive/MyDrive


In [4]:
!pip install accelerate --upgrade



In [5]:
!pip install datasets



In [6]:
import pandas as pd
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Separating positive and negative samples
positive_samples = df[df['sentiment'] == 'positive'].sample(n=7500, random_state=42)
negative_samples = df[df['sentiment'] == 'negative'].sample(n=7500, random_state=42)

# Concatenating the samples to get the final subset
subset_df = pd.concat([positive_samples, negative_samples])

# Shuffling the subset if needed
subset_df = subset_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Displaying the first few rows of the subset
print(subset_df.head())

                                              review sentiment
0  As a kid, I never understood WHY anyone would ...  negative
1  ......... and you get Chori Chori Chupke Chupk...  positive
2  There were many 'spooky' westerns made in the ...  negative
3  I enjoyed this movie, granted it is mainly bec...  positive
4  This was an interesting movie. I could have do...  positive


In [8]:


# retrieving the total number of samples
total_samples = subset_df.shape[0]

# Counting the number of positive and negative samples
class_counts = subset_df['sentiment'].value_counts()

# Displaying the results
print(f'Total number of samples: {total_samples}')
print(f'Number of positive samples: {class_counts["positive"]}')
print(f'Number of negative samples: {class_counts["negative"]}')

Total number of samples: 15000
Number of positive samples: 7500
Number of negative samples: 7500


## Process the data

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [10]:
def process_data(row):

    text = row['review']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    label = 0
    if row['sentiment'] == 'positive':
        label += 1

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [11]:
print(process_data({
    'review': 'this is a sample review of a movie.',
    'sentiment': 'positive'
}))

{'input_ids': [101, 2023, 2003, 1037, 7099, 3319, 1997, 1037, 3185, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 1, 'text': 'this is a sample review of a movie.'}


In [12]:
processed_data = []

for i in range(len(subset_df)):
    processed_data.append(process_data(subset_df.iloc[i]))

## Generate the dataset

In [13]:
from sklearn.model_selection import train_test_split

new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=2022
)

In [14]:
new_df.shape[0]

15000

In [15]:
import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

In [17]:
import time
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['text']
        encoding = self.tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")
        return encoding['input_ids'].squeeze(0)  


model_path = "./best_model_e_3"
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(model_path)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Creating a custom dataset
custom_dataset = CustomDataset(new_df, tokenizer)

# Setting the model to evaluation mode
model.eval()

# Creating a DataLoader for batching
batch_size = 8
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False)

# Measuring the prediction time
start_time = time.time()

# Making predictions
all_predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Predicting"):
        batch = batch.to(device)

        padded_sequence = pad_sequence(batch, batch_first=True)
        outputs = model(padded_sequence.to(device))
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())

# Calculating the total time taken
total_time = time.time() - start_time

# Printing the total time taken and the speed
print(f"Total time taken for predictions: {total_time:.2f} seconds")
print(f"Prediction speed: {len(df) / total_time:.2f} samples per second")

Predicting:   0%|          | 0/1875 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Predicting: 100%|██████████| 1875/1875 [04:28<00:00,  6.98it/s]

Total time taken for predictions: 268.82 seconds
Prediction speed: 186.00 samples per second





## Evaluating the performance of a saved model

In [21]:
#Trying DataParallel for parallel processing on a single GPU
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import time
from torch.nn.parallel import DataParallel


model.eval()

model_parallel = DataParallel(model)


batch_size = 8
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False)

# Measuring the prediction time for the original model with parallel processing
start_time = time.time()

# Making predictions
all_predictions_original = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Predicting (Original with Parallelism)"):
        padded_sequence = pad_sequence(batch, batch_first=True)
        outputs = model_parallel(padded_sequence)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        all_predictions_original.extend(predictions.cpu().numpy())

# Calculating the total time taken for the original model with parallel processing
total_time_with_parallelism = time.time() - start_time

# Print the total time taken and the speed
print(f"Total time taken for predictions (Original with Parallelism): {total_time_with_parallelism:.2f} seconds")
print(f"Prediction speed (Original with Parallelism): {len(all_predictions_original) / total_time_with_parallelism:.2f} samples per second")


Predicting (Original with Parallelism): 100%|██████████| 1875/1875 [04:31<00:00,  6.91it/s]

Total time taken for predictions (Original with Parallelism): 271.24 seconds
Prediction speed (Original with Parallelism): 55.30 samples per second





In [22]:
speedup_factor = total_time / total_time_with_parallelism
print(f"Speedup factor: {speedup_factor:.2f}")

Speedup factor: 0.98


# While Trying DataParallel there is no improvement in the prediction speed so trying automatic mixed precision. I tried this approach as we cannot do model.half() with distilbert

In [17]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from torch.cuda.amp import autocast
import time
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encoded_texts = [self.tokenizer(text,
                                             add_special_tokens=True,
                                             max_length=self.max_length,
                                             padding='max_length',
                                             truncation=True,
                                             return_tensors="pt").input_ids.squeeze(0)
                              for text in dataframe['text']]

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        return self.encoded_texts[idx]


model_path = "./best_model_e_3"
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Moving model to GPU and enable mixed precision
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Defining the maximum sequence length
MAX_LENGTH = 512  # Adjust as needed

# Creating a custom dataset
custom_dataset = CustomDataset(new_df, tokenizer, MAX_LENGTH)  # Correct instantiation

# Setting the model to evaluation mode
model.eval()

# Creating a DataLoader for batching
batch_size = 32

dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Measuring the prediction time
start_time = time.time()



# Making predictions
all_predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Predicting"):
        batch = batch.to(device)  

        with autocast():  # Enabling automatic mixed precision
            outputs = model(batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
# Calculating the total time taken
total_time = time.time() - start_time

# Printing the total time taken and the speed
print(f"Total time taken for predictions: {total_time:.2f} seconds")
print(f"Prediction speed: {len(new_df) / total_time:.2f} samples per second")

Predicting:   0%|          | 0/469 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Predicting: 100%|██████████| 469/469 [01:21<00:00,  5.74it/s]

Total time taken for predictions: 81.76 seconds
Prediction speed: 183.46 samples per second





In [19]:
# Using the value 268.82(time taken by the original model for prediction) to compare with modle with mixed precision enabled.
speedup_factor = 268.82 / total_time
print(f"Speedup factor: {speedup_factor:.2f}")

Speedup factor: 3.29


The spped up factor is significant. it improved the model speed by 3 times.

Utilizing mixed precision in my model not only conserves memory but also leverages the computational efficiency of lower-precision arithmetic, resulting in significantly accelerated prediction speeds.