<a href="https://colab.research.google.com/github/sowmyarshetty/NNClass/blob/main/AmazonHomeKitchenReviewsPreprocessing_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install BERTopic

Collecting BERTopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->BERTopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->BERTopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->BERTopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->BERTopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->BERTopic)
  Downloa

In [2]:
import pandas as pd
import dask.dataframe as dd
import gdown
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [4]:
# Mount Google Drive (For Colab Users)
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
amazonhkdatasetfileid = '14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o'
amazonhkdatasetfilename = 'AmazonHomeKitchenReviews.csv'

url = f"https://drive.google.com/uc?export=download&id={amazonhkdatasetfileid}"

gdown.download(url,amazonhkdatasetfilename, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o
From (redirected): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o&confirm=t&uuid=e2deb2dd-7229-47c0-9efd-c73b5f1f28ff
To: /content/AmazonHomeKitchenReviews.csv
100%|██████████| 692M/692M [00:09<00:00, 74.7MB/s]


'AmazonHomeKitchenReviews.csv'

* Read the dataset csv  into dataframes

In [6]:
df_data = pd.read_csv(amazonhkdatasetfilename)


In [7]:
#reduce the dataset to 10000 records
df_data = df_data.head(10000)



*   Analyse the datasets
*  Check total number of unique products and the review counts




In [8]:
df_renamed = df_data.rename(columns={'title_y' : 'product_title','title_x':'review_title','text':'review_text'})
df_renamed.groupby('product_title').size().sort_values(ascending=False).head(5)
print(df_renamed.columns)

Index(['Unnamed: 0', 'rating', 'review_title', 'review_text', 'images', 'asin',
       'parent_asin', 'user_id', 'timestamp', 'helpful_vote',
       'verified_purchase', 'product_title', 'description', 'price', 'Brand',
       'Material', 'Color', 'categories'],
      dtype='object')


* Pre-processing
* X = review_title,review_text
* y = rating

In [9]:
df_renamed.head(2)

Unnamed: 0.1,Unnamed: 0,rating,review_title,review_text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,product_title,description,price,Brand,Material,Color,categories
0,59,5,Adorable!,These are so sweet. I do wish the stopper part...,[],B01HBWGU80,B01DR2ACA0,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2019-07-23 04:29:16.671,0,True,"Little Bird Wine Bottle Stopper, Silicone Stop...",[],9.49,LouisChoice,Silicone,Assorted Color,"['Home & Kitchen', 'Kitchen & Dining', 'Kitche..."
1,87,5,"Stailess, healthier than coated pans","Great little stainless steel, balanced, good w...",[],B07T5CRVKQ,B08C7JYKZH,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,2020-11-02 22:09:44.073,1,True,"Fortune Candy 8-Inch Fry Pan with Lid, 3-ply S...",[],24.99,Fortune Candy,"Stainless Steel, Aluminum",Mirror Finish,"['Home & Kitchen', 'Kitchen & Dining', 'Cookwa..."


**Text Pre-processsing **

* Used a lemmatizer for review title and review text
* This improve accuracy: By grouping similar words together and it can help the model understand the meaning of text better.
* It can reduce noise: It can help remove redundant information from your text data.
* Improve efficiency: It can help reduce the size of your vocabulary and speed up your analysis.

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download stopwords and punkt if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    # Remove stop words and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(cleaned_tokens)

# Apply preprocessing to your review text
df_renamed['processed_review'] = df_renamed['review_title'].astype(str) +  df_renamed['review_text'].astype(str).apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
df_renamed['processed_review'].head(2)

Unnamed: 0,processed_review
0,Adorable!sweet wish stopper part little longer...
1,"Stailess, healthier than coated pansgreat litt..."


In [12]:
df_renamed["categories"].value_counts().sort_values(ascending=False).head(10)

Unnamed: 0_level_0,count
categories,Unnamed: 1_level_1
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Sheet & Pillowcase Sets']",1153
"['Home & Kitchen', 'Home Décor Products', 'Window Treatments', 'Curtains & Drapes', 'Panels']",560
"['Home & Kitchen', 'Bedding', 'Decorative Pillows, Inserts & Covers', 'Throw Pillow Covers']",303
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Pillowcases']",289
"['Home & Kitchen', 'Bedding', 'Blankets & Throws', 'Throws']",257
"['Home & Kitchen', 'Bath', 'Bath Rugs']",224
"['Home & Kitchen', 'Home Décor Products', 'Slipcovers', 'Sofa Slipcovers']",220
"['Home & Kitchen', 'Kitchen & Dining', 'Kitchen & Table Linens', 'Tablecloths']",203
"['Home & Kitchen', 'Kitchen & Dining', 'Dining & Entertaining', 'Glassware & Drinkware', 'Tumblers & Water Glasses']",199
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Fitted Sheets']",158




*   Categories column Encoding
*   Multi - Hot Encoding  
* Convert the categories column from string '<list>' to list before passing it to multi lable binarizer
* To do - you can use TD-IDF to extract important category words if required


In [13]:
import ast
df_encoded = df_renamed.copy()
# Because the categories column is a string <list> , we have to convert it into a list before encoding
df_encoded['categories'] = df_encoded['categories'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [14]:
print(df_encoded['categories'].head(5))

0    [Home & Kitchen, Kitchen & Dining, Kitchen Ute...
1    [Home & Kitchen, Kitchen & Dining, Cookware, P...
2    [Home & Kitchen, Kitchen & Dining, Kitchen & T...
3    [Home & Kitchen, Kitchen & Dining, Kitchen & T...
4    [Home & Kitchen, Bedding, Sheets & Pillowcases...
Name: categories, dtype: object


In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert NaN or empty categories to empty lists
df_encoded['categories'] = df_encoded['categories'].apply(lambda x: x if isinstance(x, list) else [])

# Apply MultiLabelBinarizer
mlb = MultiLabelBinarizer()
categories_encoded = mlb.fit_transform(df_encoded['categories'])

# Convert to DataFrame with category names as columns
categories_df = pd.DataFrame(categories_encoded, columns=mlb.classes_)

# Merge back with original DataFrame
df = pd.concat([df_encoded, categories_df], axis=1)
# df.drop(columns=['categories'], inplace=True)

print(df.head(2))


   Unnamed: 0  rating                          review_title  \
0          59       5                             Adorable!   
1          87       5  Stailess, healthier than coated pans   

                                         review_text images        asin  \
0  These are so sweet. I do wish the stopper part...     []  B01HBWGU80   
1  Great little stainless steel, balanced, good w...     []  B07T5CRVKQ   

  parent_asin                       user_id                timestamp  \
0  B01DR2ACA0  AGKHLEW2SOWHNMFQIJGBECAF7INQ  2019-07-23 04:29:16.671   
1  B08C7JYKZH  AEVWAM3YWN5URJVJIZZ6XPD2MKIA  2020-11-02 22:09:44.073   

   helpful_vote  ...  Wine Pourers Wine Racks & Cabinets Wine Stoppers  \
0             0  ...             0                     0             1   
1             1  ...             0                     0             0   

   Wine Stoppers & Pourers Woks & Stir-Fry Pans Wreath Hangers Wreaths  \
0                        1                    0              0       0

In [16]:
# print(categories_df.columns)  # Displays the binary-encoded category column names

print(df.info)  # Categories are now binary-encoded

<bound method DataFrame.info of       Unnamed: 0  rating                          review_title  \
0             59       5                             Adorable!   
1             87       5  Stailess, healthier than coated pans   
2             89       5               Pretty colors available   
3             90       4                         Nice material   
4             93       4                      Love the zipper!   
...          ...     ...                                   ...   
9995      210824       5                                  Nice   
9996      210827       3                          Okay towels.   
9997      210880       4                  Good mattress cover.   
9998      210882       3       Severely wrinkled after washing   
9999      210911       1                               Blanket   

                                            review_text images        asin  \
0     These are so sweet. I do wish the stopper part...     []  B01HBWGU80   
1     Great little 


* Convert category into embeddings rather than converting categories into many columns
* Instead of converting categories into many columns , we can assign a unique index to each category and use an embedding layer in the neural network

* THIS FAILED



In [17]:
df_renamed["categories"].value_counts().sort_values(ascending=False).head(10)

Unnamed: 0_level_0,count
categories,Unnamed: 1_level_1
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Sheet & Pillowcase Sets']",1153
"['Home & Kitchen', 'Home Décor Products', 'Window Treatments', 'Curtains & Drapes', 'Panels']",560
"['Home & Kitchen', 'Bedding', 'Decorative Pillows, Inserts & Covers', 'Throw Pillow Covers']",303
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Pillowcases']",289
"['Home & Kitchen', 'Bedding', 'Blankets & Throws', 'Throws']",257
"['Home & Kitchen', 'Bath', 'Bath Rugs']",224
"['Home & Kitchen', 'Home Décor Products', 'Slipcovers', 'Sofa Slipcovers']",220
"['Home & Kitchen', 'Kitchen & Dining', 'Kitchen & Table Linens', 'Tablecloths']",203
"['Home & Kitchen', 'Kitchen & Dining', 'Dining & Entertaining', 'Glassware & Drinkware', 'Tumblers & Water Glasses']",199
"['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Fitted Sheets']",158


In [None]:
# import tensorflow as tf
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# df_categories_tokenized = df_renamed.copy()

# # Flatten all categories into a single list for tokenization
# all_categories = [cat for sublist in df_categories_tokenized['categories'] for cat in sublist]

# # Fit tokenizer on all unique categories
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(all_categories)

# # Convert each row’s category list into a sequence of indexes
# df_categories_tokenized['categories_tokenized'] = df_categories_tokenized['categories'].apply(lambda x: tokenizer.texts_to_sequences(x))

#THE PADDING IS FAILING HERE

# # Set MAX_CATEGORIES to a reasonable value, e.g., 100
# MAX_CATEGORIES = 100

# # Apply padding to make all sequences the same length
# padded_categories = pad_sequences(df['categories_tokenized'], maxlen=MAX_CATEGORIES, padding='post')

# # Convert the padded sequences into a DataFrame
# df_padded = pd.DataFrame(padded_categories, columns=[f'category_{i+1}' for i in range(MAX_CATEGORIES)])

# # Check the padded output
# print(df_padded.head())  # Verify the first few rows of the padded categories




In [None]:
# # Check the first few entries
# print(df_categories_tokenized['categories_tokenized'].head())

# # Check if each entry is a list
# print(df_categories_tokenized['categories_tokenized'].apply(type).value_counts())  # Should output list for each row

# df_categories_tokenized['categories_tokenized'] = df_categories_tokenized['categories_tokenized'].apply(lambda x: x if isinstance(x, list) else [])

# # Check again
# print(df_categories_tokenized['categories_tokenized'].apply(type).value_counts())  # Should now show only list


# # Replace any null values with empty lists
# df_categories_tokenized['categories_tokenized'] = df_categories_tokenized['categories_tokenized'].apply(lambda x: x if isinstance(x, list) else [])

# # Ensure there are no empty lists if needed
# df_categories_tokenized['categories_tokenized'] = df_categories_tokenized['categories_tokenized'].apply(lambda x: x if len(x) > 0 else [])


# # Check if any lists are empty
# empty_lists = df_categories_tokenized['categories_tokenized'].apply(lambda x: len(x) == 0)
# print(f"Number of empty lists: {empty_lists.sum()}")

# # Optionally replace empty lists with a default value (e.g., empty list)
# df_categories_tokenized['categories_tokenized'] = df_categories_tokenized['categories_tokenized'].apply(lambda x: x if len(x) > 0 else [0])

# category_lengths = df_categories_tokenized['categories_tokenized'].apply(len)
# print(f"Max number of categories: {category_lengths.max()}")
# print(f"Min number of categories: {category_lengths.min()}")
# print(f"Mean number of categories: {category_lengths.mean()}")
# # Check the first few rows of 'category_sequences' to understand its structure
# print(df_categories_tokenized['categories_tokenized'].head())








*   Encode Review and Review TExt
* Use TD-IDF for encoding review and review text
* I had ensure there are no null values in review_Text



In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
df['review_title'] = df['review_title'].fillna('No Review')
df['review_text'] = df['review_text'].fillna('No Review')

# Fit and transform the review title and review text
X_title = tfidf_vectorizer.fit_transform(df['review_title'])
X_text = tfidf_vectorizer.fit_transform(df['review_text'])

# Convert to dense format (optional, depending on your model)
X_title_dense = X_title.toarray()
X_text_dense = X_text.toarray()

# Check the shape of the TF-IDF encoded title and text
print(f"Shape of review title matrix: {X_title_dense.shape}")
print(f"Shape of review text matrix: {X_text_dense.shape}")


Shape of review title matrix: (10000, 3920)
Shape of review text matrix: (10000, 5000)


If you're using a neural network model, Word Embeddings such as Word2Vec, GloVe, or the Keras Embedding layer can be used to learn better semantic representations of words.

THIS FAILED

In [None]:
# from transformers import BertTokenizer, TFBertModel

# # Load pre-trained BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = TFBertModel.from_pretrained('bert-base-uncased')

# # Tokenize the review titles and texts
# inputs_title = tokenizer(df['review_title'].tolist(), return_tensors='tf', padding=True, truncation=True)
# inputs_text = tokenizer(df['review_text'].tolist(), return_tensors='tf', padding=True, truncation=True)

# # Get BERT embeddings (last hidden state)
# output_title = model(inputs_title).last_hidden_state
# output_text = model(inputs_text).last_hidden_state

# # Use the last token embedding (or you can average all token embeddings) for the entire sequence
# X_title_bert = output_title[:, 0, :]  # [CLS] token embedding
# X_text_bert = output_text[:, 0, :]    # [CLS] token embedding


* Since transformers work with tokenized inputs,used  a pre-trained tokenizer.
* This encodes text into token IDs, attention masks, and token type IDs—which transformers can use.


In [19]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Combine review title and review text
def encode_reviews(title, text, max_length=256):
    combined_text = str(title) + str(text)  # Separate title and text
    encoded = tokenizer(combined_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
    return encoded["input_ids"], encoded["attention_mask"]


df[["input_ids","attention_mask"]] = df.apply(lambda row: encode_reviews(["review_title"],row["review_text"]),axis=1,result_type="expand")




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

* Convert to tensor format for model training (2 step , first convert to int list and then tensor)

In [22]:
import torch

df.head(5)

df["input_ids"] = df["input_ids"].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else x)
df["attention_mask"] = df["attention_mask"].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else x)



print(df["input_ids"].apply(type).value_counts())  # Check types in the column
print(type(df["input_ids"].iloc[0]), type(df["input_ids"].iloc[0][0]))  # Should be list, int


input_ids
<class 'list'>    10000
Name: count, dtype: int64
<class 'list'> <class 'list'>


* Ratings columns has value from 1 - 5
* Convert the ratings value from 0 - 4 because distibertforsequenceclassifciation expects values to be between 0 and 4 and not 1 to 5

In [23]:
df['rating'].value_counts()
# df['rating'].value_counts()
df['new_rating'] = df['rating']-1

df['new_rating'].value_counts()

Unnamed: 0_level_0,count
new_rating,Unnamed: 1_level_1
4,6963
3,1364
2,715
0,545
1,413


In [39]:


input_ids = torch.tensor(df["input_ids"].tolist(), dtype=torch.long)
attention_mask = torch.tensor(df["attention_mask"].tolist(), dtype=torch.long)
ratings = torch.tensor(df["new_rating"].values, dtype=torch.long)  # Ensure target is integer type

print(input_ids.shape, attention_mask.shape, ratings.shape)  # Should be (num_samples, sequence_length)

squeezedinput_ids = input_ids.squeeze(1)  # Now shape is [10000, 256]
squeezedattention_mask = attention_mask.squeeze(1)  # Now shape is [10000, 256]


print(squeezedinput_ids.shape, squeezedattention_mask.shape, ratings.shape)  # Should be (num_samples, sequence_length)


torch.Size([10000, 1, 256]) torch.Size([10000, 1, 256]) torch.Size([10000])
torch.Size([10000, 256]) torch.Size([10000, 256]) torch.Size([10000])


* Split the dataset


In [40]:
from sklearn.model_selection import train_test_split
import torch

# Assuming input_ids, attention_mask, and ratings are tensors
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask, train_ratings, val_ratings = train_test_split(
    squeezedinput_ids, squeezedattention_mask, ratings, test_size=0.2, random_state=42
)

# Convert to tensors
train_input_ids = torch.tensor(train_input_ids)
val_input_ids = torch.tensor(val_input_ids)
train_attention_mask = torch.tensor(train_attention_mask)
val_attention_mask = torch.tensor(val_attention_mask)
train_ratings = torch.tensor(train_ratings, dtype=torch.long)
val_ratings = torch.tensor(val_ratings, dtype=torch.long)


* Create a pytorch dataset and dataloader

In [43]:

# from torch.utils.data import Dataset

# class ReviewDataset(Dataset):
#     def __init__(self, reviews, ratings, tokenizer, max_length=256):
#         self.tokenizer = tokenizer  # Properly assign tokenizer
#         self.reviews = reviews
#         self.ratings = ratings
#         self.max_length = max_length

#         # Tokenize the reviews and encode them
#         self.encodings = self.tokenizer(
#             self.reviews,
#             padding=True,
#             truncation=True,
#             max_length=self.max_length,
#             return_tensors="pt"
#         )

#     def __len__(self):
#         return len(self.ratings)

#     def __getitem__(self, idx):
#         return {
#             "input_ids": self.encodings["input_ids"][idx],
#             "attention_mask": self.encodings["attention_mask"][idx],
#             "labels": torch.tensor(self.ratings[idx], dtype=torch.long)  # ensure labels are 0 -ratings
#         }




In [45]:
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    def __init__(self, input_ids, attention_mask, ratings):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.ratings[idx]
        }

# Create train and validation datasets
train_dataset = ReviewDataset(train_input_ids, train_attention_mask, train_ratings)
val_dataset = ReviewDataset(val_input_ids, val_attention_mask, val_ratings)


* Load a pre-trained DistitBERT model
* Use DistilBERTForSequenceClassification if treating the task as classification (5 classes: 1-5 stars).

* Use DistilBERTForSequenceRegression if treating it as regression (predict a continuous value).

In [47]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader

num_labels = 5  # Rating classes: 1, 2, 3, 4, 5

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

device = torch.device("cpu")  # Force CPU execution


# Load model for classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)

# # Example reviews and ratings
# reviews = ["Great product! Works well.", "Not bad, but could be better.", "Very poor quality."]
# ratings = [4, 2, 0]  # Example ratings (1 to 5 stars)

# # Create dataset
# dataset = ReviewDataset(reviews, ratings, tokenizer)

# # Check the dataset's first item
# print(dataset[0])


# # Create DataLoader for batch processing
# batch_size = 32
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# # Example batch
# batch = next(iter(dataloader))
# print(batch["input_ids"].shape, batch["labels"].shape)  # Expected output: (batch_size, 256) (batch_size,)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
batch_size = 32
num_epochs = 3

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



*  Set Up Optimizer and Learning Rate Scheduler . We need an optimizer to update the model’s parameters and a scheduler to adjust the learning rate over time. AdamW is the optimizer of choice when fine-tuning transformers.


In [50]:
from torch.optim import AdamW
from transformers import get_scheduler

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Scheduler for learning rate
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [51]:
from tqdm import tqdm
import torch.nn.functional as F

device = torch.device("cpu")  # Use CPU instead of GPU
model.to(device)


for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training"):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)

    # **Validation Phase**
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            val_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = correct / total

    print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")



Epoch 1 Training: 100%|██████████| 250/250 [27:34<00:00,  6.62s/it]
Validation: 100%|██████████| 63/63 [01:54<00:00,  1.82s/it]


Epoch 1: Train Loss: 0.7745, Val Loss: 0.7112, Val Accuracy: 0.7215


Epoch 2 Training: 100%|██████████| 250/250 [27:12<00:00,  6.53s/it]
Validation: 100%|██████████| 63/63 [01:54<00:00,  1.81s/it]


Epoch 2: Train Loss: 0.5770, Val Loss: 0.6757, Val Accuracy: 0.7280


Epoch 3 Training: 100%|██████████| 250/250 [26:38<00:00,  6.39s/it]
Validation: 100%|██████████| 63/63 [01:55<00:00,  1.84s/it]

Epoch 3: Train Loss: 0.4831, Val Loss: 0.6715, Val Accuracy: 0.7565





* Setup training loop  where the model will learn from our data


In [30]:
# # Check the shape of the first batch in the DataLoader
# batch = next(iter(dataloader))
# print(batch["input_ids"].shape)  # Expected: (batch_size, sequence_length)
# print(batch["attention_mask"].shape)  # Expected: (batch_size, sequence_length)


torch.Size([3, 10])
torch.Size([3, 10])


In [31]:
# print("Sample labels:", ratings[:10])
# print("Label min/max:", min(ratings), max(ratings))
# print("Label dtype:", torch.tensor(ratings).dtype)

# for batch in dataloader:
#     print("Input IDs shape:", batch["input_ids"].shape)
#     print("Attention Mask shape:", batch["attention_mask"].shape)
#     print("Labels shape:", batch["labels"].shape)
#     print("Labels dtype:", batch["labels"].dtype)
#     break



Sample labels: [4, 2, 0]
Label min/max: 0 4
Label dtype: torch.int64
Input IDs shape: torch.Size([3, 10])
Attention Mask shape: torch.Size([3, 10])
Labels shape: torch.Size([3])
Labels dtype: torch.int64


In [32]:
# import os
# os.environ["TORCH_USE_CUDA_DSA"] = "1"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [33]:
# import torch
# from tqdm import tqdm

# device = torch.device("cpu")  # Use CPU instead of GPU
# model.to(device)

# # Training loop
# for epoch in range(num_epochs):
#     model.train()  # Set model to training mode
#     loop = tqdm(dataloader, leave=True)  # Track progress during training
#     for batch in loop:
#         batch = {k: v.to(device) for k, v in batch.items()}  # Move data to device

#         # Forward pass
#         outputs = model(**batch)
#         loss = outputs.loss

#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         lr_scheduler.step()

#         # Update progress bar
#         loop.set_description(f"Epoch {epoch}")
#         loop.set_postfix(loss=loss.item())

#     print(f"Epoch {epoch} finished.")


Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s, loss=1.7]


Epoch 0 finished.


Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  3.93it/s, loss=1.64]


Epoch 1 finished.


Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  3.86it/s, loss=1.67]

Epoch 2 finished.







*   Evaluate the models performance




In [34]:
# from sklearn.metrics import accuracy_score

# model.eval()  # Set model to evaluation mode
# predictions, true_labels = [], []

# with torch.no_grad():
#     for batch in val_dataloader:  # Assuming you have a validation dataloader
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()  # Get the predicted labels
#         labels = batch["labels"].cpu().numpy()

#         predictions.extend(preds)
#         true_labels.extend(labels)

# # Calculate accuracy
# accuracy = accuracy_score(true_labels, predictions)
# print(f"Validation Accuracy: {accuracy:.4f}")


NameError: name 'val_dataloader' is not defined