In [1]:
import numpy as np
import bz2
import re
import os

In [9]:
# Open both compressed splits as binary readers; nothing is decompressed until they are iterated.
train_file = bz2.open('dataset/7/train.ft.txt.bz2', mode='rb')
test_file = bz2.open('dataset/7/test.ft.txt.bz2', mode='rb')

In [10]:
from tqdm import tqdm

def load_extract(file):
    """Read labelled review lines from an iterable of UTF-8 encoded byte strings.

    Each line is decoded, the character at index 9 is parsed as an integer
    label and shifted down by one, and everything from index 10 onward
    (whitespace-stripped) is kept as the review text.
    Presumably the first characters are a fastText-style ``__label__N``
    prefix -- confirm against the raw file.

    Args:
        file: Iterable yielding ``bytes`` lines (e.g. an open BZ2 reader).

    Returns:
        tuple: ``(labels, texts)`` where ``labels`` is a NumPy array of ints
        and ``texts`` is a list of strings, both in input order.
    """
    labels = []
    texts = []
    progress = tqdm(file, desc="Extracting and Loading Data")
    for raw_line in progress:
        decoded = raw_line.decode('utf-8')
        # Index 9 holds the single-digit label; subtract one to make it zero-based.
        labels.append(int(decoded[9]) - 1)
        # The remainder of the line is the review itself.
        texts.append(decoded[10:].strip())
    print('Done !')
    return np.array(labels), texts

In [11]:
# Parse both splits into (labels, texts).
train_labels, train_texts = load_extract(train_file)
test_labels, test_texts = load_extract(test_file)

# Close the BZ2 readers explicitly before dropping the names, so the underlying
# file handles are released right away instead of whenever the objects are
# garbage-collected.
train_file.close()
test_file.close()
del train_file, test_file

Extracting and Loading Data: 3600000it [01:03, 57141.28it/s]


Done !


Extracting and Loading Data: 400000it [00:07, 55888.64it/s]


Done !


In [12]:
print(len(train_texts), len(test_texts))

3600000 400000


In [13]:
train_texts[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [14]:
def clean_texts(texts):
    """Reduce each text to single-space-separated ASCII letters.

    For every input string:
      1. URL-like tokens (``http(s)://...``, ``www....``, anything containing
         ``.com``) are replaced by a space.
      2. Every character that is not an ASCII letter or whitespace is
         replaced by a space (this also removes all digits).
      3. Runs of whitespace are collapsed to one space and the result is
         stripped.

    Args:
        texts: Iterable of strings.

    Returns:
        list[str]: The cleaned strings, in input order.
    """
    # Compile once: the same three patterns are applied to every text.
    url_pattern = re.compile(r"(?:https?://|www\.)\S+|\b\S*\.com\S*")
    non_alpha_pattern = re.compile(r'[^a-zA-Z\s]')
    whitespace_pattern = re.compile(r'\s+')
    # Cheap substring probes so the URL regex only runs on texts that can match it.
    url_markers = ('www.', 'http:', 'https:', '.com')

    cleaned = []
    for text in tqdm(texts, desc="Cleaning Texts"):
        # Remove links and URLs
        if any(marker in text for marker in url_markers):
            text = url_pattern.sub(" ", text)

        # Remove non-alphabetic characters (except spaces). Digits need no
        # separate normalisation step: they are non-alphabetic and are
        # blanked here regardless of their value.
        text = non_alpha_pattern.sub(' ', text)

        # Remove extra spaces and strip leading/trailing whitespace
        cleaned.append(whitespace_pattern.sub(' ', text).strip())

    return cleaned

In [15]:
print('\nCleaning Training data')
train_texts = clean_texts(train_texts)
print('\nCleaning Test data')
test_texts = clean_texts(test_texts)


Cleaning Training data


Cleaning Texts: 100%|██████████| 3600000/3600000 [01:52<00:00, 32088.16it/s]



Cleaning Test data


Cleaning Texts: 100%|██████████| 400000/400000 [00:12<00:00, 32479.27it/s]


In [16]:
train_texts[0:10]

['Stuning even for the non gamer This sound track was beautiful It paints the senery in your mind so well I would recomend it even to people who hate vid game music I have played the game Chrono Cross but out of all of the games I have ever played it has the best music It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras It would impress anyone who cares to listen',
 'The best soundtrack ever to anything I m reading a lot of reviews saying that this is the best game soundtrack and I figured that I d write a review to disagree a bit This in my opinino is Yasunori Mitsuda s ultimate masterpiece The music is timeless and I m been listening to it for years now and its beauty simply refuses to fade The price tag on this is pretty staggering I must say but if you are going to buy any cd for this much money this is the only one that I feel would be worth every penny',
 'Amazing This soundtrack is my favorite music of all time hands down The i

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training reviews for validation, with a fixed seed.
# NOTE: this rebinds train_texts/train_labels, so re-running the cell would
# split the already-reduced training set a second time.
(
    train_texts,
    valid_texts,
    train_labels,
    valid_labels,
) = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=1234,
)

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
label_mapping = {0: "negative", 1: "positive"}
num_labels = len(label_mapping)

In [18]:
print(f"Number of training samples: {len(train_texts)}")
print(f"Number of validation samples: {len(valid_texts)}")
print(f"Number of test samples: {len(test_texts)}")
print(f"Number of sentiment classes: {num_labels}")

Number of training samples: 3240000
Number of validation samples: 360000
Number of test samples: 400000
Number of sentiment classes: 2


In [5]:
# We'll use DistilBERT for its balance of performance and speed.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# Create a custom PyTorch Dataset
class AmazonReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-tokenized encodings with labels.

    ``encodings`` is a mapping from field name to an indexable of per-sample
    values; ``labels`` is an indexable of per-sample labels. Indexing returns
    a dict holding one tensor per encoding field plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# --- Optimized Tokenization for Large Datasets ---
def tokenize_and_save(texts=None, labels=None, tokenizer=None, max_length=128,
                      file_prefix=None, batch_size=1000):
    """Tokenize texts in batches and cache the result, or load an existing cache.

    The cache consists of ``encodings/{file_prefix}_encodings.pt`` and
    ``encodings/{file_prefix}_labels.pt``. When both files exist they are
    loaded and ``texts``, ``labels`` and ``tokenizer`` are ignored; otherwise
    the texts are tokenized, saved to those files, and returned.

    Args:
        texts: Sliceable sequence of strings. Required unless the cache exists.
        labels: Sequence of integer labels, one per text. Required unless
            the cache exists.
        tokenizer: Callable invoked as
            ``tokenizer(batch, truncation=True, padding="max_length",
            max_length=max_length)`` and returning a mapping with
            ``'input_ids'``, ``'attention_mask'`` and optionally
            ``'token_type_ids'``. Required unless the cache exists.
        max_length: Length every sequence is truncated/padded to.
        file_prefix: Name used to build the two cache file paths. Required.
        batch_size: Number of texts passed to the tokenizer per call.

    Returns:
        tuple: ``(encodings, labels)`` where ``encodings`` maps field name to
        a list of per-sample token lists and ``labels`` is a plain list.
        Both code paths return the same types.

    Raises:
        ValueError: If ``file_prefix`` is missing, if tokenization is needed
            but ``texts``/``labels``/``tokenizer`` were not supplied, if
            ``texts`` and ``labels`` differ in length, or if ``batch_size``
            is not positive.
    """
    if file_prefix is None:
        raise ValueError("file_prefix is required")

    encodings_path = f"encodings/{file_prefix}_encodings.pt"
    labels_path = f"encodings/{file_prefix}_labels.pt"

    if os.path.exists(encodings_path) and os.path.exists(labels_path):
        print(f"Loading pre-tokenized data from {file_prefix}...")
        encodings = torch.load(encodings_path)
        labels = torch.load(labels_path)
        # Convert tensors back to lists so the Dataset class can index them.
        encodings = {key: val.tolist() for key, val in encodings.items()}
        labels = labels.tolist()
        return encodings, labels

    if texts is None or labels is None or tokenizer is None:
        raise ValueError(
            f"No cached encodings for '{file_prefix}'; "
            "texts, labels and tokenizer must all be provided"
        )
    if len(texts) != len(labels):
        raise ValueError("texts and labels must have the same length")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print(f"Tokenizing data for {file_prefix}...")
    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []  # only filled when the tokenizer returns this field

    # Process in batches to bound the size of each tokenizer call.
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        # Pad to a fixed max_length rather than to the longest item in the
        # batch: per-batch padding can give batches different widths, and
        # the concatenated rows could then not be stacked into one tensor.
        batch_encodings = tokenizer(
            batch_texts, truncation=True, padding="max_length", max_length=max_length
        )
        all_input_ids.extend(batch_encodings['input_ids'])
        all_attention_mask.extend(batch_encodings['attention_mask'])
        if 'token_type_ids' in batch_encodings:
            all_token_type_ids.extend(batch_encodings['token_type_ids'])

    encodings = {
        'input_ids': all_input_ids,
        'attention_mask': all_attention_mask
    }
    if all_token_type_ids:  # Only add if present
        encodings['token_type_ids'] = all_token_type_ids

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(encodings_path), exist_ok=True)

    # Save tokenized data
    torch.save({key: torch.tensor(val) for key, val in encodings.items()}, encodings_path)
    labels_tensor = torch.tensor(labels)
    torch.save(labels_tensor, labels_path)
    print(f"Tokenized data saved to {file_prefix}_encodings.pt and {file_prefix}_labels.pt")
    # Return a plain list so a fresh run and a cached run yield the same types.
    return encodings, labels_tensor.tolist()

In [None]:
# train_texts = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
# valid_texts = tokenizer(valid_texts, truncation=True, padding=True, max_length=128)
# test_texts = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

In [19]:
# Tokenize and load/save datasets
train_encodings, train_labels = tokenize_and_save(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=128,
    file_prefix="train_data",
)
val_encodings, val_labels = tokenize_and_save(
    texts=valid_texts,
    labels=valid_labels,
    tokenizer=tokenizer,
    max_length=128,
    file_prefix="val_data",
)
test_encodings, test_labels = tokenize_and_save(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_length=128,
    file_prefix="test_data",
)

Loading pre-tokenized data from train_data...
Loading pre-tokenized data from val_data...
Loading pre-tokenized data from test_data...


In [20]:
# Create PyTorch Dataset objects
# Pair every split's encodings with the labels that tokenize_and_save returned
# alongside them, so encodings and labels always come from the same source
# (the validation split previously used the pre-tokenization valid_labels).
train_dataset = AmazonReviewDataset(train_encodings, train_labels)
valid_dataset = AmazonReviewDataset(val_encodings, val_labels)
test_dataset = AmazonReviewDataset(test_encodings, test_labels)

In [21]:
print("\nDataset preparation complete.")
# Fetch the first training item once and reuse it for all three printouts.
first_sample = train_dataset[0]
print(f"Example of tokenized input_ids (first training sample): {first_sample['input_ids']}")
print(f"Example of attention_mask (first training sample): {first_sample['attention_mask']}")
print(f"Example of label (first training sample): {first_sample['labels']}")


Dataset preparation complete.
Example of tokenized input_ids (first training sample): tensor([  101,  4895, 14854,  2100,  2003,  2157,  4895, 14854,  2100, 11771,
         1045,  2064,  1056,  2903,  1996,  2204,  4391,  1996,  3772,  2001,
         2919,  1045,  1049,  5373,  1037,  5470,  1997,  4748, 23144,  2638,
         3347, 26401,  2021,  1045,  2347,  1056,  7622,  2007,  2009,  2009,
        11471,  2033, 10021,  1998,  1045,  2018,  2000,  2428,  2954,  2000,
         2994,  8300,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0

In [27]:
# Load the pre-trained DistilBERT model with a classification head
# num_labels should match the number of unique sentiment categories you have.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print("\nModel loaded with a classification head.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded with a classification head.


In [30]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./models',          # directory where checkpoints are written
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # log every 100 steps
    eval_strategy="epoch",     # run evaluation at the end of every epoch
    save_strategy="best",            # save a checkpoint only when metric_for_best_model reaches a new best (not every epoch)
    save_total_limit=1,              # cap on how many checkpoints are kept in output_dir
    load_best_model_at_end=True,     # load the best model (according to eval_loss) at the end of training
    metric_for_best_model="eval_loss", # metric used to decide which checkpoint is "best"
    report_to="none"                 # Disable reporting to W&B, MLflow etc.
)

In [31]:
# This function will be passed to the Trainer for calculating metrics during evaluation.
def compute_metrics(p):
    """Turn raw evaluation outputs into a dict of classification metrics.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``, where
           ``predictions`` holds per-class scores along axis 1.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``; the last
        three are computed with ``average='weighted'``.
    """
    scores, labels = p
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    prf = precision_recall_fscore_support(labels, predicted, average='weighted')
    metrics = {'accuracy': accuracy_score(labels, predicted)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

In [32]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # the callback for computing metrics of interest
)

In [None]:
print("\nStarting model training...")
trainer.train()


Starting model training...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
print("\nEvaluating model on the test set...")
test_results = trainer.evaluate(test_dataset)
print(f"Test Set Evaluation Results: {test_results}")

In [None]:
output_model_dir = "./fine_tuned_sentiment_model"
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

In [None]:
print(f"\nFine-tuned model and tokenizer saved to: {output_model_dir}")
print("You can now load this model for inference in your Streamlit app.")

In [3]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
product = "galaxy tab S9"

In [10]:
header = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0',
            'Accept-Language': 'en-US, en;q=0.5'})

In [1]:
from scraper import AmazonProductDetails

scrapy = AmazonProductDetails()

In [9]:
product = scrapy.get_product_details('macbook air')

In [10]:
reviews = product['reviews']['review']

In [11]:
reviews

["\nGreat device even though it's a 2020 model. The screen feels absolutely premium and the overall experience is fantastic. Battery lasts longer than other laptops at this price, speed is immaculate, the design is old but looks simple and feels good. If you're searching for a laptop for just browsing, streaming and studying, this is a must buy for this price <333\n",
 "\n1. Performance:The M1 MacBook Air is incredibly fast and efficient. The M1 chip's performance is on par with or even surpasses many Intel-based MacBook Pros, making it one of the most powerful laptops in its class. Whether you're a casual user, a creative professional, or a developer, it handles most tasks with ease.2. Battery Life:One of the standout features of the M1 MacBook Air is its impressive battery life. Users report getting 12-15 hours of real-world use, which is a significant improvement over previous Intel-based MacBook Air models. You can go an entire workday or more without needing to charge it.3. Fanles

In [5]:
from sentiment_analyzer import SentimentAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
sa = SentimentAnalyzer(model_name='fine_tuned_sentiment_model')

Loaded Tokenizer and Model for model fine_tuned_sentiment_model


In [12]:
sa.predict_sentiment(reviews)

['Positive', 'Positive', 'Positive', 'Positive', 'Positive']

In [8]:
reviews

['\nUpgraded to iPhone 16 Pro and totally loving it. Super smooth performance, excellent camera, and stunning display. Battery easily lasts a day. Feels very premium in hand. Bit pricey, but worth it for the experience\n',
 '\nThis product is good genuine product no heating issue lcd quality is very good\n',
 '\nAwesome phone\n',
 '\n\n\n\n\n                    The media could not be loaded.\n                \n\n\n\nIt did some lags while using it and i don’t even play games on my phone. It is good from the battery life perspective.\n',
 '\nSuperb phone,,,\n',
 '\nThe Product is Good but Extremely Expensive\n',
 '\nI was actually hesitant to order an iPhone from Amazon because there have been instances where the product wasn’t genuine. However, I was finally relieved to see that the package was genuine, and the iPhone itself was the real deal. 😂I ordered the black model, and the color is sleek. I love it! 🖤If you’re coming from an older iPhone model like the 13 Pro or earlier, or if yo