In [1]:
import torch
from transformers import BertModel, BertTokenizer
import json
import pandas as pd
import gzip
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
import json
import pandas as pd

# Step 1: Load Data from JSON File
def load_json_lines(path):
    data = []
    with open(path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

def get_df_from_json(data):
    return pd.DataFrame(data)

# Correct the path to your JSON file
file_path = "/kaggle/input/all-beauty-dataset/All_Beauty_5.json"  

# Load the JSON data
json_data = load_json_lines(file_path)

# Convert JSON data to DataFrame
df = get_df_from_json(json_data)

# Print the first few rows to inspect the data
print(df.head())

# Check if 'reviewText' column exists
if 'reviewText' in df.columns:
    # Filter the DataFrame to keep rows where 'reviewText' is a string
    df = df[df["reviewText"].apply(lambda x: isinstance(x, str))]
else:
    print("Column 'reviewText' does not exist in the DataFrame")

# Print the shape of the DataFrame after filtering
print(df.shape)

# Print the first few rows to inspect the filtered data
print(df.head())


   overall  verified   reviewTime      reviewerID        asin  \
0      5.0      True   09 1, 2016  A3CIUOJXQ5VDQ2  B0000530HU   
1      5.0      True  11 14, 2013  A3H7T87S984REU  B0000530HU   
2      1.0      True  08 18, 2013  A3J034YH7UG4KT  B0000530HU   
3      5.0     False   05 3, 2011  A2UEO5XR3598GI  B0000530HU   
4      5.0      True   05 6, 2011  A3SFRT223XXWF7  B00006L9LC   

                                               style     reviewerName  \
0  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...         Shelly F   
1  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...     houserules18   
2  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...             Adam   
3  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...           Rich K   
4                          {'Size:': ' 200ml/6.7oz'}  C. C. Christian   

                                          reviewText  \
0                   As advertised. Reasonably priced   
1  Like the oder and the feel when I put it on my...   
2 

In [20]:
input_texts = df["reviewText"].tolist()
labels = df["overall"].tolist()

In [21]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
encoded_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt')

In [22]:
labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)
dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'], labels)

train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])


In [23]:
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
num_epochs = 5
for epoch in range(num_epochs):
    # Training
    bert_model.train()
    total_train_loss = 0.0
    num_train_batches = 0
    for batch in train_loader:
        input_ids, attention_mask, label = [t.to(device) for t in batch]
        optimizer.zero_grad()
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        num_train_batches += 1
    avg_train_loss = total_train_loss / num_train_batches

    # Evaluation
    bert_model.eval()
    total_eval_loss = 0.0
    num_eval_batches = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids, attention_mask, label = [t.to(device) for t in batch]
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
            total_eval_loss += outputs.loss.item()
            num_eval_batches += 1
    avg_eval_loss = total_eval_loss / num_eval_batches

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Evaluation Loss: {avg_eval_loss:.4f}")

Epoch [1/5], Training Loss: 0.0368, Evaluation Loss: 0.0836
Epoch [2/5], Training Loss: 0.0309, Evaluation Loss: 0.0835
Epoch [3/5], Training Loss: 0.0238, Evaluation Loss: 0.0769
Epoch [4/5], Training Loss: 0.0221, Evaluation Loss: 0.0723
Epoch [5/5], Training Loss: 0.0189, Evaluation Loss: 0.0842


In [37]:
review_text = "I love it "

tokenized_review = tokenizer(review_text, padding=True, truncation=True, return_tensors='pt')

input_ids = tokenized_review['input_ids'].to(device)
attention_mask = tokenized_review['attention_mask'].to(device)

with torch.no_grad():
    bert_model.eval()
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_rating = outputs.logits.item()

print(f"Predicted rating: {predicted_rating:.2f}")

Predicted rating: 5.20
