In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
i = 0
# Load BERT tokenizer and model (PyTorch version)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define the function to get sentence embeddings
def get_sentence_embeddings_batch(sentences):
    try:
        # Tokenize and encode the batch of sentences
        inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')
        
        # Get the BERT model outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Compute the mean of token embeddings for each sentence
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        
        return embeddings
    
    except Exception as e:
        print(f"Error processing sentences: {sentences}\nException: {e}")
        return None

In [3]:
df = pd.read_csv("C:/Users/saiha/Downloads/all_kindle_review .csv")

In [4]:
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400


In [5]:
df=df[["rating","reviewText"]]

In [6]:
df['rating'] = df['rating'].apply(lambda x: 0 if x <= 3 else 1)

In [7]:
df["reviewText"] = df["reviewText"].astype(str)

In [11]:
def process_in_batches(df, batch_size):
    embeddings = []
    total_rows = len(df)
    processed_rows = 0
    
    for i in range(0, total_rows, batch_size):
        batch = df['reviewText'][i:i + batch_size].tolist()
        batch_embeddings = get_sentence_embeddings_batch(batch)
        
        if batch_embeddings is not None:
            embeddings.extend(batch_embeddings)
            processed_rows += len(batch)
            print(f"Processed rows: {processed_rows}/{total_rows}")
    
    print(f"Total rows processed: {processed_rows}")
    return embeddings

batch_size = 10

# Apply batch processing
df['embedding'] = process_in_batches(df, batch_size)

Processed rows: 10/12000
Processed rows: 20/12000
Processed rows: 30/12000
Processed rows: 40/12000
Processed rows: 50/12000
Processed rows: 60/12000
Processed rows: 70/12000
Processed rows: 80/12000
Processed rows: 90/12000
Processed rows: 100/12000
Processed rows: 110/12000
Processed rows: 120/12000
Processed rows: 130/12000
Processed rows: 140/12000
Processed rows: 150/12000
Processed rows: 160/12000
Processed rows: 170/12000
Processed rows: 180/12000
Processed rows: 190/12000
Processed rows: 200/12000
Processed rows: 210/12000
Processed rows: 220/12000
Processed rows: 230/12000
Processed rows: 240/12000
Processed rows: 250/12000
Processed rows: 260/12000
Processed rows: 270/12000
Processed rows: 280/12000
Processed rows: 290/12000
Processed rows: 300/12000
Processed rows: 310/12000
Processed rows: 320/12000
Processed rows: 330/12000
Processed rows: 340/12000
Processed rows: 350/12000
Processed rows: 360/12000
Processed rows: 370/12000
Processed rows: 380/12000
Processed rows: 390/1

In [12]:
df

Unnamed: 0,rating,reviewText,embedding
0,0,"Jace Rankin may be short, but he's nothing to ...","[-0.26751292, -0.073397286, 0.20872085, 0.0364..."
1,1,Great short read. I didn't want to put it dow...,"[0.019671557, -0.26345688, 0.25776228, 0.15100..."
2,0,I'll start by saying this is the first of four...,"[0.0180494, -0.3501674, 0.4953711, 0.047568485..."
3,0,Aggie is Angela Lansbury who carries pocketboo...,"[-0.13682088, -0.1545928, 0.29191345, 0.115250..."
4,1,I did not expect this type of book to be in li...,"[-0.07526857, -0.22887912, 0.35532576, 0.31291..."
...,...,...,...
11995,1,Valentine cupid is a vampire- Jena and Ian ano...,"[-0.24591917, -0.25305796, 0.32048354, 0.06601..."
11996,1,I have read all seven books in this series. Ap...,"[-0.054832324, -0.29002014, 0.3199352, -0.0511..."
11997,0,This book really just wasn't my cuppa. The si...,"[-0.00044096378, -0.08219343, 0.22778961, 0.03..."
11998,0,"tried to use it to charge my kindle, it didn't...","[0.015902918, 0.09698219, 0.1808137, 0.0179025..."


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

X = np.vstack(df['embedding'].values)
y = df['rating'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8483333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1190
           1       0.85      0.85      0.85      1210

    accuracy                           0.85      2400
   macro avg       0.85      0.85      0.85      2400
weighted avg       0.85      0.85      0.85      2400



In [14]:
import pickle

model_path = 'logistic_regression_model.pkl'

with open(model_path, 'wb') as file:
    pickle.dump(clf, file)

print(f"Model saved to {model_path}")


Model saved to logistic_regression_model.pkl


In [15]:
def get_sentence_embeddings(sentence):
        inputs = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        
        return embeddings

array([0], dtype=int64)