In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset into a DataFrame (replace 'your_dataset.csv' with your actual dataset file)
df = pd.read_csv(r'C:\Users\prana\bertaurant\bertModel\Restaurant_Reviews.tsv', delimiter='\t')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
df.columns

Index(['Review', 'Liked'], dtype='object')

In [5]:
df.Liked.unique()

array([1, 0], dtype=int64)

In [6]:
df['Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [4]:
# Split the dataset into train and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)  

In [8]:
train_df.shape   #800 rows and 2 columns

(800, 2)

In [9]:
val_df.shape     #200 rows and 2 columns

(200, 2)

In [5]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print(tokenizer)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [13]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
ip=tokenizer("Hello World")
print(ip)


{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}


In [15]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)
num_epochs = 1
optimizer

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    lr: 2e-05
    weight_decay: 0.0
)

In [19]:

for epoch in range(num_epochs):
    print("Model is training")
    model.train()
    total_loss = 0

    for i, row in train_df.iterrows():
        review = row['Review']
        print("Review is ",review)
        label = row['Liked']
        print("Liked is ",label)

        encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
        print("Tokenised review is ",encoding)
        input_ids = encoding['input_ids']
        print("input id  is ",input_ids)
        attention_mask = encoding['attention_mask']
        print("Attention mask is ",attention_mask)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=torch.tensor([label]))
        print("output is ",outputs)
        loss = outputs.loss
        print("loss is ",loss)
        total_loss += loss.item()
        print("Total loss is ",total_loss)
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_df)
    print("avg loss is ",average_loss)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_loss:.4f}')

Model is training
Review is  The worst was the salmon sashimi.
Liked is  0
Tokenised review is  {'input_ids': tensor([[  101,  1996,  5409,  2001,  1996, 11840, 24511, 27605,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
input id  is  tensor([[  101,  1996,  5409,  2001,  1996, 11840, 24511, 27605,  1012,   102]])
Attention mask is  tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
output is  SequenceClassifierOutput(loss=tensor(0.4535, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.4123, -0.1433]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
loss is  tensor(0.4535, grad_fn=<NllLossBackward0>)
Total loss is  0.4534747898578644
Review is  An excellent new restaurant by an experienced Frenchman.
Liked is  1
Tokenised review is  {'input_ids': tensor([[  101,  2019,  6581,  2047,  4825,  2011,  2019,  5281, 26529,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0

KeyboardInterrupt: 

In [20]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i, row in train_df.iterrows():
        review = row['Review']
        label = row['Liked']

        encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=torch.tensor([label]))
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward() 
        optimizer.step()

    average_loss = total_loss / len(train_df)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_loss:.4f}')

Epoch 1/1, Train Loss: 0.3403


In [9]:
# Evaluate the model on the validation set
model.eval()                   #switching from training to evaluating

val_labels = []  #actual labels
val_predictions = []  #predicted labels 

for i, row in val_df.iterrows():
    review = row['Review']
    print(review)
    label = row['Liked']

    encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask) 
    print(outputs)
    logits = outputs.logits                     #probability for a text to be negative or positive
    print(logits)
    predictions = torch.argmax(logits, dim=1)   #returns index of the maximum probability
    print(predictions)

    val_labels.append(label)                   # add actual label  
    val_predictions.append(predictions.item()) # add predicted label
    
accuracy = accuracy_score(val_labels, val_predictions)         #using both the values,predict accuracy.
classification_report_str = classification_report(val_labels, val_predictions)

If you haven't gone here GO NOW!
SequenceClassifierOutput(loss=None, logits=tensor([[-0.1356,  0.3925]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[-0.1356,  0.3925]], grad_fn=<AddmmBackward0>)
tensor([1])
Try them in the airport to experience some tasty food and speedy, friendly service.
SequenceClassifierOutput(loss=None, logits=tensor([[-0.2623,  0.5143]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[-0.2623,  0.5143]], grad_fn=<AddmmBackward0>)
tensor([1])
The restaurant is very clean and has a family restaurant feel to it.
SequenceClassifierOutput(loss=None, logits=tensor([[-0.2249,  0.5255]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[-0.2249,  0.5255]], grad_fn=<AddmmBackward0>)
tensor([1])
I personally love the hummus, pita, baklava, falafels and Baba Ganoush (it's amazing what they do with eggplant!).
SequenceClassifierOutput(loss=None, logits=tensor([[-0.2182,  0.4869]], grad_fn=<AddmmBack

KeyboardInterrupt: 

In [23]:
# Evaluate the model on the validation set
model.eval()                   #switching from training to evaluating

val_labels = []
val_predictions = []

for i, row in val_df.iterrows():
    review = row['Review']
    label = row['Liked']

    encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask) 
    logits = outputs.logits                     #probability for a text to be negative or positive
    predictions = torch.argmax(logits, dim=1)   #returns index of the maximum probability

    val_labels.append(label)                   # add actual label  
    val_predictions.append(predictions.item()) # add predicted label
    
accuracy = accuracy_score(val_labels, val_predictions)         #using both the values,predict accuracy.
classification_report_str = classification_report(val_labels, val_predictions)

In [28]:
# Calculate evaluation metrics
accuracy = accuracy_score(val_labels, val_predictions)         #using both the values,predict accuracy.
classification_report_str = classification_report(val_labels, val_predictions)  # return classification report

print(f'Validation Accuracy: {accuracy:.4f}')
print(classification_report_str)

Validation Accuracy: 0.7500
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.86      0.86      0.86         7

    accuracy                           0.75         8
   macro avg       0.43      0.43      0.43         8
weighted avg       0.75      0.75      0.75         8



In [33]:
def predict_sentiment(input_text):
    # Tokenize the input text
    encoding = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Pass the input through the model
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Get the predicted class (0 for negative, 1 for positive)
    logits = outputs.logits
    print(logits)
    predicted_class = torch.argmax(logits, dim=1).item()
    print(predicted_class)

    # Determine the sentiment based on the predicted class
    if predicted_class == 0:
        return "Negative"
    else:
        return "Positive"

In [35]:
# Example usage:
new_input_text = "I really enjoyed the food at that restaurant."
sentiment = predict_sentiment(new_input_text)  # Use the loaded model
print(f"Predicted Sentiment: {sentiment}")

tensor([[-1.6389,  3.0280]])
1
Predicted Sentiment: Positive


In [36]:
# Example usage:
new_input_text = "Worst service ever"
sentiment = predict_sentiment(new_input_text)  # Use the loaded model
print(f"Predicted Sentiment: {sentiment}")

tensor([[ 1.1668, -2.6603]])
0
Predicted Sentiment: Negative
