In [1]:
import numpy as np
import pandas as pd
import re
import string
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import nltk

# Load the dataset (replace 'reviews.csv' with your dataset path).
# The dataset should have columns: 'A Detailed Review of the Place' and
# 'On a Scale of 1-5 Rate the Place'.
#
# Rows missing either the review text or the rating cannot be used for
# training, so they are dropped right at load time. The read and the drop are
# chained so that re-running this cell always rebuilds `df` from the file.
df = pd.read_csv('reviews.csv', sep=',').dropna(
    subset=['A Detailed Review of the Place', 'On a Scale of 1-5 Rate the Place']
)

def preprocess_text(text):
    """Normalize a raw review before tokenization.

    Steps, in order: remove URLs (every run of non-whitespace characters that
    starts with ``http``), delete every character in ``string.punctuation``,
    and lowercase the result.

    Args:
        text: The raw review. ``None`` and float NaN (a missing cell) are
            treated as an empty review; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned text as a ``str``. Whitespace is not collapsed, so a
        removed URL can leave its surrounding spaces behind.
    """
    # A missing value would make re.sub raise TypeError; `text != text` is the
    # dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.lower()

# Seed NumPy and TensorFlow so that random weight initialization and batch
# shuffling are repeatable from run to run (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Clean every review with preprocess_text before tokenization.
df['A Detailed Review of the Place'] = df['A Detailed Review of the Place'].apply(preprocess_text)

# Split the data into training and testing sets (80/20, fixed seed).
X = df['A Detailed Review of the Place']
y = df['On a Scale of 1-5 Rate the Place']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Regression targets as plain float32 arrays: this drops the shuffled pandas
# index and gives the MSE loss floating-point labels explicitly.
y_train_values = y_train.to_numpy(dtype='float32')
y_test_values = y_test.to_numpy(dtype='float32')

# Initialize DistilBERT tokenizer.
# NOTE(review): the max_length/truncation/padding keyword arguments passed to
# from_pretrained appear to be stored only as tokenizer init kwargs; the
# per-call arguments used below are what control truncation and padding --
# confirm against the installed transformers version.
max_seq_length = 128
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', max_length=max_seq_length, truncation=True, padding='max_length')

# Tokenize and encode the data (padded to the longest review in each set,
# truncated to max_seq_length tokens).
x_train_encoded = tokenizer(list(X_train), return_tensors='tf', padding=True, truncation=True, max_length=max_seq_length)
x_test_encoded = tokenizer(list(X_test), return_tensors='tf', padding=True, truncation=True, max_length=max_seq_length)

# Initialize DistilBERT model for regression: num_labels=1 gives a single
# output neuron, trained below with mean squared error.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

loss = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=['mae'])

# Number of passes over the training set.
epochs = 6

# Model training on the training dataset
history = model.fit(
    x={'input_ids': x_train_encoded['input_ids'], 'attention_mask': x_train_encoded['attention_mask']},
    y=y_train_values,
    epochs=epochs,
    batch_size=16
)

# Evaluate the model on the test dataset; evaluate returns the loss (MSE)
# followed by the compiled metric (MAE).
mse, mae = model.evaluate(
    x={'input_ids': x_test_encoded['input_ids'], 'attention_mask': x_test_encoded['attention_mask']},
    y=y_test_values,
    verbose=0
)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

# Function to predict rating for a review
def predict_rating(review):
    """Predict the rating for a single raw review string.

    The review is cleaned with ``preprocess_text``, tokenized with the
    module-level ``tokenizer`` (truncated to ``max_seq_length`` tokens) and
    passed to ``model.predict``.

    Args:
        review: Raw review text.

    Returns:
        ``prediction[0][0]``: the first row of the first element of the
        prediction output. In the recorded run this printed as a one-element
        array (``[2.3879557]``), so callers index ``[0]`` to get the number.
    """
    cleaned = preprocess_text(review)
    encoded = tokenizer(
        [cleaned],
        return_tensors='tf',
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    )

    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    prediction = model.predict(model_inputs)

    return prediction[0][0]

# Try predict_rating on one sample review and show the raw model output.
sample_review = "It wasn't that great of a place"
predicted_rating = predict_rating(sample_review)
print(f"Predicted Rating: {predicted_rating}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Mean Squared Error: 0.7009488344192505
Mean Absolute Error: 0.6126383543014526
Predicted Rating: [2.3879557]


In [8]:
# Round the raw regression output to a whole rating and clamp it to the 1-5
# scale, because the regression output is not constrained to that range.
predicted_rating = predict_rating("It was a really great place")
print(f"Predicted Rating: {min(5, max(1, round(predicted_rating[0])))}")

Predicted Rating: 4


In [9]:
# Round the raw regression output to a whole rating and clamp it to the 1-5
# scale, because the regression output is not constrained to that range.
predicted_rating = predict_rating("Amazing place, really loved it.")
print(f"Predicted Rating: {min(5, max(1, round(predicted_rating[0])))}")

Predicted Rating: 5


In [10]:
# Round the raw regression output to a whole rating and clamp it to the 1-5
# scale, because the regression output is not constrained to that range.
predicted_rating = predict_rating("It is not that great of a place")
print(f"Predicted Rating: {min(5, max(1, round(predicted_rating[0])))}")

Predicted Rating: 3


In [12]:
# Persist the fine-tuned model (architecture and weights) in TensorFlow's
# SavedModel format.
model_dir = 'distilbert_sentiment_model'
model.save(model_dir, save_format='tf')

# Also persist the tokenizer for future use. This call is the cell's last
# expression, so the tuple of written file paths it returns is displayed.
tokenizer_dir = 'distilbert_tokenizer'
tokenizer.save_pretrained(tokenizer_dir)


























INFO:tensorflow:Assets written to: distilbert_sentiment_model\assets


INFO:tensorflow:Assets written to: distilbert_sentiment_model\assets


('distilbert_tokenizer\\tokenizer_config.json',
 'distilbert_tokenizer\\special_tokens_map.json',
 'distilbert_tokenizer\\vocab.txt',
 'distilbert_tokenizer\\added_tokens.json')