In [1]:
import numpy as np
import pandas as pd
import re
import string
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

# Load the reviews dataset (replace 'reviews.csv' with your dataset path).
# The rest of this notebook reads these columns from it:
#   'A Detailed Review of the Place', 'On a Scale of 1-5 Rate the Place',
#   'Place You Want to Review About', 'Who Was Your Company During Your Visit'
df = pd.read_csv('reviews.csv', sep=',')
def preprocess_text(text):
    """Normalise a raw review before tokenisation.

    Steps, in order: remove every run of non-whitespace characters that
    starts with ``http``, delete all ASCII punctuation, then lower-case.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty review; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a URL
        or punctuation can leave consecutive spaces behind.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text.lower()

# Clean every review with preprocess_text. The result is written back to the
# same column, so the raw review text in `df` is overwritten.
df['A Detailed Review of the Place'] = df['A Detailed Review of the Place'].apply(preprocess_text)

# Map ratings to sentiment labels
def map_rating_to_sentiment(rating):
    """Return ``rating`` unchanged.

    Pass-through mapping: the rating value itself is used as the label,
    no bucketing is applied.
    """
    sentiment = rating
    return sentiment

# Fix TensorFlow's global seed before any weights are created so that the
# randomly initialised parts of the run (new classification head, dropout,
# batch shuffling) are as repeatable as TensorFlow allows.
SEED = 42
tf.random.set_seed(SEED)

# Derive the training label from the rating column.
# NOTE: 'sentiment' (lower-case) holds the mapped label and 'Sentiment'
# (capitalised) holds its integer class id -- the names differ only by case.
df['sentiment'] = df['On a Scale of 1-5 Rate the Place'].apply(map_rating_to_sentiment)

# Map sentiment labels to integers
label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['sentiment'])

# Initialize DistilBERT tokenizer.
# padding / truncation / max_length are options of the tokenizer *call*, not of
# `from_pretrained`, so they are given only where text is actually encoded.
max_seq_length = 128
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize and encode the data
x_encoded = tokenizer(
    list(df['A Detailed Review of the Place']),
    return_tensors='tf',
    padding=True,
    truncation=True,
    max_length=max_seq_length,
)
y_encoded = df['Sentiment']

# Initialize DistilBERT model with one output per class known to the encoder
num_labels = len(label_encoder.classes_)
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# The model outputs raw logits and the labels are integer class ids
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model with adjusted hyperparameters
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Number of passes over the data; raise for longer training
epochs = 6

# Model training on the entire dataset (no validation split is held out)
history = model.fit(
    x={'input_ids': x_encoded['input_ids'], 'attention_mask': x_encoded['attention_mask']},
    y=y_encoded,
    epochs=epochs,
    batch_size=16
)

# Function to predict sentiment for a review
def predict_sentiment(review):
    """Predict the label for a single raw review string.

    The text is cleaned with ``preprocess_text``, encoded with the notebook's
    ``tokenizer``, scored by ``model``, and the index of the highest score is
    converted back to its original label through ``label_encoder``.
    """
    cleaned_review = preprocess_text(review)
    encoded = tokenizer(
        [cleaned_review],
        return_tensors='tf',
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    )

    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    # Element 0 of the prediction output is treated as the per-class scores.
    scores = model.predict(model_inputs)[0]
    best_class = np.argmax(scores)

    return label_encoder.inverse_transform([best_class])[0]

# Example usage of the predict_sentiment function

  from .autonotebook import tqdm as notebook_tqdm
2023-09-28 16:57:40.478146: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-28 16:57:40.496307: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-28 16:57:40.615756: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-28 16:57:40.617092: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the PyTorch model were not used when initializi

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [2]:
# Write the fine-tuned model and its tokenizer side by side in one directory
model_dir = 'distilbert_model_new'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# The label encoder is serialised separately with joblib, as a file in the
# current working directory (not inside the model directory)
import joblib
joblib.dump(label_encoder, 'label_encoder.joblib')

['label_encoder.joblib']

In [None]:
def map_rating_to_sentiment(rating):
    """Bucket a numeric rating into a coarse sentiment label.

    Once this cell is run, the definition replaces any earlier
    ``map_rating_to_sentiment`` in the notebook namespace.

    Args:
        rating: Numeric rating; fractional values are accepted.

    Returns:
        "Bored" for ratings up to and including 2, "Mixed" for ratings above
        2 and below 4, "Happy" for ratings of 4 and above, or ``None`` when
        the rating is missing (``None`` or NaN).
    """
    # NaN compares False against every threshold, so catch it explicitly
    # instead of letting it fall through to the final branch.
    if rating is None or rating != rating:
        return None
    if rating <= 2:
        return "Bored"
    # Contiguous thresholds: values such as 2.5 or 3.5 no longer fall
    # between the branches and silently yield None.
    if rating < 4:
        return "Mixed"
    return "Happy"

In [3]:
# Manual check: a short, clearly negative review.
user_review = "This place sucks. I hated it."
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")

Predicted sentiment for the review: 1.0


In [4]:
# Manual check: a long review that mixes praise with complaints.
user_review = "Good historical place to visit but too crowded. The place needs to be be maintained by the authorities for cleanliness as the locals, hawkers, shops and the tourists litter the place with disposals. People over there should be advised and fines to be imposed on breaking the rules. It is a pity that such a beautiful historic place is not maintained. One should not forget to have the falooda and lassi nearby. Shopping for glass bangles is a must for the ladies."
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")

Predicted sentiment for the review: 3.0


In [5]:
# Manual check: very short, slang-only negative input.
# NOTE(review): the recorded output for this cell is 5.0, i.e. the top rating
# for an insulting review -- this looks like a misclassification.
user_review = "Noob bullshit "
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")

Predicted sentiment for the review: 5.0


In [6]:
# Manual check: negative review with a missing space after the full stop.
user_review = "I will not go again.Worst place"
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")

Predicted sentiment for the review: 2.0


In [7]:
# Manual check: a two-word positive review.
user_review = "Must visit"
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")

Predicted sentiment for the review: 5.0


In [8]:
# Manual check: lukewarm wording ("okay") combined with a recommendation.
# NOTE(review): the recorded output for this cell is 4.0.
user_review = "it was okay place. I would recommend it to anyone."
predicted_sentiment = predict_sentiment(user_review)
print(f"Predicted sentiment for the review: {predicted_sentiment}")


Predicted sentiment for the review: 4.0


In [9]:
# Per-place results, filled in by place_process():
#   avg_rating     -> place name : average rating
#   likely_partner -> place name : most frequent company type
avg_rating, likely_partner = {}, {}

In [10]:
# Names of the places to summarise (despite the variable name, this list
# holds place names, not sentiments).
place_sentiments = [
    'Charminar',
    'Golconda Fort',
    'Wonderla',
    'Ramoji Film City',
    'Nehru Zoological Park',
    'Birla Science Museum',
    'Hussain Sagar Lake',
    'Birla mandir',
    'Cable Bridge',
    'NTR Garden',
]

In [11]:
def place_process(place):
    """Record the average rating and most common company type for one place.

    Reads the notebook-level ``df`` and writes the results into the
    notebook-level ``avg_rating`` and ``likely_partner`` dicts under the key
    ``place``.

    Args:
        place: Value to match exactly against the
            'Place You Want to Review About' column.

    Raises:
        ValueError: If no row of ``df`` matches ``place``.
    """
    place_rows = df[df['Place You Want to Review About'] == place]
    if place_rows.empty:
        # Fail with a readable message instead of the opaque error that
        # idxmax() raises on an empty Series.
        raise ValueError(f"No reviews found for place: {place!r}")

    partner_counts = place_rows['Who Was Your Company During Your Visit'].value_counts()
    # value_counts() drops missing values, so it can be empty even when rows exist.
    likely_partner[place] = partner_counts.idxmax() if not partner_counts.empty else None

    # mean() divides by the number of ratings actually present. Dividing a
    # NaN-skipping sum by the total row count would understate the average
    # whenever some ratings are missing.
    avg_rating[place] = place_rows['On a Scale of 1-5 Rate the Place'].mean()

In [12]:
# Fill avg_rating and likely_partner for every listed place, then display the
# average ratings as the cell output.
for place in place_sentiments:
    place_process(place)
avg_rating

{'Charminar': 4.0181818181818185,
 'Golconda Fort': 4.554545454545455,
 'Wonderla': 4.733333333333333,
 'Ramoji Film City': 4.181818181818182,
 'Nehru Zoological Park': 4.166666666666667,
 'Birla Science Museum': 4.291139240506329,
 'Hussain Sagar Lake': 4.033333333333333,
 'Birla mandir': 4.6,
 'Cable Bridge': 3.108695652173913,
 'NTR Garden': 3.725}

In [13]:
# Display the most common company type per place.
# NOTE(review): this loop repeats the work of the previous cell -- each
# place_process() call rewrites both result dicts.
for place in place_sentiments:
    place_process(place)
likely_partner

{'Charminar': 'Family',
 'Golconda Fort': 'Couples',
 'Wonderla': 'Friends',
 'Ramoji Film City': 'Family',
 'Nehru Zoological Park': 'Family',
 'Birla Science Museum': 'Family',
 'Hussain Sagar Lake': 'Family',
 'Birla mandir': 'Family',
 'Cable Bridge': 'Friends',
 'NTR Garden': 'Family'}