In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from transformers import RobertaTokenizer, RobertaModel
import torch



In [3]:
# Additional libraries for text preprocessing and feature engineering
import re
from nltk.corpus import stopwords
import numpy as np

In [4]:
# READ THE FILE INTO A DATAFRAME
df = pd.read_csv("../../Datasets/Yelp Dataset Reduced.csv")

In [5]:
df.head()

Unnamed: 0,ID,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,...,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH,TOTAL_USER_REVIEWS,PERCENTAGE_POSITIVE_REVIEWS,RATIO_POSITIVE_NEGATIVE
0,144828,66563,416,4,10-12-2014,-1,Great.....,3.767293,0.232707,2183,...,1,1,1,0,2,0 days,11.5,2,100.0,6.157377
1,157607,74755,449,4,26-03-2013,1,My family and I had Bubby's brunch on a Saturd...,3.396552,0.603448,812,...,1,1,2,0,2,1723 days,724.666667,12,100.0,3.121827
2,70401,49165,237,3,11-10-2011,1,"I really like this place, but they need to get...",3.799003,0.799003,602,...,1,1,2,0,1,0 days,314.0,1,100.0,6.082353
3,124810,75653,363,5,14-01-2014,1,This is one of my favorite places in the US. A...,3.990361,1.009639,2075,...,1,1,1,0,1,0 days,280.0,1,100.0,9.121951
4,42068,32402,100,4,02-12-2014,1,Make sure you go with a small group of friends...,3.951812,0.048188,2677,...,1,1,2,0,1,398 days,255.666667,3,100.0,8.734545


In [6]:
# Define text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove punctuation and special characters
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])    
    return text

df["REVIEW_TEXT"] = df["REVIEW_TEXT"].apply(preprocess_text)
text_data = df["REVIEW_TEXT"]
text_data_list = text_data.tolist()

In [7]:
df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].str.replace(' days', '')
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].astype(int)
numerical_data = df.drop(['ID', 'USER_ID', 'PRODUCT_ID', 'DATE', 'REVIEW_TEXT', 'LABEL'], axis=1)
label_data = df['LABEL']

In [9]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [10]:
# Tokenize input text
encoded_inputs = tokenizer(text_data_list, padding=True, truncation=True, return_tensors='pt')

In [12]:
# Load RoBERTa model
model = RobertaModel.from_pretrained('roberta-base')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Encode sentences using RoBERTa model with reduced batch size
batch_size = 10  # Adjust batch size as needed
num_samples = len(text_data_list)
print('num_samples', num_samples)
embeddings = []

for i in range(0, num_samples, batch_size):
    batch_texts = text_data_list[i:i+batch_size]
    batch_encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**batch_encoded_inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    embeddings.append(batch_embeddings)
    print('batch encoded', (i+batch_size))
# Concatenate batch embeddings along the batch dimension
embeddings = np.concatenate(embeddings, axis=0)

num_samples 19940
batch encoded 10
batch encoded 20
batch encoded 30
batch encoded 40
batch encoded 50
batch encoded 60
batch encoded 70
batch encoded 80
batch encoded 90
batch encoded 100
batch encoded 110
batch encoded 120
batch encoded 130
batch encoded 140
batch encoded 150
batch encoded 160
batch encoded 170
batch encoded 180
batch encoded 190
batch encoded 200
batch encoded 210
batch encoded 220
batch encoded 230
batch encoded 240
batch encoded 250
batch encoded 260
batch encoded 270
batch encoded 280
batch encoded 290
batch encoded 300
batch encoded 310
batch encoded 320
batch encoded 330
batch encoded 340
batch encoded 350
batch encoded 360
batch encoded 370
batch encoded 380
batch encoded 390
batch encoded 400
batch encoded 410
batch encoded 420
batch encoded 430
batch encoded 440
batch encoded 450
batch encoded 460
batch encoded 470
batch encoded 480
batch encoded 490
batch encoded 500
batch encoded 510
batch encoded 520
batch encoded 530
batch encoded 540
batch encoded 550
b

In [14]:
label_data = df['LABEL']
embeddings_data = pd.DataFrame(embeddings)
print(numerical_data.shape)
print(embeddings_data.shape)


(19940, 15)
(19940, 768)


In [15]:
# Concatenate numerical and RoBERTa embeddings
features = np.concatenate([numerical_data, embeddings], axis=1)

In [17]:
# Oversampling
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(features, label_data)

In [16]:
print(numerical_data.shape)
print(embeddings.shape)

(19940, 15)
(19940, 768)


In [18]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

In [19]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [20]:
# Apply the scaler to the numerical columns
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [21]:
# Train an MLP classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_classifier.fit(X_train_scaled, y_train)

In [22]:
# Make predictions on the test set
y_pred = mlp_classifier.predict(X_test_scaled)

In [23]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9570569310393062
Confusion Matrix:
 [[3555   20]
 [ 287 3287]]
Classification Report:
               precision    recall  f1-score   support

          -1       0.93      0.99      0.96      3575
           1       0.99      0.92      0.96      3574

    accuracy                           0.96      7149
   macro avg       0.96      0.96      0.96      7149
weighted avg       0.96      0.96      0.96      7149



In [None]:
# Predict probabilities for test data
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Compute ROC curve and ROC area
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate sensitivity (true positive rate)
sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])

# Calculate specificity (true negative rate)
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

# Print sensitivity and specificity
print("Sensitivity (True Positive Rate):", sensitivity)
print("Specificity (True Negative Rate):", specificity)