In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [8]:
# Additional libraries for text preprocessing and feature engineering
import re
from nltk.corpus import stopwords
import numpy as np

In [9]:
# READ THE FILE INTO A DATAFRAME
df = pd.read_csv("../../Fake review detection dataset/Yelp Dataset Reduced.csv")

In [10]:
df.head()

Unnamed: 0,ID,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,...,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH,TOTAL_USER_REVIEWS,PERCENTAGE_POSITIVE_REVIEWS,RATIO_POSITIVE_NEGATIVE
0,144828,66563,416,4,10-12-2014,-1,Great.....,3.767293,0.232707,2183,...,1,1,1,0,2,0 days,11.5,2,100.0,6.157377
1,157607,74755,449,4,26-03-2013,1,My family and I had Bubby's brunch on a Saturd...,3.396552,0.603448,812,...,1,1,2,0,2,1723 days,724.666667,12,100.0,3.121827
2,70401,49165,237,3,11-10-2011,1,"I really like this place, but they need to get...",3.799003,0.799003,602,...,1,1,2,0,1,0 days,314.0,1,100.0,6.082353
3,124810,75653,363,5,14-01-2014,1,This is one of my favorite places in the US. A...,3.990361,1.009639,2075,...,1,1,1,0,1,0 days,280.0,1,100.0,9.121951
4,42068,32402,100,4,02-12-2014,1,Make sure you go with a small group of friends...,3.951812,0.048188,2677,...,1,1,2,0,1,398 days,255.666667,3,100.0,8.734545


In [11]:
import re
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove punctuation and special characters
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])    
    return text

df["REVIEW_TEXT"] = df["REVIEW_TEXT"].apply(preprocess_text)
text_data = df["REVIEW_TEXT"]
text_data_list = text_data.tolist()

In [12]:
df.rename(columns = {'Unnamed: 0':'ID'}, inplace = True)
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].str.replace(' days', '')
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].astype(int)
# numerical_data = df.drop(['ID','USER_ID','PRODUCT_ID','DATE','REVIEW_TEXT','LABEL'],axis=1)
label_data = df['LABEL']

In [13]:
text_data

0                                                    great
1        family bubbys brunch saturday morning got 10am...
2        really like place need get menu sorted finally...
3        one favorite places us awesome reservations pl...
4        make sure go small group friends like share fo...
                               ...                        
19935    arrived towards end brunch period items lookin...
19936    seemed matter looked places go brooklyn traif ...
19937    grabbed couple friends walked 8pm saturday nig...
19938    super excited try xixa sister restaurant traif...
19939    halal cart defines food cooked served bunch hu...
Name: REVIEW_TEXT, Length: 19940, dtype: object

In [14]:
from transformers import AutoModel
# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-cased"  # Or other suitable model depending on your dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, num_labels=2)

In [15]:
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [16]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
import time
# Our sentences to encode
print("Copying text data into sentences")
sentences = text_data_list

print("encoding sentences using model.encode")
# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# print("print embeddings")
# # Print the embeddings
# for sentence, embedding in zip(sentences, embeddings):
#     start_time = time.time()    
#     # print("Sentence:", sentence)
#     # print("Embedding:", embedding)
#     # print("")
#     end_time = time.time()
#     elapsed_time = end_time - start_time 
#     print(f"Completed batch in {elapsed_time:.2f} seconds")
#     # print(f"Sentence : {sentence}")

Copying text data into sentences
encoding sentences using model.encode


In [17]:
embeddings.shape

(19940, 384)

In [18]:
label_data = df['LABEL']

In [19]:
#!pip install torch

In [37]:
#print(numerical_data.shape)
print(embeddings.shape)
print(label_data.shape)

(19940, 384)
(19940,)


In [21]:
#numerical_data

In [38]:
# Concatenate numerical and BERT embeddings
features = embeddings

In [39]:
features

array([[-0.062523  , -0.05636611, -0.07009882, ..., -0.00772695,
        -0.01247107,  0.04390927],
       [-0.00160092,  0.01958902,  0.04499611, ...,  0.08497263,
        -0.0060062 , -0.04186679],
       [-0.03074861, -0.06061548,  0.06373494, ..., -0.03846001,
        -0.04504208,  0.01557505],
       ...,
       [-0.02291161, -0.0037912 ,  0.08483507, ...,  0.01208541,
        -0.07726438,  0.00649546],
       [-0.01515959, -0.0742491 ,  0.00179713, ...,  0.04611028,
        -0.04332537,  0.03000393],
       [ 0.02490894, -0.05243617, -0.0227087 , ...,  0.02886555,
        -0.03789877,  0.01725319]], dtype=float32)

In [40]:
# Oversampling with adasyn
adasyn = ADASYN(random_state=42)   #smote = SMOTE(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(features, label_data )

In [41]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled , y_resampled, test_size=0.25, random_state=42)

print('X_train.shape:', X_train.shape)

print('X_test.shape', X_test.shape)

X_train.shape: (26991, 384)
X_test.shape (8997, 384)


In [42]:
print(df["LABEL"].shape)
#print(numerical_data.shape)
print(embeddings.shape)

(19940,)
(19940, 384)


In [None]:
# # Initialize the StandardScaler
# scaler = StandardScaler()
# # Apply the scaler to the numerical columns
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.fit_transform(X_test)

In [48]:
# Train an SVM classifier
svm_classifier = SVC(kernel='linear', probability=True)
svm_classifier.fit(X_train, y_train)

In [44]:
# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

In [45]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6683338890741358
Confusion Matrix:
 [[3108 1429]
 [1555 2905]]
Classification Report:
               precision    recall  f1-score   support

          -1       0.67      0.69      0.68      4537
           1       0.67      0.65      0.66      4460

    accuracy                           0.67      8997
   macro avg       0.67      0.67      0.67      8997
weighted avg       0.67      0.67      0.67      8997



In [47]:
# Predict probabilities for test data
y_pred_proba = svm_classifier.predict_proba(X_test)[:, 1]

AttributeError: predict_proba is not available when probability=False

In [None]:
# Compute ROC curve and ROC area
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate sensitivity (true positive rate)
sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])

# Calculate specificity (true negative rate)
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

# Print sensitivity and specificity
print("Sensitivity (True Positive Rate):", sensitivity)
print("Specificity (True Negative Rate):", specificity)