In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [36]:
# Additional libraries for text preprocessing and feature engineering
import re
from nltk.corpus import stopwords
import numpy as np

In [37]:
# Load dataset
df = pd.read_csv("../../Fake review detection dataset/Yelp Dataset Reduced.csv")

In [38]:
df.head()

Unnamed: 0,ID,USER_ID,PRODUCT_ID,RATING,DATE,LABEL,REVIEW_TEXT,AVERAGE_RATING,RATING_DEVIATION,TOTAL_PRODUCT_REVIEWS,...,RATING_CATEGORY,SINGLE_RATING_CATEGORY,REVIEW_COUNT_DATE,SAME_DATE_MULTIPLE_REVIEWS,MAX_USER_REVIEWS_DAY,TIMESTAMP_DIFFERENCE,AVERAGE_USER_REVIEW_LENGTH,TOTAL_USER_REVIEWS,PERCENTAGE_POSITIVE_REVIEWS,RATIO_POSITIVE_NEGATIVE
0,144828,66563,416,4,10-12-2014,-1,Great.....,3.767293,0.232707,2183,...,1,1,1,0,2,0 days,11.5,2,100.0,6.157377
1,157607,74755,449,4,26-03-2013,1,My family and I had Bubby's brunch on a Saturd...,3.396552,0.603448,812,...,1,1,2,0,2,1723 days,724.666667,12,100.0,3.121827
2,70401,49165,237,3,11-10-2011,1,"I really like this place, but they need to get...",3.799003,0.799003,602,...,1,1,2,0,1,0 days,314.0,1,100.0,6.082353
3,124810,75653,363,5,14-01-2014,1,This is one of my favorite places in the US. A...,3.990361,1.009639,2075,...,1,1,1,0,1,0 days,280.0,1,100.0,9.121951
4,42068,32402,100,4,02-12-2014,1,Make sure you go with a small group of friends...,3.951812,0.048188,2677,...,1,1,2,0,1,398 days,255.666667,3,100.0,8.734545


In [39]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove punctuation and special characters
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])    
    return text

df["REVIEW_TEXT"] = df["REVIEW_TEXT"].apply(preprocess_text)

In [40]:
df.rename(columns = {'Unnamed: 0':'ID'}, inplace = True)
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].str.replace(' days', '')
df['TIMESTAMP_DIFFERENCE'] = df['TIMESTAMP_DIFFERENCE'].astype(int)
num_features = df.drop(['ID','USER_ID','PRODUCT_ID','DATE','REVIEW_TEXT','LABEL'],axis=1)

In [41]:
# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df[['REVIEW_TEXT'] ], df['LABEL'], test_size=0.25, random_state=42) # + num_features.columns.tolist()

# print('X_train.shape:', X_train.shape)

# print('X_test.shape', X_test.shape)

In [42]:
# # Check the shape of X_train and y_train
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)

# # Check the shape of X_test and y_test
# print("X_test shape:", X_test.shape)
# print("y_test shape:", y_test.shape)

# print(type(X_test))
# print(type(y_test))

In [43]:
# # Use TF-IDF vectorizer to convert text into numerical features
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df["REVIEW_TEXT"])
# y = df["LABEL"]  

In [44]:
# # Oversampling with SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, df["LABEL"])

In [45]:
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [46]:
#!pip install tqdm

In [47]:
# from tqdm import tqdm

# # ... (rest of your code)

# class TqdmWrapper(object):
#   def __init__(self, estimator):
#     self.estimator = estimator
    
#   def fit(self, X, y, **fit_params):
#     # Wrap the actual fit method with tqdm using getnnz() for total
#     with tqdm(total=X.getnnz()) as bar:
#       self.estimator.fit(X, y, **fit_params)
#       bar.update()

In [48]:
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pandas as pd

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, max_features=5000, ngram_range=(1, 3)):
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vectorizer = None  # Initialize vectorizer attribute

    def fit(self, X, y=None):
        self.vectorizer = TfidfVectorizer(max_features=self.max_features, ngram_range=self.ngram_range)
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        if self.vectorizer is None:
            raise ValueError("Vectorizer has not been fitted. Call fit method first.")
        return self.vectorizer.transform(X)
        
# Create the full pipeline with preprocessing and SVC
pipeline = ImbPipeline([
    ('preprocessor', TextPreprocessor()),
    ('adasyn', ADASYN()),
    ('classifier', SVC(kernel='linear', probability=True))
])

In [49]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['REVIEW_TEXT'], df['LABEL'], test_size=0.25, random_state=42)

print('X_train.shape:', X_train.shape)
print('X_test.shape', X_test.shape)

X_train.shape: (14955,)
X_test.shape (4985,)


In [50]:
# Fit the pipeline with training data
pipeline.fit(X_train, y_train)

In [51]:
# from sklearn.impute import SimpleImputer
# from imblearn.pipeline import Pipeline as ImbPipeline
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import train_test_split
# from sklearn.base import TransformerMixin, BaseEstimator

# class TextPreprocessor(BaseEstimator, TransformerMixin):
#     def __init__(self, max_features=5000, ngram_range=(1, 3)):
#         self.max_features = max_features
#         self.ngram_range = ngram_range

#     def fit(self, X, y=None):
#         self.vectorizer = TfidfVectorizer(max_features=self.max_features, ngram_range=self.ngram_range)
#         self.smote = SMOTE()
#         X_vect = self.vectorizer.fit_transform(X)
#         X_resampled, y_resampled = self.smote.fit_resample(X_vect, y)
#         self.original_classes_distribution = y.value_counts().to_dict()
#         self.resampled_classes_distribution = pd.Series(y_resampled).value_counts().to_dict()
#         return self

#     def transform(self, X):
#         return self.vectorizer.transform(X)

# # Create the full pipeline with preprocessing and SVC
# pipeline = ImbPipeline([
#     ('preprocessor', TextPreprocessor()),
#     ('classifier', SVC(kernel='rbf', probability=True))
# ])


In [52]:
# Apply preprocessing steps (including balancing) to the training data
X_train_preprocessed = pipeline['preprocessor'].transform(X_train)

In [53]:
# Check the shape of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Check the shape of X_test and y_test
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (14955,)
y_train shape: (14955,)
X_test shape: (4985,)
y_test shape: (4985,)


In [54]:
# Transform the testing data
X_test_preprocessed = pipeline.transform(X_test)

AttributeError: This 'Pipeline' has no attribute 'transform'

In [None]:
# # Training the pipeline with progress bar
# model = SVC(kernel='rbf',probability=True)
# model.fit(X_train, y_train)

In [None]:
# # SVM Classifer
# model = SVC(kernel='rbf',probability=True)
# model.fit(X_train, y_train)

In [None]:
print(X_train)

In [None]:
# Fit the pipeline on the training data
pipeline.fit(X_train_preprocessed, y_train)

In [None]:
# Make predictions on the testing data
y_pred = pipeline.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Predict probabilities for test data
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

In [None]:
# Compute ROC curve and ROC area
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate sensitivity (true positive rate)
sensitivity = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])

# Calculate specificity (true negative rate)
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

# Print sensitivity and specificity
print("Sensitivity (True Positive Rate):", sensitivity)
print("Specificity (True Negative Rate):", specificity)