In [1]:
import kagglehub

# Fetch the latest published version of the corpus through kagglehub.
# The call's return value is kept in `path`; later cells read the CSV from it.
DATASET_HANDLE = "rtatman/deceptive-opinion-spam-corpus"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\pokes\.cache\kagglehub\datasets\rtatman\deceptive-opinion-spam-corpus\versions\2


In [4]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
import os

# `path` is the dataset directory set by the kagglehub download cell.
# Join with os.path.join instead of a literal "\\" so the notebook also runs
# on Linux/macOS, where a backslash is not a path separator.
df = pd.read_csv(os.path.join(path, "deceptive-opinion.csv"))

# Raw review texts (model input) as a NumPy array of strings.
X_text = df["text"].values

# Binary target: 1 where the "deceptive" column equals "deceptive", else 0.
# Vectorized comparison replaces the row-wise apply(lambda ...); the dtype is
# pinned to int64 so the labels match what the lambda version produced.
y = (df["deceptive"] == "deceptive").astype("int64").values

# 5-fold splitter, shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this cell relies on so it works on a fresh
# environment: WordNet (lemmatizer), the stop-word lists, and the Punkt
# tokenizer data that word_tokenize loads.
# NOTE(review): newer NLTK releases look up "punkt_tab" instead of "punkt";
# both are requested here — confirm against the installed NLTK version.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

wnl = WordNetLemmatizer()

# Build the stop-word collection once. The original re-read the list for
# every token and searched it linearly; a set gives the same membership
# answers with constant-time lookups.
english_stopwords = set(stopwords.words('english'))


def _normalize(text):
    """Lower-case and tokenize `text`, drop English stop words, lemmatize
    the remaining tokens, and return them joined by single spaces."""
    tokens = word_tokenize(text.lower())
    return ' '.join(wnl.lemmatize(w) for w in tokens if w not in english_stopwords)


# Replace each raw review with its normalized form (a plain Python list).
X_text = [_normalize(text) for text in X_text]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pokes\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pokes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# The cross-validation loop indexes X_text with integer index arrays,
# which needs a NumPy array rather than a plain Python list.
X_text = np.asarray(X_text)

In [17]:
# Cross-validated evaluation of a TF-IDF + random-forest classifier.
# One score per fold is collected in each list; the averages are printed last.
accs, precisions, recalls, f1s = [], [], [], []

for train_index, test_index in kf.split(X_text):
    # Slice texts and labels for this fold.
    X_train, y_train = X_text[train_index], y[train_index]
    X_test, y_test = X_text[test_index], y[test_index]

    # Fit the TF-IDF vectorizer on the training fold only, then reuse it
    # to transform the held-out fold.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Fixed random_state keeps the forest reproducible across runs.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Fold metrics: accuracy is the fraction of matching predictions; the
    # other three use the scikit-learn scorers with their default settings.
    accuracy = (y_pred == y_test).mean()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    for scores, value in ((accs, accuracy), (precisions, precision), (recalls, recall), (f1s, f1)):
        scores.append(value)

    print(f"Fold accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}")

# Unweighted mean of each metric over the folds.
print(f"Average accuracy: {np.mean(accs):.4f}, precision: {np.mean(precisions):.4f}, recall: {np.mean(recalls):.4f}, f1: {np.mean(f1s):.4f}")

Fold accuracy: 0.8625, precision: 0.8214, recall: 0.9079, f1: 0.8625
Fold accuracy: 0.8438, precision: 0.8706, recall: 0.8409, f1: 0.8555
Fold accuracy: 0.8406, precision: 0.8025, recall: 0.8630, f1: 0.8317
Fold accuracy: 0.8469, precision: 0.8166, recall: 0.8846, f1: 0.8492
Fold accuracy: 0.8562, precision: 0.8690, recall: 0.8588, f1: 0.8639
Average accuracy: 0.8500, precision: 0.8360, recall: 0.8711, f1: 0.8526
