In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

TFIDF vectorizer

In [2]:
# TF-IDF features + k-means (k=2) on the original data, followed by a
# comparison of the mean rating of the two resulting clusters.

data = pd.read_csv('/kaggle/input/fake-reviews-swm/fake reviews dataset.csv')
texts = data['text_'].tolist()

# Extract features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Perform clustering using KMeans.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Ratings are taken positionally (row k <-> y_pred[k]) so the result does
# not rely on the DataFrame index labels being exactly 0..n-1.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.251649519123239
avg. rating for 0:  4.294382504288165


In [3]:
# TF-IDF features + k-means (k=2) on the new data, followed by a comparison
# of the mean rating of the two resulting clusters.

data = pd.read_csv('/kaggle/input/new-fake-reviews-swm/new_fake_reviews_data.csv')
texts = data['text_'].tolist()

# Extract features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Perform clustering using KMeans.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Ratings are taken positionally (row k <-> y_pred[k]) so the result does
# not rely on the DataFrame index labels being exactly 0..n-1.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.112971194365872
avg. rating for 0:  4.3584317798260415


Universal Sentence Encoder embeddings and k-means clustering

In [3]:
# Sentence embeddings + k-means (k=2) on the first 4000 reviews of the
# original data, followed by a comparison of the mean rating per cluster.

data = pd.read_csv('/kaggle/input/fake-reviews-swm/fake reviews dataset.csv')
texts = data['text_'][:4000].tolist()

# Load the pre-trained sentence encoder from TF Hub. The module referenced
# here is the Universal Sentence Encoder (large, v5), not a BERT checkpoint.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.KerasLayer(module_url, trainable=False)

# Extract sentence embeddings (one vector per review)
X = embed(texts).numpy()

# Cluster data.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Only the rows that were embedded are used, taken positionally
# (row k <-> y_pred[k]) so index labels do not matter.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.292486942547208
avg. rating for 0:  4.180675049636003


In [3]:
# Sentence embeddings + k-means (k=2) on the first 4000 reviews of the new
# data, followed by a comparison of the mean rating per cluster.

data = pd.read_csv('/kaggle/input/new-fake-reviews-swm/new_fake_reviews_data.csv')
texts = data['text_'][:4000].tolist()

# Load the pre-trained sentence encoder from TF Hub. The module referenced
# here is the Universal Sentence Encoder (large, v5), not a BERT checkpoint.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.KerasLayer(module_url, trainable=False)

# Extract sentence embeddings (one vector per review)
X = embed(texts).numpy()

# Cluster data.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Only the rows that were embedded are used, taken positionally
# (row k <-> y_pred[k]) so index labels do not matter.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.2683544303797465
avg. rating for 0:  4.232326820603908


Universal Sentence Encoder embeddings and k-means clustering with PCA

In [12]:
# Sentence embeddings -> PCA (50 components) -> k-means (k=2) on the first
# 4000 reviews of the original data, followed by a comparison of the mean
# rating per cluster.
# (PCA is already imported in the notebook's import cell, so it is not
# re-imported here.)

data = pd.read_csv('/kaggle/input/fake-reviews-swm/fake reviews dataset.csv')
texts = data['text_'][:4000].tolist()

# Load the pre-trained sentence encoder from TF Hub. The module referenced
# here is the Universal Sentence Encoder (large, v5), not a BERT checkpoint.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.KerasLayer(module_url, trainable=False)

# Extract sentence embeddings (one vector per review)
X = embed(texts).numpy()

# Using PCA.
# random_state is set because PCA's default solver selection may pick a
# randomized SVD for an input of this size; seeding keeps reruns identical.
pca = PCA(n_components=50, random_state=2)
X = pca.fit_transform(X)

# Cluster data.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Only the rows that were embedded are used, taken positionally
# (row k <-> y_pred[k]) so index labels do not matter.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.289610910549539
avg. rating for 0:  4.185136031851361


In [13]:
# Sentence embeddings -> PCA (50 components) -> k-means (k=2) on the first
# 4000 reviews of the new data, followed by a comparison of the mean rating
# per cluster.
# (PCA is already imported in the notebook's import cell, so it is not
# re-imported here.)

data = pd.read_csv('/kaggle/input/new-fake-reviews-swm/new_fake_reviews_data.csv')
texts = data['text_'][:4000].tolist()

# Load the pre-trained sentence encoder from TF Hub. The module referenced
# here is the Universal Sentence Encoder (large, v5), not a BERT checkpoint.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.KerasLayer(module_url, trainable=False)

# Extract sentence embeddings (one vector per review)
X = embed(texts).numpy()

# Using PCA.
# random_state is set because PCA's default solver selection may pick a
# randomized SVD for an input of this size; seeding keeps reruns identical.
pca = PCA(n_components=50, random_state=2)
X = pca.fit_transform(X)

# Cluster data.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes
# the clustering depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2)
y_pred = kmeans.fit_predict(X)

# Aggregate ratings per cluster with boolean masks instead of a per-row Python
# loop. Only the rows that were embedded are used, taken positionally
# (row k <-> y_pred[k]) so index labels do not matter.
ratings = data['rating'].to_numpy()[:len(y_pred)]
in_cluster_1 = y_pred == 1

sum_1 = ratings[in_cluster_1].sum()
count_sum_1 = int(in_cluster_1.sum())
sum_0 = ratings[~in_cluster_1].sum()
count_sum_0 = len(y_pred) - count_sum_1

print("avg. rating for 1: ", (sum_1/count_sum_1))
print("avg. rating for 0: ", (sum_0/count_sum_0))

avg. rating for 1:  4.231863442389758
avg. rating for 0:  4.269360269360269
