# Loading Data

In [1]:
# Mount Google Drive so the dataset and saved models under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the fake-reviews CSV from the mounted Drive folder, then display it.
csv_path = "/content/drive/MyDrive/Colab Notebooks/fake reviews dataset.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


# 1. Data Cleaning

In [4]:
# @title
# Check column dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB


In [5]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct value of a column to an integer code.
encoder = LabelEncoder()

In [6]:
# Keep a dedicated encoder for the target so its classes_ (the original
# CG/OR strings) stay available for inverse_transform after training.
# Sharing one encoder between both columns would overwrite that mapping as
# soon as it is re-fitted on 'category'.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
# `encoder` is fitted on 'category' only.
df['category'] = encoder.fit_transform(df['category'])

In [7]:
# Display the frame to verify that label and category are now integer codes.
df

Unnamed: 0,category,rating,label,text_
0,3,5.0,0,"Love this! Well made, sturdy, and very comfor..."
1,3,5.0,0,"love it, a great upgrade from the original. I..."
2,3,5.0,0,This pillow saved my back. I love the look and...
3,3,1.0,0,"Missing information on how to use it, but it i..."
4,3,5.0,0,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,1,4.0,1,I had read some reviews saying that this bra r...
40428,1,5.0,0,I wasn't sure exactly what it would be. It is ...
40429,1,2.0,1,"You can wear the hood by itself, wear it with ..."
40430,1,1.0,0,I liked nothing about this dress. The only rea...


### CG (computer-generated) = 0 and OR (original, human-written) = 1

In [8]:
from scipy.stats import chi2_contingency
# Cross-tabulate rating (rows) against the encoded label (columns).
contingency_table = pd.crosstab(index=df['rating'], columns=df['label'])
print(contingency_table)

label       0      1
rating              
1.0      1063   1092
2.0       962   1005
3.0      1952   1834
4.0      3920   4045
5.0     12319  12240


In [9]:
# Chi-square test of independence between rating and label.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report everything in a single print call, one item per line.
print(
    f"\nChi-Square Statistic: {chi2}",
    f"P-value: {p_value}",
    f"Degrees of Freedom: {dof}",
    "\nExpected Frequencies:",
    expected,
    sep="\n",
)


Chi-Square Statistic: 7.223855752278348
P-value: 0.12452087428905528
Degrees of Freedom: 4

Expected Frequencies:
[[ 1077.5  1077.5]
 [  983.5   983.5]
 [ 1893.   1893. ]
 [ 3982.5  3982.5]
 [12279.5 12279.5]]


# 2. Text Preprocessing

In [10]:
# Drop the encoded category column; it is not among the model features.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns='category', errors='ignore')

In [11]:
# Confirm the category column is gone.
df

Unnamed: 0,rating,label,text_
0,5.0,0,"Love this! Well made, sturdy, and very comfor..."
1,5.0,0,"love it, a great upgrade from the original. I..."
2,5.0,0,This pillow saved my back. I love the look and...
3,1.0,0,"Missing information on how to use it, but it i..."
4,5.0,0,Very nice set. Good quality. We have had the s...
...,...,...,...
40427,4.0,1,I had read some reviews saying that this bra r...
40428,5.0,0,I wasn't sure exactly what it would be. It is ...
40429,2.0,1,"You can wear the hood by itself, wear it with ..."
40430,1.0,0,I liked nothing about this dress. The only rea...


In [12]:
!pip install tensorflow-text



In [13]:
# Ratings are whole stars stored as floats; round first, then cast to int.
rounded_rating = df['rating'].round()
df['rating'] = rounded_rating.astype(int)

In [14]:
# Confirm rating is now an integer column.
df

Unnamed: 0,rating,label,text_
0,5,0,"Love this! Well made, sturdy, and very comfor..."
1,5,0,"love it, a great upgrade from the original. I..."
2,5,0,This pillow saved my back. I love the look and...
3,1,0,"Missing information on how to use it, but it i..."
4,5,0,Very nice set. Good quality. We have had the s...
...,...,...,...
40427,4,1,I had read some reviews saying that this bra r...
40428,5,0,I wasn't sure exactly what it would be. It is ...
40429,2,1,"You can wear the hood by itself, wear it with ..."
40430,1,0,I liked nothing about this dress. The only rea...


In [44]:
from sklearn.model_selection import train_test_split
# Model inputs: the review text plus its star rating.
features = ['text_', 'rating']

# First split: hold out 20% of all rows as the test set, stratified on label.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Second split: carve 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

In [16]:
# Inspect the training split (text and rating features only).
X_train

Unnamed: 0,text_,rating
863,Arrived with several minor scratches on it. No...,3
6994,I took it on a weekend trip and it was comfort...,4
2473,This would hypothetically be a fantastic way t...,1
40328,My daughter wanted this back in a size 12 and ...,3
33519,Love that Melissa and Doug products are so org...,5
...,...,...
22422,we get this on subscription from Chewy now but...,5
15053,"If you are not a fan of the movie, you will no...",5
784,"Really to strong, was expecting the pleasant a...",3
38524,Quality is better than I expected for the pric...,5


# Defining the Model

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, GlobalAveragePooling1D
import tensorflow_hub as hub
# Not referenced by name, but importing it registers the custom ops the
# BERT preprocessing model needs.
import tensorflow_text as text


# TensorFlow Hub handles: the text preprocessor and the matching uncased BERT encoder.
preprocess_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(preprocess_handle)
bert_encoder = hub.KerasLayer(encoder_handle)

In [21]:


# Two inputs: the raw review string and its star rating.
text_input = Input(shape=(), dtype=tf.string, name='text_input')
rating_input = Input(shape=(1,), name='rating')

# Tokenise the text and run it through the BERT encoder.
bert_outputs = bert_encoder(bert_preprocess(text_input))

# Sentence-level vector plus the mean of the per-token vectors.
pooled_output = bert_outputs['pooled_output']
text_features = GlobalAveragePooling1D()(bert_outputs['sequence_output'])

# Join both text representations with the rating.
combined_input = Concatenate()([pooled_output, text_features, rating_input])

dropout_rate = 0.2

# Dense head: every ReLU layer is followed by dropout.
# NOTE(review): 124 units looks like a typo for 128 -- confirm before retraining.
hidden = combined_input
for units in (256, 124, 64):
    hidden = Dense(units, activation='relu')(hidden)
    hidden = Dropout(dropout_rate)(hidden)

# Single sigmoid unit for binary classification.
output = Dense(1, activation='sigmoid')(hidden)

# Assemble and compile the two-input model.
model = tf.keras.Model(inputs=[text_input, rating_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Show the layer-by-layer summary.
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text_input (InputLayer)     [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_type_ids': (None,    0         ['text_input[0][0]']          
                             128),                                                                
                              'input_word_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                                                                

# Training the Model

In [27]:
# Re-display the training features that are passed to model.fit.
X_train

Unnamed: 0,text_,rating
863,Arrived with several minor scratches on it. No...,3
6994,I took it on a weekend trip and it was comfort...,4
2473,This would hypothetically be a fantastic way t...,1
40328,My daughter wanted this back in a size 12 and ...,3
33519,Love that Melissa and Doug products are so org...,5
...,...,...
22422,we get this on subscription from Chewy now but...,5
15053,"If you are not a fan of the movie, you will no...",5
784,"Really to strong, was expecting the pleasant a...",3
38524,Quality is better than I expected for the pric...,5


In [31]:
# Train for 12 epochs, tracking loss/accuracy on the validation split.
train_inputs = [X_train['text_'], X_train['rating']]
val_inputs = [X_val['text_'], X_val['rating']]
model.fit(
    x=train_inputs,
    y=y_train,
    epochs=12,
    validation_data=(val_inputs, y_val),
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 4/12
Epoch 5/12
Epoch 5/12
Epoch 6/12
Epoch 6/12
Epoch 7/12
Epoch 7/12
Epoch 8/12
Epoch 8/12
Epoch 9/12
Epoch 9/12
Epoch 10/12
Epoch 10/12
Epoch 11/12
Epoch 11/12
Epoch 12/12
Epoch 12/12


<keras.src.callbacks.History at 0x7d779cf85240>

<keras.src.callbacks.History at 0x7d779cf85240>

In [32]:
# Loss and accuracy on the held-out test split.
test_inputs = [X_test['text_'], X_test['rating']]
model.evaluate(x=test_inputs, y=y_test)



[0.17193862795829773, 0.9245703220367432]

In [33]:
# Persist the trained model to Drive. The directory name carries a "V2"
# suffix, so it is a different location from the model loaded further down.
model.save('/content/drive/MyDrive/Colab Notebooks/my_model no catagory V2')

In [34]:
# Build one hand-written example in the same [text, rating] layout the model
# takes as input. The model has no category input, so no category value is
# built here.

# Given information
reviews = "Net is really strong comparing to other popular brand’s soft net, these are more long lasting. Cushioning are somewhat less but negligible comparing to the other qualities of the shoes. If you want to buy shoes for daily purpose then go for it, for running also."
rating = 5

# One-element Series named after the training columns.
text_series = pd.Series([reviews], name='text_')
rating_series = pd.Series([rating], name='rating')

# Order must match the model's inputs: text first, then rating.
formatted_result = [
    text_series,
    rating_series,
]

print(formatted_result)


[0    Net is really strong comparing to other popula...
Name: text_, dtype: object, 0    5
Name: rating, dtype: int64]


# Testing the Model

In [35]:
# Score the hand-written example; the sigmoid output is the model's
# probability for label 1.
model.predict(formatted_result)



array([[0.99794894]], dtype=float32)

In [36]:
# Load a previously saved model for comparison. This path has no "V2"
# suffix, so it is not the directory written by the model.save call earlier
# in this notebook.
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/my_model no catagory')

In [45]:
# Score the same test inputs with the in-memory model and with the one loaded from disk.
test_inputs = [X_test['text_'], X_test['rating']]
current_model_predictions = model.predict(test_inputs)
loaded_model_predictions = loaded_model.predict(test_inputs)




In [46]:
from sklearn.metrics import accuracy_score

# Turn sigmoid scores into hard predictions with a 0.6 cut-off, then compare
# them against the true test labels.
# Keras' built-in accuracy metric thresholds at 0.5 by default, so these
# figures need not match the accuracy reported by model.evaluate.
current_model_labels = current_model_predictions > 0.6
loaded_model_labels = loaded_model_predictions > 0.6

current_model_accuracy = accuracy_score(y_test, current_model_labels)
loaded_model_accuracy = accuracy_score(y_test, loaded_model_labels)

print("Current Model Accuracy:", current_model_accuracy)
print("Loaded Model Accuracy:", loaded_model_accuracy)


Current Model Accuracy: 0.9276616792382837
Loaded Model Accuracy: 0.8648448126622975
