# **Importing Libraries**

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np

# **Data Preprocessing**

In [None]:
# Read the Yelp review data from the CSV file into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='yelp.csv')

In [None]:
# Text Preprocessing (Removing punctuation marks and other characters and making the text data lowercase)
import re
def preprocess_text(text):
    """Normalize one review: lowercase it and strip punctuation.

    Parameters
    ----------
    text : Any
        The raw cell value. `None` and float NaN (how pandas represents a
        missing value in an object column) yield an empty string; any other
        non-string value is converted with `str()` before normalizing.

    Returns
    -------
    str
        The lowercased text with every character that is neither a word
        character nor whitespace removed.
    """
    # A missing value has no text to normalize; `text != text` is True only for NaN
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to lowercase
    # `\w` also matches digits and the underscore, so those characters are kept
    return re.sub(r'[^\w\s]', '', text)  # Remove punctuation

# NOTE(review): `df_filtered` is assigned in the "Pre Model Building Tasks" cell
# further down this notebook, so that cell has to be run before this one.
# `assign` returns a new frame whose 'Review' column holds the normalized text.
df_filtered = df_filtered.assign(Review=df_filtered['Review'].apply(preprocess_text))

In [None]:
# Text vectorizing: turn each review into a numeric TF-IDF feature row
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_filtered['Review'])
# Convert the vectorizer output to a dense NumPy array
X = X.toarray()

In [None]:
# Train/test split: 80 % of the rows for training, 20 % held out, fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_filtered['Label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardizing: the scaler is fitted on the training rows only and then
# applied to both splits, so the test rows do not influence the fit
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **Data Visualization**

In [None]:
# Importing necessary visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pie charts: share of each star rating among real (Label = 1) and fake (Label = -1) reviews
def _rating_share_pie(label_frame, label_value):
    """Draw a pie chart of the `Rating` value counts in `label_frame`.

    `label_value` is only used in the chart title. Returns the value-counts
    Series that was plotted.
    """
    counts = label_frame["Rating"].value_counts()
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90)
    plt.title(f"Frequency of Ratings for Label = {label_value}")
    plt.axis('equal')
    plt.show()
    return counts


# Real reviews
df_positive_label = df[df["Label"] == 1]
rating_counts = _rating_share_pie(df_positive_label, 1)

# Fake reviews (`rating_counts` ends up holding the fake-review counts)
df_negative_label = df[df["Label"] == -1]
rating_counts = _rating_share_pie(df_negative_label, -1)

In [None]:
# Boxplot: compare the character length of real and fake reviews
# `.str.len()` is the vectorized string accessor; unlike `.apply(len)` it yields
# NaN for a missing review instead of raising a TypeError.
df['Review_Length'] = df['Review'].str.len()

# Plotting the data points
plt.figure(figsize=(8, 6))
sns.boxplot(x='Label', y='Review_Length', data=df, hue='Label', palette='pastel', dodge=True)
plt.title('Review Length Distribution by Label')
plt.xlabel('Label (1: Real, -1: Fake)')
plt.ylabel('Review Length')
plt.legend(title=None)
plt.show()

In [None]:
# Line graph / time series: number of reviews per year, one line per label
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# One row per (Year, Label) pair holding the number of reviews in that group
reviews_per_year = (
    df.groupby(['Year', 'Label'])
    .size()
    .reset_index(name='count')
)

# Plotting the data points
plt.figure(figsize=(10, 6))
sns.lineplot(data=reviews_per_year, x='Year', y='count', hue='Label', marker='o')
plt.title('Temporal Trends in Number of Reviews by Label')
plt.xlabel('Year')
plt.ylabel('Number of Reviews')
plt.show()

In [None]:
# Bar graph: the words with the highest average count per fake review
from sklearn.feature_extraction.text import CountVectorizer  # was used without being imported

TOP_N_WORDS = 10  # number of words shown in the chart

# Dedicated names are used here so that the TF-IDF `vectorizer` and the feature
# matrix `X` built for the models are not overwritten by this visualization.
count_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
word_counts = count_vectorizer.fit_transform(df['Review'])

# One row per review, one column per vocabulary word
word_freq_df = pd.DataFrame(word_counts.toarray(), columns=count_vectorizer.get_feature_names_out())

# Row i of the count matrix belongs to row i of `df`, so the labels are attached
# by position rather than by index alignment.
word_freq_df['Label'] = df['Label'].to_numpy()

# Average count of each word per label; after the transpose the rows are words
# and the columns are the label values (-1 and 1)
average_word_freq = word_freq_df.groupby('Label').mean().transpose()

# Sort descending on the fake-review column (-1) so that head() returns the
# most frequent words; the default ascending order returned the rarest ones.
top_words = average_word_freq.sort_values(by=-1, ascending=False).head(TOP_N_WORDS)

# Plotting
plt.figure(figsize=(12, 6))
sns.barplot(data=top_words.reset_index(), x='index', y=-1)
plt.title('Top Words in Fake Reviews')
plt.xlabel('Words')
plt.ylabel('Average Frequency')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Word cloud built from the text of every review
from wordcloud import WordCloud

# All reviews, cast to str, joined into one space-separated string
reviews_text = ' '.join(df['Review'].astype(str))

# Configure the cloud first, then feed it the combined text
wordcloud = WordCloud(
    width=800,
    height=400,
    random_state=42,
    background_color='white',
)
wordcloud = wordcloud.generate(reviews_text)

# Plotting
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Reviews')
plt.show()

In [None]:
# Heatmap: pairwise correlation between User_id, Product_id, Rating and Label
# NOTE(review): `df_sample` is not assigned anywhere in this part of the
# notebook — confirm which cell is expected to create it.
heatmap_columns = ['User_id', 'Product_id', 'Rating', 'Label']
corr_matrix = df_sample[heatmap_columns].corr()

# Plotting the heatmap with the coefficient printed in every cell
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=.5,
)
plt.title('Heatmap: Correlation between User_Id, Product_Id, Rating, and Label')
plt.show()

In [None]:
# Bar chart: sentiment class of each review, compared between real and fake reviews
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()


def _sentiment_bucket(text):
    """Classify `text` by its compound polarity score.

    Returns 'Positive' for a compound score >= 0.5, 'Negative' for a score
    <= -0.5 and 'Neutral' for anything in between.
    """
    # Score the review once and reuse the value for both threshold checks
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.5:
        return 'Positive'
    if compound <= -0.5:
        return 'Negative'
    return 'Neutral'


# Perform sentiment analysis and classify each review
df_sample['Sentiment'] = df_sample['Review'].apply(_sentiment_bucket)

# Create a grouped bar chart
plt.figure(figsize=(10, 8))
sns.countplot(x='Sentiment', hue='Label', data=df_sample, palette={1: 'blue', -1: 'red'})
plt.title('Sentiment Analysis: Polarized Sentiment Distribution for Real and Fake Reviews')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# **Pre Model Building Tasks**

In [None]:
# Modifying the dataset based on the insights from the visualizations
# [Note: this cell creates `df_filtered`, which the text preprocessing,
#  vectorizing and train/test splitting cells read, so run it before them]
# NOTE(review): `df_sampled` is not assigned anywhere in this part of the
# notebook — confirm which cell is expected to create it.

# 1. Keep the 4- and 5-star reviews from the year 2014.
#    The year is read through `pd.to_datetime`, so the filter works whether the
#    'Date' column still holds strings or has already been converted to
#    datetimes (the `.str` accessor is not available on a datetime column).
review_year = pd.to_datetime(df_sampled['Date']).dt.year
keep_rows = df_sampled['Rating'].isin([4, 5]) & (review_year == 2014)

# 2. Drop the User_id and Product_id columns.
#    A non-inplace `drop` returns a new frame, which avoids modifying a
#    boolean-indexed slice in place (that can emit SettingWithCopyWarning).
df_filtered = df_sampled[keep_rows].drop(columns=['User_id', 'Product_id'])

# 3. Using Sentiment Analysis to filter out positive reviews for the model
from textblob import TextBlob
def calculate_sentiment(text):
    """Label `text` as 'Positive' or 'Negative' from its TextBlob polarity.

    Returns 'Positive' when the polarity is strictly greater than 0 and
    'Negative' otherwise, so a polarity of exactly 0 counts as 'Negative'.
    Adjust the threshold here if a different cut-off is needed.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    return 'Negative'

# Label every review, keep only the positive ones, then remove the helper column.
# Every step returns a new frame, so nothing is dropped in place on a filtered
# slice (the in-place drop could emit SettingWithCopyWarning).
df_filtered = (
    df_filtered
    .assign(Sentiment=df_filtered['Review'].apply(calculate_sentiment))
    .loc[lambda frame: frame['Sentiment'] == 'Positive']
    .drop(columns=['Sentiment'])
)

# **Model Building, Training, Testing and Evaluating**

In [None]:
# Support Vector Machine (linear kernel) trained on the standardized features
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

# Build and train in one step (`fit` returns the fitted estimator)
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

# Predict the labels of the held-out rows
y_pred_svm = svm_model.predict(X_test_scaled)

# Score the predictions against the true test labels
f1_svm = f1_score(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# Report accuracy and F1 score
print("SVM Model:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"F1 Score: {f1_svm:.4f}")

In [None]:
# Random Forest (100 trees) trained on the unscaled TF-IDF features
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# Build and train in one step (`fit` returns the fitted estimator)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Score the predictions against the true test labels
f1_rf = f1_score(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report accuracy and F1 score
print("\nRandom Forest Model:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"F1 Score: {f1_rf:.4f}")