In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the three raw CSV files (dataset provided by Shebuti Rayana) in one pass:
# the review records, the restaurant ID mapping, and the metadata rows.
reviews, restaurants, metadata = (
    pd.read_csv(csv_name)
    for csv_name in ("yelp_all.csv", "ProductIdMapping.csv", "metadata.csv")
)

In [1]:
# Give the raw columns clearer names; every frame is rebound to its renamed copy.
metadata = metadata.rename(
    {"prodID": "restaurantID", "label": "fakeLabel"},
    axis="columns",
)
# NOTE(review): in the reviews file the column called "fakeLabel" is renamed to
# "restaurantID" and "productID" to "userID" -- presumably the raw headers are
# mislabeled; confirm against the source CSV.
reviews = reviews.rename(
    {"Unnamed: 0": "ID", "fakeLabel": "restaurantID", "productID": "userID"},
    axis="columns",
)
restaurants = restaurants.rename(
    {"Restaurant": "restaurant", "ProductId": "restaurantID"},
    axis="columns",
)

NameError: name 'metadata' is not defined

In [None]:
# Combine the separate inputs into a single dataset:
#   * restaurant information is looked up by restaurantID and added to every review;
#   * 'fakeLabel' becomes 1 if the review was flagged as fake (negative raw label), 0 otherwise;
#   * 'rating' is copied from the metadata.
# NOTE: After several rounds of testing, plain element-wise (positional) attribution proved to be
# the simplest way to attach the metadata to the reviews. It relies on the rows of both CSV files
# being in exactly the same order, which had been validated beforehand.
complete_reviews = reviews.join(restaurants.set_index('restaurantID'), on='restaurantID')

# Positional attribution only makes sense when there is exactly one metadata row per review row.
# Fail with an explicit message instead of pandas' generic length-mismatch error.
if len(complete_reviews) != len(metadata):
    raise ValueError(
        "Cannot attach metadata element-wise: the joined reviews have "
        f"{len(complete_reviews)} rows but metadata has {len(metadata)} rows "
        "(check for duplicated restaurantID values in the restaurant mapping "
        "or mismatched input files)."
    )

# Both columns are attached by position (not by index label) so they follow the same rule.
complete_reviews['fakeLabel'] = np.where(metadata['fakeLabel'].to_numpy() < 0, 1, 0)
complete_reviews['rating'] = metadata['rating'].to_numpy()

In [None]:
# Review length (in words) is one of the model features, so a word count is added per review.
# Words are counted by splitting on runs of whitespace: repeated spaces, tabs, line breaks and
# leading/trailing blanks therefore do not distort the count (counting ' ' characters and
# adding 1 did, and it also reported 1 word for an empty review). Missing review text stays NaN.
complete_reviews['wordCount'] = complete_reviews['reviewText'].str.split().str.len()

In [None]:
# Sentiment features (positive or negative). Each review text is scored by two pre-trained
# tools -- TextBlob and VADER (from the Natural Language Toolkit) -- and the resulting score
# (between -1 and 1) is stored alongside a binary flag.
# NOTE: Flair was also evaluated and gave the most accurate results on a shortened version of the
# dataset (judged by reading random samples of reviews), but it is the most expensive to run in
# terms of computational resources. Future versions of the model are meant to use Flair sentiment
# instead of Blob and Vader.

# VADER needs its lexicon available before the analyzer can be built.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# TextBlob polarity, then its flag: 0 for a strictly negative score, 1 otherwise.
complete_reviews['sentimentBLOB'] = complete_reviews['reviewText'].apply(
    lambda text: TextBlob(text).sentiment.polarity
)
complete_reviews['BLOBPosNeg'] = np.where(complete_reviews['sentimentBLOB'] < 0, 0, 1)

# VADER compound score, then its flag using the same rule.
complete_reviews['sentimentVADER'] = complete_reviews['reviewText'].apply(
    lambda text: sid.polarity_scores(text)['compound']
)
complete_reviews['VADERPosNeg'] = np.where(complete_reviews['sentimentVADER'] < 0, 0, 1)

In [None]:
# Persist the enriched review table (review text plus the derived feature columns) to disk.
complete_reviews.to_csv(path_or_buf='yelp_processed.csv', encoding="UTF-8")

In [None]:
# Metadata-only export: identifiers, restaurant name and the review text are dropped so the
# remaining columns can feed an initial model. The review text itself is analyzed by a
# separate module at this initial stage.
metadata_processed = complete_reviews.drop(
    ['ID', 'restaurantID', 'userID', 'reviewText', 'restaurant'],
    axis="columns",
)
metadata_processed.to_csv('yelp_metadata_processed.csv', encoding="UTF-8")

# Standardized variant: the numeric variables (rating and wordCount) are rescaled on a copy,
# leaving metadata_processed untouched.
scaler = StandardScaler()
metadata_standardized = metadata_processed.copy()
for numeric_column in ('rating', 'wordCount'):
    # The same scaler object is refit for each column, so afterwards it holds the
    # statistics of the last column only.
    metadata_standardized[numeric_column] = scaler.fit_transform(metadata_standardized[[numeric_column]])
metadata_standardized.to_csv('yelp_metadata_standardized.csv', encoding="UTF-8")