In [1]:
# Author: Shelly Hashali Kurera Warnakulasooriya (17043308)
# Created: 31/01/2020
# Revised: 04/04/2020
# Description: Detection of deception using Sentiment Analysis. The dataset was retrieved from Kaggle and is based on Amazon manufactured products.
# Coding: UTF-8

# 1 Detection of inauthentic content using Sentiment Analysis on product reviews

In [2]:
# Importing the libraries needed
import csv                               
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [3]:
# Function: Loading data from the text file
def loadData(path, Text=None):
    with open(path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
            
# Function: Splitting into training and testing set
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [4]:
# Function: Converting line from input file, returns a string containing the review itself and the string indicating the label
def parseReview(reviewLine):
    s=""
    if reviewLine[1]=="__label1__":
        s = "fake"
    else: 
        s = "real"
    return (reviewLine[0], reviewLine[8], s)


In [5]:
# Function: Tokenizing words of a review (text pre-processing)
def preProcess(text):
    # Should return a list of tokens
    return word_tokenize(text)

In [6]:
# Global dictionary of features
featureDict = {}

# Function: Feature Vectorization
def toFeatureVector(tokens):
    localDict = {}
    for token in tokens:
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] = +1
   
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] = +1
    
    return localDict

In [7]:
# Function: Training and validating classifier
def trainClassifier(trainData):
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [8]:
# Function: Crossvalidating
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        classifier = trainClassifier(dataset[:i]+dataset[foldSize+i:])
        y_pred = predictLabels(dataset[i:i+foldSize],classifier)
        a = accuracy_score(list(map(lambda d : d[1], dataset[i:i+foldSize])), y_pred)
        (p,r,f,_) = precision_recall_fscore_support(list(map(lambda d : d[1], dataset[i:i+foldSize])), y_pred, average ='macro')
        cv_results.append((a,p,r,f))
    cv_results = (np.mean(np.array(cv_results),axis=0))
    return cv_results

In [9]:
# Function: Predicting labels given a classifier
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

# Initial assessment

In [10]:
rawData = []          # The filtered data from the dataset file 
preprocessedData = [] # The pre-processed reviews 
trainData = []        # The training data as a percentage of the total dataset 
testData = []         # The test data as a percentage of the total dataset

# Output classes
fakeLabel = 'fake'
realLabel = 'real'

# Reference path to the text file containing the dataset
reviewPath = 'amazon_reviews.txt'

# Loading the data into a raw dataset
loadData(reviewPath) 

# Splitting the raw dataset into training and testing datasets, respectively 80% and 20%
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),sep='\n')
splitData(0.8) 
# Printing the training and testing dataset sizes, and the number of features for the training set
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),
      "Training Samples: ", len(trainData), " Number of features: ", len(featureDict), sep='\n')
print("Mean of cross-validations (Accuracy, Precision, Recall, F-score): ", crossValidate(trainData, 10))

Size of 0 training set and 0 testing set
Size of 16800 training set and 4200 testing set
Training Samples: 
16800
 Number of features: 
52962
Mean of cross-validations (Accuracy, Precision, Recall, F-score):  [0.61857143 0.61865055 0.61855507 0.61828425]




# Assessment introducing lemmatization, stop words removal and punctuations

In [11]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [12]:
table = str.maketrans({key: None for key in string.punctuation})

# Function: Text pre-processing using lemmatization, stop words removal and punctuations, returns a list of tokens
def preProcess(text):
    lemmatizer = WordNetLemmatizer()
    filtered_tokens=[]
    stop_words = set(stopwords.words('english'))
    text = text.translate(table)
    for w in text.split(" "):
        if w not in stop_words:
            filtered_tokens.append(lemmatizer.lemmatize(w.lower()))
    return filtered_tokens

In [13]:
rawData = []          # The filtered data from the dataset file 
preprocessedData = [] # The pre-processed reviews 
trainData = []        # The training data as a percentage of the total dataset 
testData = []         # The test data as a percentage of the total dataset

# Output classes
fakeLabel = 'fake'
realLabel = 'real'

# Reference path to the text file containing the dataset
reviewPath = 'amazon_reviews.txt'

# Loading the data into a raw dataset
loadData(reviewPath) 

# Splitting the raw dataset into training and testing datasets, respectively 80% and 20%
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),sep='\n')
splitData(0.8) 
# Printing the training and testing dataset sizes, and the number of features for the training set
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),
      "Training Samples: ", len(trainData), " Number of features: ", len(featureDict), sep='\n')
print("Mean of cross-validations (Accuracy, Precision, Recall, F-score): ", crossValidate(trainData, 10))

Size of 0 training set and 0 testing set
Size of 16800 training set and 4200 testing set
Training Samples: 
16800
 Number of features: 
73058
Mean of cross-validations (Accuracy, Precision, Recall, F-score):  [0.62809524 0.62796188 0.62773024 0.62757104]


# Assessment introducing bi-grams and utilising different values for C in the Linear SVC Function 

In [14]:
table = str.maketrans({key: None for key in string.punctuation})

# Function: Text pre-processing using lemmatization, stop words removal, punctuations and bi-grams, returns a list of tokens
def preProcess(text):
    lemmatizer = WordNetLemmatizer()
    filtered_tokens=[]
    lemmatized_tokens = []
    stop_words = set(stopwords.words('english'))
    text = text.translate(table)
    for w in text.split(" "):
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w.lower()))
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
    return filtered_tokens

In [15]:
# Function: Training and validating classifier
def trainClassifier(trainData):
    pipeline =  Pipeline([('svc', LinearSVC(C=0.01))])
    return SklearnClassifier(pipeline).train(trainData)

In [16]:
rawData = []          # The filtered data from the dataset file 
preprocessedData = [] # The pre-processed reviews 
trainData = []        # The training data as a percentage of the total dataset 
testData = []         # The test data as a percentage of the total dataset

# Output classes
fakeLabel = 'fake'
realLabel = 'real'

# Reference path to the text file containing the dataset
reviewPath = 'amazon_reviews.txt'

# Loading the data into a raw dataset
loadData(reviewPath) 

# Splitting the raw dataset into training and testing datasets, respectively 80% and 20%
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),sep='\n')
splitData(0.8) 
# Printing the training and testing dataset sizes, and the number of features for the training set
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),
      "Training Samples: ", len(trainData), " Number of features: ", len(featureDict), sep='\n')
print("Mean of cross-validations (Accuracy, Precision, Recall, F-score): ", crossValidate(trainData, 10))

Size of 0 training set and 0 testing set
Size of 16800 training set and 4200 testing set
Training Samples: 
16800
 Number of features: 
542871
Mean of cross-validations (Accuracy, Precision, Recall, F-score):  [0.69232143 0.69386122 0.69240455 0.69167888]


# Assessment taking into consideration different features such as rating, verified purchase and product category

In [17]:
# Function: Loading data from the text file (considering the different columns)
def loadData(path, Text=None):
    with open(path, encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)
        for line in reader:
            (Id, Rating, verified_Purchase, product_Category, Text, Label) = parseReview(line)
            rawData.append((Id, Rating, verified_Purchase, product_Category, Text, Label))
        
# Function: Splitting 
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Rating, verified_Purchase, product_Category, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(Rating, verified_Purchase, product_Category, preProcess(Text)),Label))
    for (_, Rating, verified_Purchase, product_Category, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(Rating, verified_Purchase, product_Category, preProcess(Text)),Label))

In [18]:
# Function: Converting line from input file, identifies label and returns the different columns and the class
def parseReview(reviewLine):
    s=""
    if reviewLine[1]=="__label1__":
        s = "fake"
    else: 
        s = "real"
    return (reviewLine[0], reviewLine[2], reviewLine[3],reviewLine[4], reviewLine[8], s)


In [19]:
table = str.maketrans({key: None for key in string.punctuation})

# Function: Text pre-processing using lemmatization, stop words removal, punctuations and bi-grams, returns a list of tokens
def preProcess(text):
    # Should return a list of tokens
    lemmatizer = WordNetLemmatizer()
    filtered_tokens=[]
    lemmatized_tokens = []
    stop_words = set(stopwords.words('english'))
    text = text.translate(table)
    for w in text.split(" "):
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w.lower()))
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
    return filtered_tokens

In [20]:
# Function: Feature Vectorization considering rating, verified purchase, category and text
featureDict = {} # Global dictionary of features

def toFeatureVector(Rating, verified_Purchase, product_Category, tokens):
    localDict = {}
    
#Rating
    featureDict["R"] = 1   
    localDict["R"] = Rating

#Verified_Purchase 
    featureDict["VP"] = 1      
    if verified_Purchase == "N":
        localDict["VP"] = 0
    else:
        localDict["VP"] = 1

#Product_Category 
    if product_Category not in featureDict:
        featureDict[product_Category] = 1
    else:
        featureDict[product_Category] = +1
            
    if product_Category not in localDict:
        localDict[product_Category] = 1
    else:
        localDict[product_Category] = +1         
#Text        
    for token in tokens:
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] = +1
            
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] = +1
    
    return localDict

In [21]:
rawData = []          # The filtered data from the dataset file 
preprocessedData = [] # The pre-processed reviews 
trainData = []        # The training data as a percentage of the total dataset 
testData = []         # The test data as a percentage of the total dataset

# Output classes
fakeLabel = 'fake'
realLabel = 'real'

# Reference path to the text file containing the dataset
reviewPath = 'amazon_reviews.txt'

# Loading the data into a raw dataset
loadData(reviewPath) 

# Splitting the raw dataset into training and testing datasets, respectively 80% and 20%
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),sep='\n')
splitData(0.8) 
# Printing the training and testing dataset sizes, and the number of features for the training set
print("Size of %d training set and %d testing set" % (len(trainData), len(testData)),
      "Training Samples: ", len(trainData), " Number of features: ", len(featureDict), sep='\n')
print("Mean of cross-validations (Accuracy, Precision, Recall, F-score): ", crossValidate(trainData, 10))

Size of 0 training set and 0 testing set
Size of 16800 training set and 4200 testing set
Training Samples: 
16800
 Number of features: 
512245
Mean of cross-validations (Accuracy, Precision, Recall, F-score):  [0.81934524 0.82037159 0.81932087 0.81911599]


# Testing on final classifier

In [22]:
#  Testing classifier
classifier = trainClassifier(trainData)
predictions = predictLabels(testData, classifier)
true_labels = list(map(lambda d: d[1], testData))
a = accuracy_score(true_labels, predictions)
p, r, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
print("Accuracy: ", a)
print("Precision: ", p)
print("Recall: ", a)
print("F1-score: ", f1)

Accuracy:  0.8042857142857143
Precision:  0.8080454049606811
Recall:  0.8042857142857143
F1-score:  0.8036867139280308
