# Import:

In [1]:
#import libraries
import csv                               # csv reader
import re
import numpy as np
from random import shuffle
import enchant
import math

# PRE-PROCESSING
from nltk import bigrams
from nltk.classify import SklearnClassifier
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.metrics import edit_distance

# FEATURES
import spacy
from collections import Counter, OrderedDict
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sp = spacy.load("en_core_web_sm")

# CLASSIFICATION
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics

# Load and Split Data:

In [2]:
# load data and append to rawData
def loadData(file, Text=None):
    """Read the review CSV at ``file`` and return its data rows as parsed tuples.

    The first row is treated as a header and skipped without being parsed;
    every remaining row is converted with ``parseReview``.

    Args:
        file: path of the CSV file to read (opened as UTF-8).
        Text: unused; kept so existing calls that pass it keep working.

    Returns:
        A list holding one ``parseReview`` result per data row, in file order.
    """
    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded inside quoted review text are parsed correctly.
    with open(file, encoding="utf8", newline="") as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)  # skip the header row (no-op for an empty file)
        return [parseReview(line) for line in reader]

# convert a CSV row into a (text, polarity, label) triple
def parseReview(line):
    """Pick the review fields out of one CSV row.

    Column 0 is read as the label, column 2 as the polarity and column 4 as
    the review text; the result is ordered ``(text, polarity, label)``.
    """
    label, polarity, text = line[0], line[2], line[4]
    return (text, polarity, label)


# A method to split the data between trainData and testData
def splitData(data,percentage):
    """Distribute ``data`` over the module-level ``trainData`` / ``testData`` lists.

    Records whose position is below ``len(data) * percentage`` are appended to
    ``trainData``; records at or beyond that position go to ``testData``.
    Both lists are extended in place and nothing is returned.

    Args:
        data: iterable of ``(Text, Polarity, Label)`` records.
        percentage: fraction of the records that goes to the training list.
    """
    cutoff = len(data) * percentage
    for position, (Text, Polarity, Label) in enumerate(data):
        record = (Text, Polarity, Label)
        if position < cutoff:
            trainData.append(record)
        elif position >= cutoff:
            testData.append(record)

# Preprocessing:

In [3]:
# TEXT PREPROCESSING

# Input: a string of one review
def preProcess(text):
    """Turn one raw review into stemmed word bigrams plus its spaCy document.

    The text is stripped of ``<br />`` markers and the listed punctuation
    characters, lower-cased and split on whitespace. The tokens then pass
    through ``removeStop`` and ``stemTokens`` before adjacent tokens are
    paired into bigrams.

    Args:
        text: the text of one review, as a string.

    Returns:
        ``(grams, doc)``: ``grams`` is the list of bigram tuples built from the
        filtered tokens, and ``doc`` is ``sp(text)`` for the unmodified text.
    """
    doc = sp(text)                             # spaCy document of the raw text

    # tokenise
    clean = re.sub(r"(<br />)"," ",text)                 # replace line-break markers with a space
    clean = re.sub(r"([@.$&#%~\*_=\[\]/,\-\+>;:!?'\"”\)\(])"," ",clean) # replace punctuation with a space
    clean = clean.lower()                                # lower-case everything
    # re.split yields an empty string at either end when the cleaned text
    # starts or ends with whitespace (e.g. a review ending in "."); drop those
    # so they do not end up as ('word', '') bigram features.
    tokens = [t for t in re.split(r"\s+", clean) if t]

    tokens = removeStop(tokens)                          # remove stopwords
    # NOTE: spell correction (spellCorrect) is deliberately not applied here.
    tokens = stemTokens(tokens)                          # stem tokens

    grams = list(bigrams(tokens))                        # adjacent-token pairs via nltk.bigrams

    return (grams, doc)


In [4]:
#OTHER PRE-PROCESSING METHODS

#remove stopwords
def removeStop(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    The relative order of the surviving tokens is preserved and a new list is
    returned; ``tokens`` itself is not modified.
    """
    english_stops = set(stopwords.words('english'))    # stop-word list from nltk.corpus
    kept = []
    for token in tokens:
        if token not in english_stops:
            kept.append(token)
    return kept

def stemTokens(tokens):
    """Stem every token with NLTK's Porter stemmer.

    ``tokens`` is modified in place (each element is overwritten with its
    stem) and the same object is returned.
    """
    porter = PorterStemmer()
    for position, token in enumerate(tokens):
        tokens[position] = porter.stem(token)
    return tokens
    
def spellCorrect(tokens):
    """Return ``tokens`` with empty strings dropped and misspellings corrected.

    Each non-empty token is passed through ``replaceToken`` together with a
    PyEnchant English dictionary.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list of tokens. A plain list is always returned; building the
        result with ``np.append`` copied the whole array on every token and
        produced a NumPy array for non-empty input but a list for empty input.
    """
    spell_dict = enchant.Dict('en')                    # PyEnchant English dictionary
    return [replaceToken(t, spell_dict) for t in tokens if t != ""]
            
def replaceToken(word,d):
    """Return a spelling correction for ``word`` when exactly one is plausible.

    Args:
        word: the token to check.
        d: dictionary object providing ``check(word)`` and ``suggest(word)``.

    Returns:
        ``word`` unchanged if ``d.check(word)`` accepts it. Otherwise the
        single non-empty suggestion whose edit distance to ``word`` is at most
        1; if there is no such suggestion, or more than one, ``word`` is
        returned unchanged so that an ambiguous correction is never applied.
    """
    if d.check(word):
        return word

    candidates = []
    for suggestion in d.suggest(word):
        if suggestion and edit_distance(word, suggestion) <= 1:
            candidates.append(suggestion)
            if len(candidates) > 1:
                # A second candidate already makes the correction ambiguous;
                # no need to compute further edit distances.
                break

    return candidates[0] if len(candidates) == 1 else word
                                          

# Feature Extraction and Vectorisation:

In [5]:
# function to get text features
def getFeatures(tokens,doc):
    """Derive entity, part-of-speech and sentiment features from a spaCy doc.

    Args:
        tokens: unused; kept so existing callers keep working.
        doc: spaCy document of the review (``doc.ents``, ``doc.text`` and the
            per-token ``pos_`` attribute are read).

    Returns:
        ``(feature_tokens, sent_dict)``:
        ``feature_tokens`` is a list of tuple keys: one ``(label, "")`` per
        named entity followed by the bigrams of the tokens' ``pos_`` tags.
        ``sent_dict`` maps ``("neg","")``, ``("neu","")`` and ``("pos","")``
        to the matching VADER polarity scores of ``doc.text``.
    """
    # Named-entity labels, paired with "" so every feature key is a 2-tuple.
    feature_tokens = [(ent.label_, "") for ent in doc.ents]

    # sentiment analysis
    # The analyzer does not depend on the review, so build it once and keep it
    # on the function instead of constructing a new one for every call.
    sid = getattr(getFeatures, "_analyzer", None)
    if sid is None:
        sid = getFeatures._analyzer = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(doc.text)
    sent_dict = {
      ("neg",""): ss['neg'],
      ("neu",""): ss['neu'],
      ("pos",""): ss['pos']
    }

    # Bigrams over the tokens' UPOS tags (token.pos_).
    tagged_tokens = [token.pos_ for token in doc]
    feature_tokens.extend(bigrams(tagged_tokens))

    return feature_tokens,sent_dict


# function to return only dictionary items with a value over a minimum doc count
def min_doc_freq(counts,k):
    """Return the entries of ``counts`` whose value is at least ``k``.

    A new dict is built; ``counts`` is left untouched and the surviving keys
    keep their original iteration order.
    """
    return {term: counts[term] for term in counts if counts[term] >= k}
    

# function to create feature dictionary
def toFeatureVector(tokens,Polarity,extra_features=None):
    """Build the feature dictionary for one pre-processed review.

    Args:
        tokens: ``(grams, doc)`` pair as returned by ``preProcess``.
        Polarity: polarity value of the review; stored as the feature key
            ``('polarity', Polarity)`` with value 1.
        extra_features: unused. The extra features are always taken from
            ``getFeatures``; the parameter is kept only for backward
            compatibility (its default is ``None`` rather than a shared
            mutable list).

    Returns:
        A dict mapping feature keys to values: occurrence counts of the word
        bigrams, the polarity marker, the sentiment scores from
        ``getFeatures`` and one count per extra feature token it returns.
    """
    # separate the bigrams from the spaCy document (the doc feeds getFeatures)
    doc = tokens[1]
    grams = tokens[0]

    counts = dict(Counter(grams))            # for now a simple count
    counts[('polarity', Polarity)] = 1

    features, sent_dict = getFeatures(grams, doc)
    counts.update(sent_dict)                 # add the sentiment scores

    # one count per occurrence of each extra feature
    for feature in features:
        counts[feature] = counts.get(feature, 0) + 1
    return counts

In [6]:
def featureVectorisation(corpus, fitting=False):
    """Convert a corpus into a document-feature matrix.

    Uses the module-level ``corpusVectorizer``.
    NOTE(review): ``corpusVectorizer`` is not defined in the visible part of
    this file; it must exist before this function is called.

    Args:
        corpus: iterable of ``(Text, Polarity, Label)`` records where ``Text``
            is what ``toFeatureVector`` expects as its first argument.
        fitting: when true, ``corpusVectorizer`` is fitted on the corpus
            before it is used to transform it.

    Returns:
        The result of ``corpusVectorizer.transform`` on the feature dicts.
    """
    # Extract the features once: doing it separately for fit() and transform()
    # doubled the work and left nothing to transform when ``corpus`` was a
    # one-shot iterable.
    feature_dicts = [toFeatureVector(Text,Polarity) for Text,Polarity,Label in corpus]
    if fitting:
        corpusVectorizer.fit(feature_dicts)
    return corpusVectorizer.transform(feature_dicts)


# Classification:

In [7]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Fit the TF-IDF -> chi2 feature selection -> linear SVM pipeline.

    Args:
        trainData: training samples handed unchanged to
            ``SklearnClassifier.train``.

    Returns:
        Whatever ``SklearnClassifier(pipeline).train(trainData)`` returns;
        callers use it as the trained classifier.
    """
    print("Training Classifier...")
    steps = [
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=50000)),
        ('svc', LinearSVC(loss = 'hinge')),
    ]
    wrapped = SklearnClassifier(Pipeline(steps))
    return wrapped.train(trainData)

In [8]:
# perform cross-validation
def crossValidate(data, folds):
    """Estimate classifier quality with k-fold cross-validation.

    ``data`` is shuffled in place and cut into ``folds`` consecutive slices of
    equal size. Each slice is held out once while a classifier from
    ``trainClassifier`` is fitted on everything else. Samples left over after
    the last full slice are only ever used for training.

    Args:
        data: list of samples; element 0 of each sample is its feature set and
            element 1 its label.
        folds: number of folds.

    Returns:
        ``[precision, recall, f-score]``: the weighted scores averaged over
        all folds.
    """
    shuffle(data)
    fold_len = len(data) // folds
    per_fold = []

    for fold_no in range(folds):
        # boundaries of the slice held out in this fold
        start = fold_no * fold_len
        stop = start + fold_len
        print("\nFold %d: %d - %d" % (fold_no+1, start, stop))

        held_out = data[start:stop]
        training = data[:start] + data[stop:]
        gold = [sample[1] for sample in held_out]          # ground-truth labels

        clf = trainClassifier(training)
        predicted = clf.classify_many(sample[0] for sample in held_out)

        per_fold.append(metrics.precision_recall_fscore_support(gold, predicted, average='weighted'))

    # average precision, recall and f-score (positions 0-2 of each result)
    avgResults = [np.mean([scores[i] for scores in per_fold]) for i in range(3)]

    print("Cross-Validation Results:")
    print("Average Precision: %f\nAverage Recall: %f\nAverage F-Score:%f" % (avgResults[0],avgResults[1],avgResults[2]))
    return avgResults


In [9]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify a batch of samples.

    Element 0 of every sample is passed, lazily and in order, to
    ``classifier.classify_many``; its result is returned unchanged.
    """
    feature_sets = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(feature_sets)

def predictLabel(reviewSample, classifier, Polarity=None):
    """Classify a single raw review text.

    The text is run through ``preProcess`` and ``toFeatureVector`` and the
    resulting feature dict is handed to ``classifier.classify``.

    Args:
        reviewSample: raw text of one review.
        classifier: object providing ``classify(featureset)``.
        Polarity: polarity value forwarded to ``toFeatureVector``. It is
            optional so existing two-argument calls keep their signature;
            without it the call to ``toFeatureVector`` failed with a
            ``TypeError`` because ``Polarity`` is a required argument there.
            NOTE(review): with the default ``None`` the polarity feature key
            is ``('polarity', None)`` — confirm that is acceptable for the
            classifier in use.

    Returns:
        The result of ``classifier.classify`` for the review.
    """
    return classifier.classify(toFeatureVector(preProcess(reviewSample), Polarity))

# Main

In [10]:
trainData = []        # raw (Text, Polarity, Label) training samples; filled in place by splitData() (80% with the split below)
testData = []         # raw (Text, Polarity, Label) test samples; filled in place by splitData() (the remaining 20%)

# The output classes
fakeLabel = 'deceptive'
realLabel = 'truthful'

# Path of the CSV file holding the reviews
reviewPath = 'deceptive-opinion.csv'

# Parse the dataset into a list of (Text, Polarity, Label) tuples and shuffle it
# so the train/test split below is random
print("Loading dataset...")
rawData = loadData(reviewPath)
shuffle(rawData)
print("   Number of Data Samples:",len(rawData))
print("   First Raw Data Sample:\n\n",rawData[0])

# Split the raw dataset into training data and test data (80/20);
# splitData appends to the module-level trainData / testData lists
print("\n\nSplitting dataset...")
splitData(rawData,0.8)
print("   Number of Training Data Samples:",len(trainData),"\n   Number of Test Data Samples:",len(testData))


# Pre-process the training data: each Text is replaced by the (bigrams, spaCy doc)
# pair returned by preProcess; Polarity and Label are carried along unchanged
print("\n\nPre-processing dataset...")
preProcessed_trainData = []
for Text,Polarity,Label in trainData:
        preProcessed_trainData.append((preProcess(Text),Polarity,Label))
print("   First Pre-processed Data Sample:\n\n",preProcessed_trainData[0])

Loading dataset...
   Number of Data Samples: 1600
   First Raw Data Sample:

 ("While visiting the Chicago area, we chose the Hotel Monaco Chicago for our stay. As one of the premier luxury hotels in Chicago, we held high expectations but were quickly disappointed in what we recieved. The problems began as soon as we arrived. The attendant was not friendly nor did she have the smile that is suppose to be the symbol of Kimpton. And, when we went to our room only to discover that we had not been given what we had paid for, the situation only got worse. Rather than apologize for the error (my credit card receipt was on hand) we were blamed for the indescrpencies in rooms and told that not only was the room we had reserved and paid for more than two months unavailable, we would not recieve a refund of the price difference of the two rooms. I could not believe that a top 40 U.S. hotel that is suppose to be committed to excellence could make such a blunder! With no choice but to retreat bac

In [11]:
# Extract features from the pre-processed training data: every sample becomes a
# (feature dict, Label) pair. Here Text is the (bigrams, spaCy doc) pair from preProcess.
print("\n\nExtracting Features from dataset...")
final_trainData = []
for Text,Polarity,Label in preProcessed_trainData:
        final_trainData.append(((toFeatureVector(Text,Polarity)),Label))
# NOTE: despite the "rawData" caption this prints the first (feature dict, Label) pair
print("   First rawData Sample:\n\n",final_trainData[0])



Extracting Features from dataset...
   First rawData Sample:

 ({('visit', 'chicago'): 1, ('chicago', 'area'): 1, ('area', 'chose'): 1, ('chose', 'hotel'): 1, ('hotel', 'monaco'): 1, ('monaco', 'chicago'): 1, ('chicago', 'stay'): 1, ('stay', 'one'): 1, ('one', 'premier'): 1, ('premier', 'luxuri'): 1, ('luxuri', 'hotel'): 1, ('hotel', 'chicago'): 2, ('chicago', 'held'): 1, ('held', 'high'): 1, ('high', 'expect'): 1, ('expect', 'quickli'): 1, ('quickli', 'disappoint'): 1, ('disappoint', 'reciev'): 1, ('reciev', 'problem'): 1, ('problem', 'began'): 1, ('began', 'soon'): 1, ('soon', 'arriv'): 1, ('arriv', 'attend'): 1, ('attend', 'friendli'): 1, ('friendli', 'smile'): 1, ('smile', 'suppos'): 1, ('suppos', 'symbol'): 1, ('symbol', 'kimpton'): 1, ('kimpton', 'went'): 1, ('went', 'room'): 1, ('room', 'discov'): 1, ('discov', 'given'): 1, ('given', 'paid'): 1, ('paid', 'situat'): 1, ('situat', 'got'): 1, ('got', 'wors'): 1, ('wors', 'rather'): 1, ('rather', 'apolog'): 1, ('apolog', 'error'):

In [12]:
# Run 10-fold cross-validation on the training features. crossValidate shuffles
# final_trainData in place and returns the averaged [precision, recall, f-score].
cv_results = crossValidate(final_trainData,10)


Fold 1: 0 - 128
Training Classifier...

Fold 2: 128 - 256
Training Classifier...

Fold 3: 256 - 384
Training Classifier...

Fold 4: 384 - 512
Training Classifier...

Fold 5: 512 - 640
Training Classifier...

Fold 6: 640 - 768
Training Classifier...

Fold 7: 768 - 896
Training Classifier...

Fold 8: 896 - 1024
Training Classifier...

Fold 9: 1024 - 1152
Training Classifier...

Fold 10: 1152 - 1280
Training Classifier...
Cross-Validation Results:
Average Precision: 0.831117
Average Recall: 0.826562
Average F-Score:0.826409


# Evaluate on test set

In [13]:
# Final evaluation: the classifier is trained on all the training data and
# scored on the held-out test set. This cell prepares the test set.
print("   First Test Data Sample:\n\n",testData[0])

# Pre-process the test data the same way as the training data: each Text is
# replaced by the (bigrams, spaCy doc) pair returned by preProcess
print("\n\nPre-processing dataset...")
preProcessed_testData = []
for Text,Polarity,Label in testData:
        preProcessed_testData.append((preProcess(Text),Polarity,Label))
print("   First Pre-processed Test Data Sample:\n\n",preProcessed_testData[0])

   First Test Data Sample:

 ('The hotel was undergoing renovations so it was dirty and noisy. The entrance was really scary since it was covered with scaffolding, making it dark. They charge a lot for internet hook up, wifi is not included. The staff was very slow, I am assuming they were all new. They could not say where anything was or what attractions were nearby. The room was set up weird, the bathroom opened facing the bed, rather than the hall as most do. Room temp was hard to control, even though they had a digital thermostat. Overall I was very disappointed, had I been staying more than one night I would have switched hotels.\n', 'negative', 'truthful')


Pre-processing dataset...
   First Pre-processed Test Data Sample:

 (([('hotel', 'undergo'), ('undergo', 'renov'), ('renov', 'dirti'), ('dirti', 'noisi'), ('noisi', 'entranc'), ('entranc', 'realli'), ('realli', 'scari'), ('scari', 'sinc'), ('sinc', 'cover'), ('cover', 'scaffold'), ('scaffold', 'make'), ('make', 'dark'), ('da

In [14]:
# Extract features from the pre-processed test data: every sample becomes a
# (feature dict, Label) pair. Here Text is the (bigrams, spaCy doc) pair from preProcess.
print("\n\nExtracting Features from Test dataset...")
final_testData = []
for Text,Polarity,Label in preProcessed_testData:
        final_testData.append(((toFeatureVector(Text,Polarity)),Label))
# NOTE: despite the "rawData" caption this prints the first (feature dict, Label) pair
print("   First rawData Sample:\n\n",final_testData[0])



Extracting Features from Test dataset...
   First rawData Sample:

 ({('hotel', 'undergo'): 1, ('undergo', 'renov'): 1, ('renov', 'dirti'): 1, ('dirti', 'noisi'): 1, ('noisi', 'entranc'): 1, ('entranc', 'realli'): 1, ('realli', 'scari'): 1, ('scari', 'sinc'): 1, ('sinc', 'cover'): 1, ('cover', 'scaffold'): 1, ('scaffold', 'make'): 1, ('make', 'dark'): 1, ('dark', 'charg'): 1, ('charg', 'lot'): 1, ('lot', 'internet'): 1, ('internet', 'hook'): 1, ('hook', 'wifi'): 1, ('wifi', 'includ'): 1, ('includ', 'staff'): 1, ('staff', 'slow'): 1, ('slow', 'assum'): 1, ('assum', 'new'): 1, ('new', 'could'): 1, ('could', 'say'): 1, ('say', 'anyth'): 1, ('anyth', 'attract'): 1, ('attract', 'nearbi'): 1, ('nearbi', 'room'): 1, ('room', 'set'): 1, ('set', 'weird'): 1, ('weird', 'bathroom'): 1, ('bathroom', 'open'): 1, ('open', 'face'): 1, ('face', 'bed'): 1, ('bed', 'rather'): 1, ('rather', 'hall'): 1, ('hall', 'room'): 1, ('room', 'temp'): 1, ('temp', 'hard'): 1, ('hard', 'control'): 1, ('control', 'e

In [15]:
# Train the classifier on all the training data, then evaluate it on the test set
classifier = trainClassifier(final_trainData)  # fit on every training sample
testTrue = [t[1] for t in final_testData]   # ground-truth labels (element 1 of each test sample)
testPred = predictLabels(final_testData, classifier)  # predicted labels for the test feature dicts
finalScores = metrics.precision_recall_fscore_support(testTrue, testPred, average='weighted') # weighted precision / recall / f-score / support
print("Done training!")
# only the first three entries (precision, recall, f-score) are printed
print("\nPrecision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

Training Classifier...
Done training!

Precision: 0.823861
Recall: 0.815625
F Score:0.814676
