In [33]:
# Import Packages

import os
import re
import numpy as np
import pandas as pd
import math
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import operator
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from scipy.stats import bernoulli


### Adding s to starting/ending of each review and Lemmatization

In [34]:
def modifytext(filename):
    f = open(os.getcwd() + "/DATASET/train/" + filename, "r")
    lemmer=WordNetLemmatizer()
    porter_stem = PorterStemmer()
    mod_text = ''
    for line in f:
        line = ' '.join([lemmer.lemmatize(word) for word in line.split()])              # Lemmatization
#         line = ' '.join([porter_stem.stem(word) for word in line.split()])               # Stemming
        line = '<s>' + ' ' + line
        mod_text += line
    
    return mod_text

### Unigram Count

In [35]:
# O(n) complexity
def unigram(corpus):
    unigram_count = {}
    total_word_count = len(corpus)

    for item in corpus: 
        if (item in unigram_count):   
            unigram_count[item] += 1
        else: 
            unigram_count[item] = 1
    
    return unigram_count

### Bigram count

In [36]:
# Complexity O(n)
def bigram(corpus):
    corpus_bigram = []
    for i in range(len(corpus)-1):
        corpus_bigram.append(corpus[i]+ ' ' + corpus[i+1])

    bigram_count = {}
    for item in corpus_bigram: 
        if (item in bigram_count):           
            bigram_count[item] += 1
        else: 
            bigram_count[item] = 1

    return bigram_count

### Bigram "+k" smoothing (+1 is too coarse) 

In [37]:
# Complexity O(1)
def k_smoothing(word1, word2, bigram_count, unigram_count, k):
    unigram_word_types = unigram_count.keys()
    bigram = word1 + ' ' + word2
    prob = 0
    if bigram in bigram_count:
        prob = float('%.4f'%(round((bigram_count[bigram] + k),2)/(unigram_count[word1] + k*len(unigram_word_types))))
        
    else:
        prob = k/(unigram_count[word1] + k*len(unigram_word_types))
        
    return prob

In [38]:
def k_smoothing_unigram(word, unigram_count, k):
    if word in unigram_count:
        prob = (unigram_count[word] + k)/(sum(unigram_count.values()) + k*len(unigram_count.keys()))
    else:
        prob = (k)/(sum(unigram_count.values()) + k*len(unigram_count.keys()))
    return prob

In [39]:
# This was tried as one of the methods but didn't perform well, so we didn't use it.

def k_smoothing_interpolation(word1, word2, bigram_count, unigram_count, k, lamda):
    
    unigram_word_types = unigram_count.keys()
    bigram = word1 + ' ' + word2
    prob = 0
    
    if bigram in bigram_count:
        prob = (1-lamda)*float('%.4f'%(round((bigram_count[bigram] + k),2)/(unigram_count[word1] + k*len(unigram_word_types)))) + lamda*(unigram_count[word2] + k)/(sum(unigram_count.values()) + k*len(unigram_count.keys()))
    elif word1 in unigram_count and word2 in unigram_count:
        prob = (1-lamda)*k/(unigram_count[word1] + k*len(unigram_word_types)) +  lamda*(unigram_count[word2] + k)/(sum(unigram_count.values()) + k*len(unigram_count.keys()))
    else:
        prob = (1-lamda)*k/(unigram_count['UNK'] + len(unigram_word_types)) + lamda*(unigram_count['UNK'] + k)/(sum(unigram_count.values()) + k*len(unigram_count.keys()))
    
    return prob

### Perplexity Score 

In [40]:
def perplexity(review, bigram_count, unigram_count, k, lamda):  # review should be sent as a list of words
    test_corpus_bigram = []
    for i in range(len(review)-1):
        test_corpus_bigram.append(review[i]+ ' ' + review[i+1])
    
    N = len(review)
    sum_prob = 0
    for bigram in test_corpus_bigram:
        str = bigram.split(' ')
        prob = k_smoothing(str[0], str[1], bigram_count, unigram_count, k)
#         prob = k_smoothing_interpolation(str[0], str[1], bigram_count, unigram_count, k, lamda)
        sum_prob += float('%.4f'%(round(math.log(prob),2)))
        
    PP = math.exp(-sum_prob/N)
    return PP

In [41]:
def perplexity_unigram(review, unigram_count, k):
    sum_prob = 0
    N = len(review)
    for word in review:
        prob = k_smoothing_unigram(word, unigram_count, k)
        sum_prob += float('%.4f'%(round(math.log(prob),2)))
    
    PP = math.exp(-sum_prob/N)
    return PP

### Removing Stop Words


In [42]:
def remove_stopwords(corpus):
    stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
#     stop_words = ['the','and','a','to','was','i','for','it','is','be','as','so','are','all','when']
    filter_corpus = []
    for word in corpus:
        if word not in stop_words:
            filter_corpus.append(word)
    return filter_corpus

### Unknown Words Addition 

In [43]:
# Used to add UNK to the training corpus
def unknown_words(corpus):
    for i in range(len(corpus)):
        toss = np.random.binomial(size=1, n=1, p= 0.01)
        if toss == 1:
            corpus[i] = 'UNK'
    
    return corpus

In [44]:
# Used to add UNK in validation and Testing corpus 
def add_unknown_words(corpus, unigram_count):
    for i in range(len(corpus)):
        if corpus[i] not in unigram_count:
            corpus[i] = 'UNK'
    return corpus

### Language Model and Perplexity based Classification

In [45]:
np.random.seed(42)
filename = 'truthful.txt'
f = open(os.getcwd() + "/DATASET/train/" + filename, "r")
text = modifytext(filename)    # adding <s> to starting and ending each review.
text = text.lower()
# text = text.replace(" '", "")
# text = text.replace(",", "")
truthful_corpus = text.split()
# truthful_corpus = remove_stopwords(truthful_corpus)
truthful_corpus = unknown_words(truthful_corpus)
f.close()

filename = 'deceptive.txt'
f = open(os.getcwd() + "/DATASET/train/" + filename, "r")
text = modifytext(filename)    # adding <s> to starting and ending each review.
text = text.lower()
# text = text.replace(" '", "")
# text = text.replace(",", "")
deceptive_corpus = text.split()
# deceptive_corpus = remove_stopwords(deceptive_corpus)
deceptive_corpus = unknown_words(deceptive_corpus)
f.close()

truthful_unigram_count = unigram(truthful_corpus)
truthful_bigram_count = bigram(truthful_corpus)

deceptive_unigram_count = unigram(deceptive_corpus)
deceptive_bigram_count = bigram(deceptive_corpus)


# Loading the validation dataset
k = 0.01                       # smoothing parameter
k1 = 1.0
lamda = 1.0
ypred = []
ytruth = []

filename = 'truthful.txt'
f = open(os.getcwd() + "/DATASET/validation/" + filename, "r")

# Complexity O(n2)

for line in f:
    text = line
    text = '<s>' + ' ' + text
    text = text.lower()
#     text = text.replace(" '", "")
#     text = text.replace(",", "")
    truthful_corpus_valid = text.split()    # Reads one review at a time
    truthful_corpus_valid_1 = add_unknown_words(truthful_corpus_valid, truthful_unigram_count)
#     truthful_corpus_valid = remove_stopwords(truthful_corpus_valid)
#     pp_truthful = perplexity(truthful_corpus_valid_1, truthful_bigram_count, truthful_unigram_count, k, lamda)
    truthful_corpus_valid_2 = add_unknown_words(truthful_corpus_valid, deceptive_unigram_count)
#     pp_deceptive = perplexity(truthful_corpus_valid_2, deceptive_bigram_count, deceptive_unigram_count, k, lamda)
    
    pp_truthful = perplexity_unigram(truthful_corpus_valid_1, truthful_unigram_count, k1)
    pp_deceptive = perplexity_unigram(truthful_corpus_valid_2, deceptive_unigram_count, k1)

    if pp_truthful < pp_deceptive:
        ypred.append(0)
    else:
        ypred.append(1)
    ytruth.append(0)
f.close()

filename = 'deceptive.txt'
f = open(os.getcwd() + "/DATASET/validation/" + filename, "r")

# Complexity O(n2)
for line in f:
    text = line
    text = '<s>' + ' ' + text
    text = text.lower()
#     text = text.replace(" '", "")
#     text = text.replace(",", "")
    deceptive_corpus_valid = text.split()          # Reads one review at a time
#     deceptive_corpus_valid = remove_stopwords(deceptive_corpus_valid)
    deceptive_corpus_valid_1 = add_unknown_words(deceptive_corpus_valid, truthful_unigram_count)
#     pp_truthful = perplexity(deceptive_corpus_valid_1, truthful_bigram_count, truthful_unigram_count, k, lamda)
    deceptive_corpus_valid_2 = add_unknown_words(deceptive_corpus_valid, deceptive_unigram_count)
#     pp_deceptive = perplexity(deceptive_corpus_valid_2, deceptive_bigram_count, deceptive_unigram_count, k, lamda)
    pp_truthful = perplexity_unigram(deceptive_corpus_valid_1, truthful_unigram_count, k1)
    pp_deceptive = perplexity_unigram(deceptive_corpus_valid_2, deceptive_unigram_count, k1)
    if pp_truthful < pp_deceptive:
        ypred.append(0)
    else:
        ypred.append(1)
    ytruth.append(1)
f.close()

acc_score = accuracy_score(ytruth, ypred)
print('Accuracy Score: %s'%(acc_score))


Accuracy Score: 0.8828125


### Model execution on test set

In [64]:
filename = 'test.txt'
f = open(os.getcwd() + "/DATASET/test/" + filename, "r")
k = 0.01                       # smoothing parameter
k1 = 1.0
ypred = []
lemmer=WordNetLemmatizer()

# Complexity O(n2)
for line in f:
    text = line
    text =' '.join([lemmer.lemmatize(word) for word in text.split()])              # Lemmatization
    text = '<s>' + ' ' + text
    text = text.lower()
#     text = text.replace(" '", "")
#     text = text.replace(",", "")
    deceptive_corpus_valid = text.split()          # Reads one review at a time
#     deceptive_corpus_valid = remove_stopwords(deceptive_corpus_valid)
    deceptive_corpus_valid_1 = add_unknown_words(deceptive_corpus_valid, truthful_unigram_count)
#     pp_truthful = perplexity(deceptive_corpus_valid_1, truthful_bigram_count, truthful_unigram_count, k, lamda)
    deceptive_corpus_valid_2 = add_unknown_words(deceptive_corpus_valid, deceptive_unigram_count)
#     pp_deceptive = perplexity(deceptive_corpus_valid_2, deceptive_bigram_count, deceptive_unigram_count, k, lamda)
    pp_truthful = perplexity_unigram(deceptive_corpus_valid_1, truthful_unigram_count, k1)
    pp_deceptive = perplexity_unigram(deceptive_corpus_valid_2, deceptive_unigram_count, k1)
    if pp_truthful < pp_deceptive:
        ypred.append(0)
    else:
        ypred.append(1)
f.close()
output = pd.DataFrame(ypred)
output.columns = ['Prediction']
output.index.name = 'Id'
path = os.getcwd() + '/result.csv'
output.to_csv(path)
output.head()


Unnamed: 0_level_0,Prediction
Id,Unnamed: 1_level_1
0,0
1,0
2,0
3,1
4,0


In [5]:
np.random.binomial(size=10, n=1, p= 0.01)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])