In [1]:
import csv
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk import ngrams

In [4]:
#Read a whole text file as a flat list of whitespace-separated tokens
def readData(fileName):
    """Return every whitespace-separated token found in a text file.

    Args:
        fileName: Path of the text file to read.

    Returns:
        list[str]: The tokens in file order. Splitting uses ``str.split()``
        with no argument, so any run of whitespace (spaces, tabs, newlines)
        separates tokens and line boundaries are not preserved.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(fileName, "r") as file:
        return file.read().split()

#Read a text file as a list of lines (each line is later treated as one review)
def readTestFile(fileName):
    """Return the lines of a text file, one list entry per line.

    Args:
        fileName: Path of the text file to read.

    Returns:
        list[str]: The lines in file order, exactly as ``readlines()``
        yields them (trailing newline characters are kept).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when readlines() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(fileName, "r") as file:
        return file.readlines()


#Create n-gram count features for one tokenised text
def create_ngram_features(words, max_n=5):
    """Count every n-gram of order 1..``max_n`` occurring in ``words``.

    Args:
        words: Sequence of tokens (must support ``len()``).
        max_n: Highest n-gram order to include. The default of 5 yields
            unigrams through pentagrams, matching the original hard-coded
            feature set; pass a smaller value instead of commenting out
            code to drop the higher orders.

    Returns:
        dict: Maps each n-gram (a tuple of tokens) to the number of times
        it occurs in ``words``, plus the string key ``'total_word_count'``
        mapped to ``len(words)``.
    """
    my_dict = {}

    # One pass per order, lowest first, so keys are inserted in the same
    # order as the former five copy-pasted loops.
    for order in range(1, max_n + 1):
        for ng in ngrams(words, order):
            my_dict[ng] = my_dict.get(ng, 0) + 1

    # String key, so it cannot collide with the tuple-valued n-gram keys.
    my_dict['total_word_count'] = len(words)
    return my_dict

#Load the corpora; each line of a file is treated as one review
test = readTestFile("DATASET/test/test.txt")

#Training reviews, one file per class
truthful = readTestFile("DATASET/train/truthful.txt")
deceptive = readTestFile("DATASET/train/deceptive.txt")

#Validation reviews, used below to measure accuracy
truthfulVal = readTestFile("DATASET/validation/truthful.txt")
deceptiveVal = readTestFile("DATASET/validation/deceptive.txt")


def _labelled_features(reviews, label):
    """Turn review lines into (n-gram feature dict, label) pairs.

    Each review is split on whitespace and passed to create_ngram_features;
    the same ``label`` is attached to every resulting feature dict.
    """
    return [(create_ngram_features(review.split()), label) for review in reviews]


truthful_data = _labelled_features(truthful, "truthful")
deceptive_data = _labelled_features(deceptive, "deceptive")

truthful_val_data = _labelled_features(truthfulVal, "truthful")
deceptive_val_data = _labelled_features(deceptiveVal, "deceptive")

train_set = truthful_data + deceptive_data
#NOTE: despite its name, test_set holds the labelled *validation* reviews;
#the reviews read from DATASET/test/test.txt are kept in `test`.
test_set = truthful_val_data + deceptive_val_data

classifier = NaiveBayesClassifier.train(train_set)

#Accuracy of the trained classifier on the validation reviews
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print('n-gram accuracy:', accuracy)

n-gram accuracy: 0.89453125


In [5]:
#Build the n-gram feature dict for every review line in the test corpus
test_data = [create_ngram_features(line.split()) for line in test]

In [7]:
#Classify each test review, then tally how many were predicted 'truthful'
list_results = [classifier.classify(features) for features in test_data]
truth_review = sum(1 for label in list_results if label == 'truthful')

In [None]:
#Write the predictions to CSV, one "<row index>, <code> " line per review:
#code 0 for a 'truthful' prediction, 1 for anything else
with open("Prediction6.csv", "w") as file:
    for index, review in enumerate(list_results):
        code = 0 if review == 'truthful' else 1
        # Spacing is kept exactly as before so the output file is unchanged
        file.write(str(index) + ", " + str(code) + " \n")
#No explicit close() needed: leaving the with-block already closed the file