In [78]:
#Open the dataset 'a1_d3.txt', it is stored in the same directory.
import io
data_path = './a1_d3.txt'
with open(data_path, 'r') as f:
    lines = f.read().split('\n')

#'lines' stores the instances in a raw format.
# print(lines)

In [79]:
#######Preprocessing
#Some basic preprocessing steps to work with a better version of the data.

from string import punctuation 

#Create a punctuation list, which would help in identifying the punctuations
punct =[]
punct += list(punctuation)
punct += '’'
punct.remove("'")

#This function removes the punctuations from the input text and returns the processed text.
def remove_punctuations(text):
    for punctuation in punct:
        text = text.replace(punctuation, ' ')
    return text

#These are the common leftovers, after removing stopwords or punctuations
leftovers = ["'ve","'s","'m","i'm","n't"]

#These are the common stopwords for English Language, which are needed to be removed from the Vocabulary corpus.
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
             "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
             "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
             "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
             "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
             "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both",
             "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
             "very", "s", "t", "can", "will", "just", "don", "should", "now"]

_stopwords = set(stopwords+leftovers)

def Process(lines):
    processedLines=[]
    for line in lines:
      processedLines.append(ProcessUtil(line))
    return processedLines

#This utility function does the following steps:
########### 1. converts text to lower case
########### 2. removes punctuations
########### 3. tokenises the sentences into words.
def ProcessUtil(line):

    line = line.lower() # convert text to lower-case
    line = remove_punctuations(line) #remove punctuations
    line = line.split() # tokenise the line into words by splitting whitespaces.
    return [word for word in line if word not in _stopwords]

processedLines = Process(lines)


In [80]:
processedData=[]
# [['a',''b'.....'label(0/1)'],[]]
for l in processedLines:
  if(len(l)):
    words = []
    label = -1
    for x in range(len(l)-1):
      words.append(l[x])
    label = l[len(l)-1]
    entry=[]
    entry.append(words)
    entry.append(label)
    processedData.append(entry)

# print(processedData)

#processedData is a list of listZs'
#each listZ is a list containing 2 elements, 0th element is the list of words(tokens), 1st element is the associated label




In [81]:
#This function creates a word-frequency dictionary for both the classes
def CreateWordcountDicts(training_instances):
    class0Wordcount={}
    class1Wordcount={}

    
    for i in training_instances:
      label = i[1]
    
      #if label is 1, add to class1 dictionary
      if(label=='1'):
        words = i[0]
        for w in words:
          if w not in class1Wordcount.keys():
            class1Wordcount[w]=1
          else:
            class1Wordcount[w]+=1

      #if label is 0, add to class0 dictionary
      if(label=='0'):
        words = i[0]
        for w in words:
          if w not in class0Wordcount.keys():
            class0Wordcount[w]=1
          else:
            class0Wordcount[w]+=1

    return class0Wordcount,class1Wordcount

In [82]:
#This function creates a word-frequency dictionary for the training instances
def GetListofDistinctWords(testing_instances):
    #This would be a list of dicts.
    wordsInTestData=[]
    
    for i in testing_instances:
      words=i[0]
      toAdd={}
      for w in words:
        if w not in toAdd.keys():
          toAdd[w]=1
        else:
          toAdd[w]+=1
      
      wordsInTestData.append(toAdd)
        
    return wordsInTestData


In [83]:
#This function just extracts the labels associated with the training instances.
def GetActualLabels(testing_instances):
    actual = []
    for i in testing_instances:
        #Store the actual class label of testing instance in actual list.
        label = i[1]
        actual.append(label)
        
    return actual

In [84]:
#This function calculates the prior probablities for both classes, using the training instances.
def CalcPriors(training_instances):

    prior1=0
    prior0=0
    numClass1=0
    numClass0=0

    for i in training_instances:
      label=i[1]
      if(label=='1'):
        numClass1+=1
      if(label=='0'):
        numClass0+=1

    total = numClass1 + numClass0

    prior1 = numClass1/total
    prior0 = numClass0/total

#     print(prior1)
#     print(prior0)

    return prior1, prior0

In [85]:
#This is the function that implements the Naive Bayes Classifier
#The major steps involved are shown in the function definition below.
def Classify(class1Wordcount,class0Wordcount,wordsInTestData,prior1,prior0):

    predictions = []
    V = len(class1Wordcount.keys())+len(class0Wordcount.keys())
    alpha = 0.001

    for i in wordsInTestData:
        
        words = i.keys()
        ###For Class 1
        #Step1 - Finding Conditional Probabilities 

        WordCntClass1=sum(class1Wordcount.values())

        #calculating P(Xi/y=1) = freq(Xi)/(Word count of all words in class 1)
        
        #Likelihood function P(X/y=1) = product of all P(Xi/y=1)
        likelihood1=1
        
        for word in words:
          if word in class1Wordcount.keys():
            likelihood1*=(alpha+class1Wordcount[word])/(V+WordCntClass1)
          #Check else condition daalna hai ya nahi...  
          else:
            likelihood1*=(alpha)/(V+WordCntClass1)
            
            likelihood1 = (likelihood1)**i[word]
        #Step2 - Finding posterior probability, p1 = P(y=1/X) , not dividing by the product of evidence probabilities, i.em P(Xi)'s

        p1 = likelihood1*prior1

        ###For Class 0
        #Step1 - Finding Conditional Probabilities 

        WordCntClass0=sum(class0Wordcount.values())


        #calculating P(Xi/y=0) = freq(Xi)/(Word count of all words in class 1)
        #likelihood function P(X/y=0) = product of all P(Xi/y=0)
        likelihood0=1
        
        for word in words:
          if word in class0Wordcount.keys():
            likelihood0*=(alpha+class0Wordcount[word])/(V+WordCntClass0)
          #Check else condition daalna hai ya nahi...  
          else:
            likelihood0*=(alpha)/(V+WordCntClass0)
            
            likelihood0 = (likelihood0)**i[word]
        #Step2 - Finding posterior probability, p0 = P(y=0/X) , not dividing by the product of evidence probabilities, i.em P(Xi)'s

        p0 = likelihood0*prior0

        #Now based on values of p1 and p0 , predict the class of the training instance.

        if(p1>=p0):
          predictions.append('1')
        else:
          predictions.append('0')


    return predictions

In [86]:
#This is to calculate the Accuracy, Precision, Recall and F-Score for the resultts of the classifier.
def getAccuracyAndFScore(predictions,actual):
    TP, TN, FP, FN = 0 , 0 , 0 , 0
    for i in range(len(predictions)):
        if predictions[i]== '1' and actual[i]== '1' :
          TP += 1
        elif predictions[i]== '0' and actual[i]== '0' :
          TN += 1
        elif predictions[i]== '1' and actual[i]== '0' :
          FP += 1
        elif predictions[i]== '0' and actual[i]== '1' :
          FN += 1
    print(TP,TN,FP,FN)    
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    fscore = 2*precision*recall/ (precision + recall)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    
    return accuracy,fscore

In [87]:
#5 Fold Cross Validation
from sklearn.model_selection import KFold
from numpy import array
data = processedData
kfold = KFold(5, True, 1)
# enumerate splits
fold_accuracies = []
fold_fscores = []
for train, test in kfold.split(data):
  training_data=[]
  testing_data=[]
  for tr in range(len(train)):
    training_data.append(data[train[tr]])
  for te in range(len(test)):
    testing_data.append(data[test[te]])
    #for each fold, classify the testing instances using the model trained by the training instances.

  training_instances=training_data
  testing_instances=testing_data
  
  class0Wordcount,class1Wordcount = CreateWordcountDicts(training_instances)

  #Create list of distinct words and store their frequencies in the testing instances
  wordsInTestData = GetListofDistinctWords(testing_instances)
  #Now, wordsInTestData would be a list of dicts, where each dict would contain the unique word-frequency tuples present in the testing instance.

  #Calculate prior probabilities.
  prior1, prior0 = CalcPriors(training_instances)

  #Now predict the class of training instances, using Naive Bayes Classifier
  predictions = Classify(class1Wordcount,class0Wordcount,wordsInTestData,prior1,prior0)

  #Store the labels of the testing instances
  actual = GetActualLabels(testing_instances)
  
  #calculating accuracy of model
  accuracy,fscore = getAccuracyAndFScore(predictions,actual)
  fold_accuracies.append(accuracy)
  fold_fscores.append(fscore)
  




 


70 74 34 22
73 76 19 32
88 66 28 18
76 78 21 25
72 79 25 24


In [88]:
#This cell just reports the accuracies and Fscores for each fold.

import statistics
print(fold_accuracies)
print(fold_fscores)
acc_mean = statistics.mean(fold_accuracies)
acc_std = statistics.pstdev(fold_accuracies)
acc_std = "{:.2f}".format(acc_std)

fs_mean = statistics.mean(fold_fscores)
fs_std = statistics.pstdev(fold_fscores)
fs_std = "{:.2f}".format(fs_std)

print('\nAccuracy of classifier:  ',acc_mean,'±',acc_std)
print('\nF-score of classifier:  ',fs_mean,'±',fs_std)

[0.72, 0.745, 0.77, 0.77, 0.755]
[0.7142857142857143, 0.7411167512690355, 0.7927927927927928, 0.7676767676767676, 0.7461139896373057]

Accuracy of classifier:   0.752 ± 0.02

F-score of classifier:   0.7523972031323232 ± 0.03
