In [3]:
# import libraries
import json, re, sys, os
import numpy as np
import random as ra
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul__ohlan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### 1. Implementing the Adept Naive Bayes Classifier

In [4]:
def countPhrases(message):    
    
    counts = {"*" : 0}
    
    for MWE in word_tokenize(message):
        phrase = "".join([block for block in MWE])
        if re.search("[a-zA-Z]", phrase):
            counts[phrase] = counts.get(phrase,0) + 1
            counts["*"] +=1
    return counts

In [5]:
def loadPhrases(record):   
                                
    
    codes = record["codes"]   


    counts = countPhrases(record["tweet"])   
    

    for code in codes:                     

        codeCounts = dict(counts)
        if code == "null":
            continue
        else:
            if codes[code] == "1":

                codeCounts["_"+code] = 1
                yield code,codeCounts

            else:
                codeCounts["_not." + code] =1           
                yield "not." + code,codeCounts

In [7]:
def aggregateByPhrase(d1,d2):  
    
    for key in d2:
        
        d1[key] = d1.get(key,0) + d2[key]
        
    return d1

In [10]:
def computeEntropies(record,N):
    code,entropies = record      

    n = float(entropies["_N"]) 
    entropies["*"] = sum([entropies[phrase] for phrase in entropies if phrase != "_N" and phrase!="*" and phrase!="_"+code])
                        
    m = float(entropies["*"])
    del entropies["_N"]
    del entropies["*"]
    alpha = n/N              
    
    for phrase in entropies:
        if not re.match("^_",phrase):
            entropy = -np.log2((entropies[phrase]+alpha)/(alpha*N+m))
            entropies[phrase] = entropy
            
    entropies["_DEFAULT"] = -np.log2(alpha/(alpha*N + m))      
    
    return code,entropies

In [11]:
def findN(record):
    ID,counts = record
    yield "_ALLCODES",counts
    counts["_N"] = float(len(counts.keys())-1)
    yield ID,counts

In [12]:
def processTweet(tweet,prior_entropy,likelihoodEntropy):     
    result = {
        "posteriors" : {code: prior_entropy[code] for code in prior_entropy}, 
                                                                                   
                                                                              
        "M_words" : 0,
        "counts" : dict(),     
        "N" : 0,
        "M" : 0,
        "tweet" : tweet
    }
    
    review_M = 0
    tempCounts = countPhrases(tweet)     
    for phrase in tempCounts:           
        if phrase!= "*":

            result['counts'][phrase] = result['counts'].get(phrase,0) + tempCounts[phrase]     
            
    todel = []
    
    for phrase in result['counts']:          
        if phrase!="*":
            INLIKE = False     
            for code in likelihoodEntropy:    
                if likelihoodEntropy[code].get(phrase,False):    
                    result['posteriors'][code] -= result['counts'][phrase] * likelihoodEntropy[code][phrase]
                    INLIKE = True
                    
            if INLIKE:  
                result['M'] += result['counts'][phrase] 
                result["M_words"] += result['counts'][phrase] * len(re.split(" ",phrase))
                result["N"] += 1
                
                for code in likelihoodEntropy:
                    if not likelihoodEntropy[code].get(phrase,False):
                        result['posteriors'][code] -= result['counts'][phrase]*likelihoodEntropy[code]["_DEFAULT"]
                
                result['counts'][phrase] = int(result['counts'][phrase])
                
            else:
                todel.append(phrase)     
                    
    for phrase in todel:
        del result['counts'][phrase]
        
    newPosteriors = {}
    
    for code in result['posteriors']:
        splits = re.split("\.",code)     
        if len(splits) ==2:                
            code_val = splits[1]
            newPosteriors.setdefault(code_val,{})
            newPosteriors[code_val]["0"] = result["posteriors"][code]
            
        else:
            newPosteriors.setdefault(code,{})
            newPosteriors[code]["1"] = result['posteriors'][code]
            
    result["posteriors"] = newPosteriors   
    
    
    for code in result['posteriors']:
        
        if len(result['posteriors'][code].keys()) == 2:   
            P0 = 1/(1 + (2**( result['posteriors'][code]["1"]-result['posteriors'][code]["0"])))    
            
            P1 = 1 - P0
            
            result['posteriors'][code] = {"0": P0, "1":P1}
            
        else:
            result['posteriors'][code] = {"0":1, "1":0}
            
    result["M"] = int(result["M"])
    result["M_words"] = int(result["M_words"])
    
    
    return result 

#### 2. Training and Assessing Classifier Performance with 10 Fold Cross Validation

In [9]:
codes = [
    "collective_force",
    "collective_peace",
    "singular_force",
    "singular_peace",
    "collective",
    "singular",
    "peace",
    "force",
    "action"
]

thresholds = [(x)/100 for x in range(1,100)]  

records = list()

with open('data/naive_bayes_violent_tweets/trainingData.json','r') as redbull:

    data_set = json.load(redbull)
    for i in data_set:
        record = data_set[i]

        oldcodes = dict(record["codes"])

        for code in oldcodes:
            for newcode in re.split('_',code):
                if not record['codes'].get(newcode,0):
                    record["codes"][newcode] = oldcodes[code]
            if not record["codes"].get("action",0):
                record["codes"]["action"] = oldcodes[code]
        records.append(record)

ra.seed(30)

ra.shuffle(records)

In [22]:
numfolds = 10    
folds = []

foldsize = int((len(records)/numfolds) + 0.5 )      
if foldsize * numfolds < len(records):    
    foldsize += 1

allresults = []

for foldnum in range(numfolds):      
    
    trainingrecords = []            
    testingrecords = []
    
    for k in range(numfolds):
        
        if k==foldnum:
            testingrecords.extend(list(records[k*foldsize : (k+1)*foldsize]))
            
        else:
            trainingrecords.extend(list(records[k*foldsize : (k+1)*foldsize]))

    # ************** training begins here with the first trainingrecords dataset **************

    allcounts = {}             
    for code in codes:
        allcounts[code] = {}
        allcounts["not."+code] = {}

    total_tweets = 0
    
    for record in trainingrecords:        
        total_tweets += 1                 
        for k,v in loadPhrases(record):   
            allcounts[k] = aggregateByPhrase(allcounts[k],v)
            
    for code in codes:
        allcounts['_ALLCODES'] = aggregateByPhrase(dict(allcounts[code]), allcounts["not."+code])
        break
        
    for ky in allcounts.keys():
        allcounts[ky]["_N"] = float(len(allcounts[ky].keys()) -1)
        
    N = float(len(allcounts["_ALLCODES"].keys())) -2
    
    
    del allcounts["_ALLCODES"]
    
    entropies = list()

    for ky in allcounts.keys():
        entropies.append(computeEntropies((ky,dict(allcounts[ky])),N))
        
    likelihoodEntropy = {}
    prior_entropy = {}

    for x in entropies:
        code = x[0]   
        entdict = dict(x[1])    

        priorNum = entdict.get("_"+code,0)
        prior_entropy[code] = priorNum
                            
        entdict.pop("_"+code,None)
        
        likelihoodEntropy[code] = dict(entdict) 

    for code in prior_entropy:
        if prior_entropy[code] != 0:   

            prior_entropy[code] = -np.log2(prior_entropy[code]/total_tweets)

    
    #### Testing Begins Here

    totals = {}

    for code in codes:
        totals[code] = {}      # code is each of the 9 classes now , empty value dict created for each class
        for thresh in thresholds:
            totals[code][str(thresh)] = {
                "TP" : 0,
                "FP" : 0,
                "FN" : 0,
                "TN" : 0
            }


    for record in testingrecords:
        result = processTweet(record["tweet"],prior_entropy,likelihoodEntropy)
        
        for code in codes:
            truth = record["codes"][code]       # assume truth is one, say for collective_force -> it'll be 1 for collective, force and action as well
            for thresh in thresholds:
                
                st = str(thresh)
                
                if result["posteriors"][code]["1"] >= thresh:     # assume if posterior probability of this tweet to be in this class = 0.46
                    prediction = 1
                else:
                    prediction = 0                           # prediction will stay 0 till our thresh reaches 0.46
                    
                if truth:
                    if prediction:
                        totals[code][st]["TP"] +=1
                        
                    else:
                        totals[code][st]["FN"] +=1
                                                        

                        
                else:
                    if prediction:
                        totals[code][st]["FP"] +=1
                        
                        
                    else:
                        totals[code][st]["TN"] +=1 
    
    
    for code in codes:         # Calculating the classification measures
                                # eth works just fine till here
        for thresh in thresholds:
            st = str(thresh)
            
            try:
                totals[code][st]["TPR"] = totals[code][st]["TP"]/(totals[code][st]["TP"] + totals[code][st]["FN"])    #TPR is recall = TP/(TP+FN)
            except:
                totals[code][st]["TPR"] = 0
                
                
            try:
                totals[code][st]["PPV"] = totals[code][st]["TP"]/(totals[code][st]["TP"]+totals[code][st]["FP"])      # PPV is precision = TP/(TP+FP)
            
            except:
                totals[code][st]["PPV"] = 0
                
                
            try:
                totals[code][st]["F1"] = (2*totals[code][st]["TPR"]*totals[code][st]["PPV"])/(
                totals[code][st]["TPR"] + totals[code][st]["PPV"])
                
            except:
                totals[code][st]["F1"] = 0
                
            try:
                totals[code][st]["ACC"] = (totals[code][st]["TP"]+totals[code][st]["TN"])/(                                            #ACC is Accuracy = (TP+TN)/(Total Predictions)
                totals[code][st]["TP"] + totals[code][st]["TN"] + totals[code][st]["FP"]+ totals[code][st]["FN"])
                
            except:
                
                totals[code][st]["ACC"] = 0

    print("the results for fold : " + str(foldnum))

    for code in codes:
            st = max([(totals[code][str(thresh)]["F1"], str(thresh)) for thresh in thresholds], key = lambda x: x[0])[1]
            
            totals[code]["thresh"] = st
            
            print(code,st,totals[code][st]["PPV"], totals[code][st]["TPR"], totals[code][st]["F1"], totals[code][st]["ACC"])
            
            # print(allresults.append(totals))

    allresults.append(totals)

    # print(allresults)

the results for fold : 0
collective_force 0.01 1.0 0.17012814847547503 0.290785498489426 0.17012814847547503
collective_peace 0.01 1.0 0.10561201944321696 0.19104716227018384 0.10561201944321696
singular_force 0.01 1.0 0.11356606274856386 0.20396825396825397 0.11356606274856386
singular_peace 0.01 1.0 0.4158197083517455 0.5873907615480649 0.4158197083517455
collective 0.01 1.0 0.10561201944321696 0.19104716227018384 0.10561201944321696
singular 0.01 1.0 0.4158197083517455 0.5873907615480649 0.4158197083517455
peace 0.01 1.0 0.4158197083517455 0.5873907615480649 0.4158197083517455
force 0.01 1.0 0.11356606274856386 0.20396825396825397 0.11356606274856386
action 0.01 1.0 0.4158197083517455 0.5873907615480649 0.4158197083517455
the results for fold : 1
collective_force 0.01 1.0 0.16836058329650905 0.2881996974281391 0.16836058329650905
collective_peace 0.01 1.0 0.1100309323906319 0.19824840764331209 0.1100309323906319
singular_force 0.01 1.0 0.12152010605391074 0.21670606776989754 0.12152

In [24]:
bestThresh = {}
for code in codes:
    bestThresh[code] = [0, 0]
    for thresh in thresholds:
        F1 = np.mean([totals[code][str(thresh)]["F1"] for totals in allresults])
        
        if F1 > bestThresh[code][1]:
            bestThresh[code] = [thresh,F1]
            
    thresh = bestThresh[code][0]
        
        # print(code,thresh,np.mean([totals[code][str(thresh)]["PPV"] for totals in allresults]),
        #      np.mean([totals[code][str(thresh)]["TPR"] for totals in allresults]),
        #      np.mean([totals[code][str(thresh)]["F1"] for totals in allresults]))
print(bestThresh)

{'collective_force': [0.01, 0.29429188984538224], 'collective_peace': [0.01, 0.20664901374504935], 'singular_force': [0.01, 0.2185544519997744], 'singular_peace': [0.01, 0.5967207377006216], 'collective': [0.01, 0.20664901374504935], 'singular': [0.01, 0.5967207377006216], 'peace': [0.01, 0.5967207377006216], 'force': [0.01, 0.2185544519997744], 'action': [0.01, 0.5967207377006216]}
