<h1 align="center">Testing out ML Techniques on the Annotated Argument Set<h1>


<h2 align="center"> Preparing the Data</h2>

We test two different formats for the input data

<ol>
<li>Using the data in the same format as is being used by our argument classifier:</li>

    <p>We will use exactly the same dataset as is being used by our argument classifier. In order to make a nice size matrix where the inputs are small, we will append a bunch of zeros to the end to make sure the whole set of training posts is not a jagged matrix</p>
     
    
<li>Representing each argument as a vector, where the position in the vector corresponds to the argument type:</li>

    <p>Example: If a post contains arguments [0 2 5] it would be represented as [1 0 1 0 0 1] (assuming 0 is an argument type)</p>
    
</ol>

In [259]:
import json, itertools

highestRating = 10
positiveArgTypes = [0,3,5]
negativeArgTypes = [1,2,4]
noArgsTypes = len(positiveArgTypes) + len(negativeArgTypes)

def convertRatingsToPolarities(ratings):
    polarities = []
    
    for rating in ratings:
        if rating < 5:
            polarities.append('Neg')
        elif rating > 6:
            polarities.append('Pos')
        else:
            polarities.append('Ntrl')    
    
    return polarities


def getAnnotationsAsListOfArgumentsFormat(annotationsUrl):
    annotatedPosts = []
    numericalsRatings = []
    
    # Collect the annotated data into suitable containers 
    with open(annotationsUrl) as data_file:
        with open('ForumPosts.json') as ratings_file:
            
            data = json.load(data_file)
            ratings = json.load(ratings_file)

            for idx, d in enumerate(data):
                # We get 'Nones' sometimes when things left completeley blank. Replace with 0's so as not to interfere
                # with our sums whilst letting us keep track of noOfSentences per post
                d = [[0, 0, 0, 0, 0, 0, 0] if v is None else v for v in d] 
                d = np.array(d)
                sums = d.sum(axis=0)
                sums = np.argwhere(sums > 0)
                sums = sums[sums !=6] # We ignore the annotations for the last category (6th - other) as we do not know how to use it in the argument graph or its polarity properly 
                annotatedPosts.append(sums.tolist())
                numericalsRatings.append(ratings[idx]['Rating'])
 
    annotatedPosts = np.array(annotatedPosts)
    numericalsRatings = np.array(numericalsRatings)
    
    largestNoOfArgsInPost = max(enumerate(annotatedPosts), key = lambda tup: len(tup[1]))
    vectorFmtPosts = np.zeros((annotatedPosts.shape[0], len(largestNoOfArgsInPost[1])))
    
    for idx, post in enumerate(annotatedPosts):
        vectorFmtPosts[idx,0:len(post)] = post
    
    polarities = convertRatingsToPolarities(numericalsRatings)
    
    return vectorFmtPosts, polarities
    
    
def getAnnotationsPositionVectorFormat(annotationsUrl):
    
    annotatedPosts = []
    numericalsRatings = []

    # Collect the annotated data into suitable containers 
    with open(annotationsUrl) as data_file:
        with open('ForumPosts.json') as ratings_file:
            data = json.load(data_file)
            ratings = json.load(ratings_file)

            for idx, d in enumerate(data):
                # We get 'Nones' sometimes when things left completeley blank. Replace with 0's so as not to interfere
                # with our sums whilst letting us keep track of noOfSentences per post
                d = [[0, 0, 0, 0, 0, 0, 0] if v is None else v for v in d] 
                d = np.array(d)
                
                sums = d.sum(axis=0)
                sums = np.argwhere(sums > 0)
                sums = sums[sums !=6]
            
                vectorFmt = np.zeros(noArgsTypes)
                vectorFmt[sums] = 1

                annotatedPosts.append(vectorFmt)
                numericalsRatings.append(ratings[idx]['Rating'])

    annotatedPosts = np.array(annotatedPosts)
    numericalsRatings = np.array(numericalsRatings)
    
    polarities = convertRatingsToPolarities(numericalsRatings)
    
    return annotatedPosts, polarities

<h2 align='center'>Setting up the ML Models</h2>

In [296]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

def trainRandomForestClassifier(trainingPosts, trainingRatings, testPosts):
    
    rf = RandomForestClassifier(max_depth = 4)
    rf.fit(trainingPosts, trainingRatings)
    
    predictions = []
    for idx, post in enumerate(testPosts):
        predictions.append(rf.predict(post)[0])
        
    return predictions


<h1 align='center'>K-Folds Testing</h1>

In [303]:
def run_k_folds_test(noOfFolds, annotatedPosts, numericalsRatings):

    annotatedPostsSplit = np.array_split(np.array(annotatedPosts), noOfFolds)
    numericalRatingsSplit = np.array_split(np.array(numericalsRatings), noOfFolds)

    totalRecalls = []
    totalPrecisions = []

    for fold in range(noOfFolds):
        #print('--------Fold ', fold, '---------')
        listOfSplits = list(range(0, noOfFolds))

        testPosts = list(annotatedPostsSplit[fold])
        testRatings = list(numericalRatingsSplit[fold])

        listOfSplits.remove(fold)
        trainingPosts = np.concatenate(np.array(annotatedPostsSplit)[listOfSplits], axis=0)
        trainingRatings = np.concatenate(np.array(numericalRatingsSplit)[listOfSplits], axis=0)


        #--------Train the Random Forest Classifier---------#
        
        predictionsRf = trainRandomForestClassifier(trainingPosts, trainingRatings, testPosts)
        confusionMatrixrRf = confusion_matrix(testRatings, predictionsRf, labels=["Pos", "Ntrl", "Neg"])
        #print(confusionMatrixrRf)
        
        sumPredicted = confusionMatrixrRf.sum(axis=0)
        sumActual = confusionMatrixrRf.sum(axis=1)

        recalls = []
        precisions = []

        for idx in range(confusionMatrixrRf.shape[0]):

            recalls.append(confusionMatrixrRf[idx,idx] / sumPredicted[idx])
            precisions.append(confusionMatrixrRf[idx,idx] / sumActual[idx])

        totalRecalls.append(recalls)
        totalPrecisions.append(precisions)

        
    averageRecall = np.mean(np.array(totalRecalls), axis=0)
    averagePrecision = np.mean(np.array(totalPrecisions), axis=0)
    
    return averageRecall, averagePrecision

<h1 align='center'>Running the Experiment</h1>

In [313]:
annotationsUrl = 'allAnnotationsChutesRun1.json'

annotationsFmt1, ratingsFmt1 = getAnnotationsAsListOfArgumentsFormat(annotationsUrl) # Format1
annotationsFmt2, ratingsFmt2 = getAnnotationsPositionVectorFormat(annotationsUrl) # Format2

noOfFolds = 5 # No of Data folds we will use

allaverageRecallFmt1 = []
allaveragePrecisionFmt1= [] 
allaverageRecallFmt2 = [] 
allaveragePrecisionFmt2 = []

for idx in range(50):
    print(idx)
    averageRecallFmt1, averagePrecisionFmt1 = run_k_folds_test(noOfFolds, annotationsFmt1, ratingsFmt1)
    averageRecallFmt2, averagePrecisionFmt2 = run_k_folds_test(noOfFolds, annotationsFmt2, ratingsFmt2)

    allaverageRecallFmt1.append(averageRecallFmt1)
    allaveragePrecisionFmt1.append(averagePrecisionFmt1)
    allaverageRecallFmt2.append(averageRecallFmt2)
    allaveragePrecisionFmt2.append(averagePrecisionFmt2)

allaverageRecallFmt1 = np.nan_to_num(allaverageRecallFmt1)
allaveragePrecisionFmt1 = np.nan_to_num(allaveragePrecisionFmt1)
allaverageRecallFmt2 = np.nan_to_num(allaverageRecallFmt2)
allaveragePrecisionFmt2 = np.nan_to_num(allaveragePrecisionFmt2)

sumallaverageRecallFmt1 = (np.array(allaverageRecallFmt1)).sum(axis=1)
print(sumallaverageRecallFmt1[sumallaverageRecallFmt1.argmax(axis=0)])
print('maxRecallFmt1: ', allaverageRecallFmt1[sumallaverageRecallFmt1.argmax(axis=0)])

sumallaveragePrecisionFmt1 = (np.array(allaveragePrecisionFmt1)).sum(axis=1)
print(sumallaveragePrecisionFmt1[sumallaveragePrecisionFmt1.argmax(axis=0)])
print('maxPrecFmt1: ', allaveragePrecisionFmt1[sumallaveragePrecisionFmt1.argmax(axis=0)])

sumallaverageRecallFmt2 = (np.array(allaverageRecallFmt2)).sum(axis=1)
print(sumallaverageRecallFmt2[sumallaverageRecallFmt2.argmax(axis=0)])
print('maxRecallFmt2: ', allaverageRecallFmt2[sumallaverageRecallFmt2.argmax(axis=0)])

sumallaveragePrecisionFmt2 = (np.array(allaveragePrecisionFmt2)).sum(axis=1)
print(sumallaveragePrecisionFmt2[sumallaveragePrecisionFmt2.argmax(axis=0)])
print('maxPrecFmt2: ', allaveragePrecisionFmt2[sumallaveragePrecisionFmt2.argmax(axis=0)])

print(sumallaverageRecallFmt1)
    
print('Fmt1: ', averageRecallFmt1, averagePrecisionFmt1 )
print('Fmt2: ', averageRecallFmt2, averagePrecisionFmt2 )


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
1.69294871795
maxRecallFmt1:  [ 0.86794872  0.          0.825     ]
1.78929070929
maxPrecFmt1:  [ 0.88643357  0.          0.90285714]
1.64823232323
maxRecallFmt2:  [ 0.91212121  0.          0.73611111]
1.77852147852
maxPrecFmt2:  [ 0.87566434  0.          0.90285714]
[ 1.51230159  1.52361472  1.52260462  1.58156566  1.51921911  1.58611111
  1.51921911  1.50669553  1.50790598  1.63961538  1.56995726  1.59409091
  1.64294872  1.55752525  1.60261905  1.59742424  1.53634199  1.49028139
  1.69294872  1.56409091  1.5211039   1.49836219  1.51166667  1.56995726
  1.52441392  1.52880952  1.59928571  1.59409091  1.53888889  1.54967532
  1.58306938  1.48704906  1.64294872  1.59742424  1.52361472  1.58611111
  1.69294872  1.69294872  1.53634199  1.55028139  1.52441392  1.51230159
  1.50790598  1.56474359  1.55277778  1.51230159  1.52352148  1.58460373
  1.5861

In [236]:
import json, itertools

highestRating = 10
positiveArgTypes = [0,3,5]
negativeArgTypes = [1,2,4]
noArgsTypes = len(positiveArgTypes) + len(negativeArgTypes)

annotatedPosts = []
numericalsRatings = []
cooccuranceMatrix = np.zeros((noArgsTypes, noArgsTypes)) # We need to count the number of times argument types appear together for normalisation later on


# Collect the annotated data into suitable containers 
with open('allAnnotationsChutesRun1.json') as data_file:
    with open('ForumPosts.json') as ratings_file:
        data = json.load(data_file)
        

        ratings = json.load(ratings_file)
        
        for idx, d in enumerate(data):
            # We get 'Nones' sometimes when things left completeley blank. Replace with 0's so as not to interfere
            # with our sums whilst letting us keep track of noOfSentences per post
            d = [[0, 0, 0, 0, 0, 0, 0] if v is None else v for v in d] 
            d = np.array(d)
            sums = d.sum(axis=0)
            sums = np.argwhere(sums > 0)
            sums = sums[sums !=6] # We ignore the annotations for the last category (6th - other) as we do not know how to use it in the argument graph or its polarity properly 
            annotatedPosts.append(sums.tolist())
            numericalsRatings.append(ratings[idx]['Rating'])
            
            # We will now count the co-occurances
            for i, j in itertools.product(sums.tolist(), sums.tolist()):
                cooccuranceCount = np.zeros((noArgsTypes, noArgsTypes))
                cooccuranceCount[i,j] = 1
                cooccuranceMatrix = cooccuranceCount + cooccuranceMatrix
                
                
annotatedPosts = np.array(annotatedPosts)
numericalsRatings = np.array(numericalsRatings)



In [237]:
x = max(enumerate(annotatedPosts), key = lambda tup: len(tup[1]))

In [238]:
trainingData = np.zeros((annotatedPosts.shape[0], len(x[1])))

In [239]:
for idx, post in enumerate(annotatedPosts):
    trainingData[idx,0:len(post)] = post

In [240]:
trainingRatings = []
for rating in numericalsRatings:
    if rating < 5:
        trainingRatings.append('Neg')
    elif rating > 6:
        trainingRatings.append('Pos')
    else:
        trainingRatings.append('Ntrl')

In [241]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
iris = load_iris()

In [242]:
rf = RandomForestClassifier(max_depth = 4)

 
rf.fit(trainingData[0:70], trainingRatings[0:70])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [243]:
rf.predict(trainingData[71])

array(['Neg'], 
      dtype='<U4')

In [244]:
predictions = []
testData = trainingData[71:-1]
for idx, post in enumerate(testData):
    predictions.append(rf.predict(post)[0])

In [245]:
predictions

['Neg',
 'Pos',
 'Pos',
 'Neg',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Neg',
 'Pos',
 'Pos']

In [246]:
trainingRatings[71:-1]

['Neg',
 'Pos',
 'Pos',
 'Neg',
 'Neg',
 'Pos',
 'Neg',
 'Ntrl',
 'Ntrl',
 'Neg',
 'Neg',
 'Ntrl',
 'Neg',
 'Neg',
 'Neg',
 'Neg',
 'Pos',
 'Neg',
 'Neg',
 'Ntrl',
 'Neg',
 'Pos',
 'Pos',
 'Ntrl',
 'Neg',
 'Pos',
 'Neg',
 'Neg',
 'Pos']

In [247]:
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(trainingRatings[71:-1], predictions, labels=["Pos", "Ntrl", "Neg"])

In [248]:
confusionMatrix

array([[ 6,  0,  2],
       [ 1,  0,  4],
       [ 2,  0, 14]])

In [249]:
sumPredicted = confusionMatrix.sum(axis=0)
sumActual = confusionMatrix.sum(axis=1)

for idx in range(confusionMatrix.shape[0]):

    print(confusionMatrix[idx,idx] / sumPredicted[idx])
    print(confusionMatrix[idx,idx] / sumActual[idx])

0.666666666667
0.75
nan
0.0
0.7
0.875


<h1>Using Total count of arguments</h1>

Here we will assume that each post is a vector [0 0 0 0 0 0] where the index in the vector indicates presence of argument index

In [255]:
import json, itertools

highestRating = 10
positiveArgTypes = [0,3,5]
negativeArgTypes = [1,2,4]
noArgsTypes = len(positiveArgTypes) + len(negativeArgTypes)

annotatedPosts = []
numericalsRatings = []
cooccuranceMatrix = np.zeros((noArgsTypes, noArgsTypes)) # We need to count the number of times argument types appear together for normalisation later on


# Collect the annotated data into suitable containers 
with open('allAnnotationsChutesRun1.json') as data_file:
    with open('ForumPosts.json') as ratings_file:
        data = json.load(data_file)
        

        ratings = json.load(ratings_file)
        
        for idx, d in enumerate(data):
            # We get 'Nones' sometimes when things left completeley blank. Replace with 0's so as not to interfere
            # with our sums whilst letting us keep track of noOfSentences per post
            d = [[0, 0, 0, 0, 0, 0, 0] if v is None else v for v in d] 
            d = np.array(d)
            sums = d.sum(axis=0)
            sums = np.argwhere(sums > 0)
            sums = sums[sums !=6]
            
            vectorFmt = np.zeros(noArgsTypes)
            vectorFmt[sums] = 1
            
            annotatedPosts.append(vectorFmt)
            numericalsRatings.append(ratings[idx]['Rating'])
                
annotatedPosts = np.array(annotatedPosts)
numericalsRatings = np.array(numericalsRatings)


In [256]:
trainingPosts = annotatedPosts
trainingRatings = []
for rating in numericalsRatings:
    if rating < 5:
        trainingRatings.append('Neg')
    elif rating > 6:
        trainingRatings.append('Pos')
    else:
        trainingRatings.append('Ntrl')

In [257]:
rf = RandomForestClassifier(max_depth = 4)

 
rf.fit(trainingData[0:70], trainingRatings[0:70])

predictions = []
testData = trainingData[71:-1]
for idx, post in enumerate(testData):
    predictions.append(rf.predict(post)[0])
    
confusionMatrix = confusion_matrix(trainingRatings[71:-1], predictions, labels=["Pos", "Ntrl", "Neg"])

In [258]:
confusionMatrix

array([[ 6,  0,  2],
       [ 1,  0,  4],
       [ 0,  0, 16]])

In [None]:
sumPredicted = confusionMatrix.sum(axis=0)
sumActual = confusionMatrix.sum(axis=1)

for idx in range(confusionMatrix.shape[0]):

    print(confusionMatrix[idx,idx] / sumPredicted[idx])
    print(confusionMatrix[idx,idx] / sumActual[idx])

<h1>K-Folds Testing</h1>

In [None]:
noOfFolds = 5 # No of Data folds we will use

annotatedPostsSplit = np.array_split(np.array(annotatedPosts), noOfFolds)
numericalRatingsSplit = np.array_split(np.array(numericalsRatings), noOfFolds)

totalRecalls = []
totalPrecisions = []

for fold in range(noOfFolds):
    print('--------Fold ', fold, '---------')
    listOfSplits = list(range(0, noOfFolds))

    testPosts = list(annotatedPostsSplit[fold])
    testRatings = list(numericalRatingsSplit[fold])

    listOfSplits.remove(fold)
    trainingPosts = np.concatenate(np.array(annotatedPostsSplit)[listOfSplits], axis=0)
    trainingRatings = np.concatenate(np.array(numericalRatingsSplit)[listOfSplits], axis=0)


    confusionMatrix = testAnnotatedData(trainingPosts, trainingRatings, testPosts, testRatings)

    print(confusionMatrix)

    sumPredicted = confusionMatrix.sum(axis=0)
    sumActual = confusionMatrix.sum(axis=1)

    recalls = []
    precisions = []

    for idx in range(confusionMatrix.shape[0]):

        recalls.append(confusionMatrix[idx,idx] / sumPredicted[idx])
        precisions.append(confusionMatrix[idx,idx] / sumActual[idx])

    totalRecalls.append(recalls)
    totalPrecisions.append(precisions)

averageRecall = np.mean(np.array(totalRecalls), axis=0)
averagePrecision = np.mean(np.array(totalPrecisions), axis=0)

print('avgRecall: ', averageRecall)
print('avgPrecision: ', averagePrecision)

In [314]:
print(annotationsFmt1)

[[ 0.  2.  4.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 2.  4.  0.  0.  0.]
 [ 0.  1.  2.  3.  4.]
 [ 1.  2.  4.  0.  0.]
 [ 0.  1.  2.  5.  0.]
 [ 0.  1.  2.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 0.  1.  2.  4.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 0.  2.  0.  0.  0.]
 [ 2.  4.  0.  0.  0.]
 [ 1.  2.  0.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 2.  5.  0.  0.  0.]
 [ 0.  2.  5.  0.  0.]
 [ 1.  2.  4.  0.  0.]
 [ 1.  2.  4.  0.  0.]
 [ 2.  4.  0.  0.  0.]
 [ 2.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  3.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  2.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  1.  2.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  3.  0.  0.  0.]
 [ 0.  0.  

In [315]:
print(ratingsFmt1)

['Pos', 'Ntrl', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Ntrl', 'Pos', 'Pos', 'Ntrl', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Pos', 'Ntrl', 'Neg', 'Neg', 'Neg', 'Pos', 'Pos', 'Neg', 'Pos', 'Neg', 'Pos', 'Pos', 'Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Neg', 'Neg', 'Neg', 'Neg', 'Neg', 'Pos', 'Pos', 'Neg', 'Neg', 'Neg', 'Neg', 'Neg', 'Pos', 'Ntrl', 'Neg', 'Ntrl', 'Neg', 'Neg', 'Pos', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg', 'Ntrl', 'Ntrl', 'Neg', 'Neg', 'Ntrl', 'Neg', 'Neg', 'Neg', 'Neg', 'Pos', 'Neg', 'Neg', 'Ntrl', 'Neg', 'Pos', 'Pos', 'Ntrl', 'Neg', 'Pos', 'Neg', 'Neg', 'Pos', 'Neg']
