In [42]:
from matplotlib import pyplot as plt
from matplotlib import style
from matplotlib2tikz import save as tikz_save
import numpy as np
%matplotlib inline
# notebook-wide plotting defaults
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),       # default size of plots
    'image.interpolation': 'nearest',
})
# optional grayscale colormap, currently disabled:
#plt.rcParams['image.cmap'] = 'gray'

In [43]:
# Pre-allocate one (2, number_data) array per annotated field; the first axis
# has one slot per annotation file, the second one slot per post.
number_data = 338
data = {
    field: np.empty((2, number_data), dtype=field_dtype)
    for field, field_dtype in (
        ("id", "int32"),
        ("thread_id", "int32"),
        ("sentiment", "int32"),
        ("whoposts", "str"),
        ("post_body", object),
    )
}

In [44]:
# read annotations from .csv file
# Each file fills one row of every array in `data` (file i -> data[...][i, :]).
import csv
files = ['data/posts_excel_deleted.csv', 'data/posts_excel_1_deleted.csv']
for i, path in enumerate(files):
    # newline='' is what the csv module documentation requires when opening
    # files for csv.reader; without it, line breaks embedded in quoted fields
    # may be interpreted incorrectly.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # exclude the first row, which holds the column names
        next(reader, None)
        # count is the 1-based position of the data row; it doubles as the id
        for count, row in enumerate(reader, start=1):
            data['id'][i, count-1] = count
            data['thread_id'][i, count-1] = row[0]
            data['sentiment'][i, count-1] = row[1]
            data['whoposts'][i, count-1] = row[2]
            data['post_body'][i, count-1] = row[3]

In [45]:
def validate(data):
    """Locate the positions where the first two rows of `data` disagree.

    @param data 2-D array-like; row 0 is compared element-wise with row 1
    @return The tuple produced by np.where for the "rows differ" mask
    """
    rows = np.asarray(data)
    first_row = rows[0, :]
    second_row = rows[1, :]
    return np.where(first_row != second_row)

In [46]:
# Sanity check: both source files must assign the same id to every post.
distinct_id = validate(data['id'])
if distinct_id[0].size:
    print("Error! There are ", distinct_id[0].size," ids not the same:")
    print("They are:")
    # validate() yields positions along the post axis, so they have to select
    # columns; indexing with the bare tuple would address axis 0 (size 2) and
    # either raise IndexError or print whole rows.
    print(data['id'][:, distinct_id[0]])

In [47]:
# Sanity check: both source files must carry the same thread_id for every post.
# (The check has to run on the 'thread_id' field, not on 'id'.)
distinct_thread_id = validate(data['thread_id'])
if distinct_thread_id[0].size:
    print("Error! There are ", distinct_thread_id[0].size," thread_ids not the same:")
    print("They are:")
    # validate() yields positions along the post axis, so they have to select
    # columns; indexing with the bare tuple would address axis 0 (size 2).
    print(data['thread_id'][:, distinct_thread_id[0]])

In [48]:
""" Computes the Fleiss' Kappa value as described in (Fleiss, 1971) """

DEBUG = True

def computeKappa(mat):
    """ Computes Fleiss' Kappa for a table of rating counts
        @param mat Matrix[subjects][categories]; every row must add up to the
                   same number of ratings (verified by checkEachLineCount)
        @return The Kappa value
        @throws AssertionError If rows contain different numbers of ratings """
    n = checkEachLineCount(mat)   # ratings per subject, identical for all rows
    N, k = len(mat), len(mat[0])

    if DEBUG:
        print(n, "raters.")
        print(N, "subjects.")
        print(k, "categories.")

    # p[j]: share of all N*n ratings that fell into category j
    total_ratings = N * n
    p = []
    for j in range(k):
        column_total = 0.0
        for row in mat:
            column_total += row[j]
        p.append(column_total / total_ratings)
    if DEBUG: print("p =", p)

    # P[i]: agreement among the ratings given to subject i
    rater_pairs = n * (n - 1)
    P = []
    for row in mat:
        squared_total = 0.0
        for j in range(k):
            squared_total += row[j] * row[j]
        P.append((squared_total - n) / rater_pairs)
    if DEBUG: print("P =", P)

    # Pbar: mean observed agreement over all subjects
    Pbar = sum(P) / N
    if DEBUG: print("Pbar =", Pbar)

    # PbarE: agreement expected from the category shares alone
    PbarE = 0.0
    for share in p:
        PbarE += share * share
    if DEBUG: print("PbarE =", PbarE)

    kappa = (Pbar - PbarE) / (1 - PbarE)
    if DEBUG: print("kappa =", kappa)

    return kappa

def checkEachLineCount(mat):
    """ Verify that every line of the matrix holds the same number of ratings
        @param mat The matrix checked
        @return The number of ratings per line (sum of the first line)
        @throws AssertionError If a later line sums to a different value """
    n = sum(mat[0])
    for line in mat[1:]:
        assert sum(line) == n, "Line count != %d (n value)." % n
    return n


In [49]:
# shape and length of the transposed sentiment table (one row per post)
print(data['sentiment'].shape[::-1])
print(data['sentiment'].shape[-1])

(338, 2)
338


In [50]:
# Build the per-post category-count matrix that computeKappa takes as argument.
temp = np.array(data['sentiment'].T, copy=True)
comp = temp[:, 0] != temp[:, 1]          # True where the two files disagree
print("There are ", np.count_nonzero(comp)," annotations not the same:")
print("Indexes are:")
print(data['id'][0, np.flatnonzero(comp)[np.newaxis, :]] + 1)
sentiment = np.empty((number_data, 2), dtype="int")
sentiment[:, 0] = (temp == 0).sum(axis=1)  # number of "negative" annotations
sentiment[:, 1] = (temp == 1).sum(axis=1)  # number of "positive" annotations

There are  90  annotations not the same:
Indexes are:
[[  9  10  15  25  34  35  36  46  47  48  49  50  63  64  67  71  73  77
   79  80  81  84  94  95  96 107 109 118 119 120 123 130 134 142 143 147
  153 155 158 164 165 166 167 168 176 178 179 180 182 184 190 193 198 200
  207 209 210 211 213 218 219 224 227 228 234 239 243 246 248 251 255 263
  264 270 278 279 285 290 291 299 300 310 312 314 315 317 321 322 324 334]]


In [51]:
# Fleiss' kappa for the current `sentiment` count matrix; the returned value is
# shown as the cell result (debug prints come from computeKappa while DEBUG is set).
computeKappa(sentiment)

2 raters.
338 subjects.
2 categories.
p = [0.40532544378698226, 0.59467455621301779]
P = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 

0.4476522497004029

In [54]:
# positions at which the current `comp` mask is True
m = np.flatnonzero(comp)

In [53]:
# spot check: thread_id stored for the first file at position 7
data['thread_id'][0,7]

5201

In [56]:
# scratch value used by the comparison cell that follows
x = 'P'

In [61]:
# scratch check: prints 'no' when x equals the literal 'P'
if x == 'P':
    print('no')

no


In [60]:
# transform the whoposts data so that computeKappa takes it as argument
temp = np.copy(data['whoposts'].T)
comp = temp[:,0]!=temp[:,1]
print("There are ", sum(comp)," annotations not the same:")
print("Indexes are:")
print(data['id'][0,np.where(comp)]+1)
# The matrix holds per-post label counts, so it must be an integer array; a
# "str" array would store the counts as text and computeKappa could not do
# arithmetic on it. The variable keeps the name `sentiment` so other cells
# that read it keep working, although here it holds whoposts counts.
sentiment = np.empty((number_data,2),dtype="int")
sentiment[:,0] = np.sum(temp=='P', axis=1)#number of 'P' annotations
sentiment[:,1] = np.sum(temp=='R', axis=1)#number of 'R' annotations

There are  31  annotations not the same:
Indexes are:
[[ 18  26  34  36  48  61  72  83  84  85  90 109 112 113 125 131 141 144
  145 146 156 161 166 167 172 184 187 192 220 259 301]]


In [65]:
# load the array saved in data/correction_record_sentiment.npy
# NOTE(review): how this record was produced is not shown in this notebook section
mm = np.load('data/correction_record_sentiment.npy')

In [66]:
# display the loaded array as the cell result
mm

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1,