In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns



In [2]:
def transitions(x,o,iw):
    """For a given essay, return the probability shift at each word.

    Shifts are returned sequentially (one row per word occurrence) instead
    of being summed over repeated occurrences of the same word.

    Parameters
    ----------
    x : array-like of int
        Word indices for one essay. The essay's words are taken to be the
        last ``np.count_nonzero(x)`` positions of the sequence.
    o : np.ndarray
        Model outputs for the essay, indexed along axis 0 by the same
        positions as ``x``, with one column per class.
    iw : mapping
        Lookup from word index to word.

    Returns
    -------
    tuple (words, probs)
        ``words`` is the list of looked-up words; ``probs`` is an array with
        one row per word holding the output at that position minus the
        output at the previous position.
    """
    endpoint = o.shape[0]
    startpoint = endpoint - np.count_nonzero(x)
    n_classes = o.shape[-1]
    words = [iw[x[i]] for i in range(startpoint, endpoint)]
    positions = np.arange(startpoint, endpoint)
    # NOTE(review): when startpoint == 0 the baseline index is -1, which wraps
    # around to the final position. This matches the original loop; confirm
    # whether that is the intended baseline for a full-length essay.
    deltas = (o[positions] - o[positions - 1]).reshape(-1, n_classes)
    # Concatenating onto an empty float64 array keeps the dtype promotion and
    # the (0, n_classes) empty result of the original np.append accumulation,
    # without re-copying the accumulated rows on every iteration.
    probs = np.concatenate([np.empty((0, n_classes)), deltas], axis=0)
    return (words, probs)

In [3]:
def prob_reshape(x,o,iw):
    """For a given essay, return the model output at each word.

    Parameters
    ----------
    x : array-like of int
        Word indices for one essay. The essay's words are taken to be the
        last ``np.count_nonzero(x)`` positions of the sequence.
    o : np.ndarray
        Model outputs for the essay, indexed along axis 0 by the same
        positions as ``x``, with one column per class.
    iw : mapping
        Lookup from word index to word.

    Returns
    -------
    tuple (words, probs)
        ``words`` is the list of looked-up words; ``probs`` is a new array
        with one row of ``o`` per word.
    """
    endpoint = o.shape[0]
    startpoint = endpoint - np.count_nonzero(x)
    n_classes = o.shape[-1]
    words = [iw[x[i]] for i in range(startpoint, endpoint)]
    selected = o[np.arange(startpoint, endpoint)].reshape(-1, n_classes)
    # Concatenating onto an empty float64 array keeps the dtype promotion and
    # the (0, n_classes) empty result of the original np.append accumulation.
    probs = np.concatenate([np.empty((0, n_classes)), selected], axis=0)
    return (words, probs)

In [4]:
def addtransitions(x,o,iw):
    """Sum the per-word probability shifts across the corpus.

    Parameters
    ----------
    x : np.ndarray
        One row of word indices per essay.
    o : np.ndarray
        Model outputs, indexed by essay along axis 0.
    iw : mapping
        Lookup from word index to word.

    Returns
    -------
    dict
        Maps each word to the sum of the shift vectors that ``transitions``
        reported for every occurrence of that word.
    """
    addworddeltas = {}
    for i in range(x.shape[0]):
        # transitions() returns a (words, probs) pair, not a dict, so walk
        # the two sequences in step instead of calling .iteritems() on it.
        essay_words, essay_deltas = transitions(x[i], o[i], iw)
        for word, delta in zip(essay_words, essay_deltas):
            if word in addworddeltas:
                addworddeltas[word] = addworddeltas[word] + delta
            else:
                addworddeltas[word] = delta
    return addworddeltas

In [5]:
def filtertestdata(x,y,o,t,pos):
    """Keep only the rows whose entry in column ``pos`` of ``y`` equals 1.

    The same row selection is applied to ``x``, ``y``, ``o`` and ``t``.
    ``t`` is converted to an array for the selection and handed back as a
    plain list.
    """
    keep = y[:, pos] == 1
    texts = np.asarray(t)
    return x[keep], y[keep], o[keep], texts[keep].tolist()

In [6]:
def filterdictionary(d,pos):
    """Rank the words in ``d`` by component ``pos`` of their value.

    Parameters
    ----------
    d : dict
        Maps each word to an indexable value (e.g. a per-class vector).
    pos : int
        Which component of each value to rank by.

    Returns
    -------
    tuple (words, values, newdic)
        ``words`` and ``values`` are parallel lists in ascending order of
        value (ties broken by word); ``newdic`` maps each word to its
        selected component.
    """
    # Sort the (value, word) pairs once so the two output lists cannot drift
    # out of alignment; .items() works on both Python 2 and Python 3.
    ranked = sorted((value[pos], key) for key, value in d.items())
    values = [score for score, word in ranked]
    words = [word for score, word in ranked]
    newdic = dict(zip(words, values))
    return words,values,newdic
    

In [8]:
# Main cell: load the saved model-comparison outputs and compute an accuracy.
# Filter values: -1 = ALL, 0 = control male, 1 = control female,
# 2 = affirmation male, 3 = affirmation female.
# True division so the accuracy ratio below is not integer-truncated on Python 2.
from __future__ import division
# .item() pulls the single stored object out of each loaded array.
index_word=np.load('output/model_compfinalgenderdict.npy').item()
word_index=np.load('output/model_compfinalgenderdictinv.npy').item()
testdata_output=np.load('output/model_compfinalgenderoutput.npy')
testdata_x=np.load('output/model_compfinalgenderxtestdata.npy')
testdata_y=np.load('output/model_compfinalgenderytestdata.npy')
# One list entry per line of the text file (line endings are kept).
with open('output/model_compfinalgendertextsinput.txt') as f:
    text_content = f.readlines()
# -1 skips filtering; any other value keeps only rows whose label column
# `filtertestdatavalue` equals 1 (see filtertestdata).
filtertestdatavalue=-1
# Only referenced by the commented-out filterdictionary call below.
filterdictionaryvalue=1
if filtertestdatavalue!=-1:
    testdata_x,testdata_y,testdata_output,text_content=filtertestdata(testdata_x,testdata_y,testdata_output,text_content,filtertestdatavalue)
# Accuracy = percentage of entries where the argmax along axis 1 of the
# output matches the argmax along axis 1 of the labels.
# NOTE(review): later cells index testdata_output[i] as a 2-D
# (position, class) array; if testdata_output is 3-D here, argmax over
# axis=1 selects a position per class rather than a class — confirm the axis.
maxindexoutput=np.argmax(testdata_output,axis=1)
maxindexyval=np.argmax(testdata_y,axis=1)
accuracy=(np.count_nonzero(maxindexoutput == maxindexyval)/maxindexyval.shape[0])*100
# Disabled: the call below passes four arguments, but addtransitions(x, o, iw)
# accepts three.
#addwd=addtransitions(testdata_x,testdata_y,testdata_output,index_word)
#words,values,dic=filterdictionary(addwd,filterdictionaryvalue)
#accuracy



- index_word is the pairing of index numbers with words
- word_index is the pairing of words with index numbers
- testdata_output holds the sequential probabilities for each of the 500 test essays
- testdata_x is a matrix of index_word values (i.e. it tells you which words are in each essay)
- testdata_y is a matrix that indicates the class for each essay
- accuracy is not scored in the same manner as essay classification (f1 score)

# Model comparison model information

In [9]:
# Build a long-format table of per-word probability shifts for every test
# essay (one row per word occurrence per class) and write it to CSV.
words = []
essay = []
prob_chunks = []
for i in range(len(testdata_x)):
    word_dists = transitions(testdata_x[i], testdata_output[i], index_word)
    words.append(word_dists[0])
    essay.append([i] * len(word_dists[0]))
    prob_chunks.append(word_dists[1])
# Join the per-essay arrays once instead of np.append-ing inside the loop,
# which re-copied the whole accumulated array on every iteration. The empty
# (0, 4) float64 seed keeps the original result when there are no essays.
probs = np.concatenate([np.empty((0, 4))] + prob_chunks, axis=0)

# Flatten the per-essay lists once; all four class frames share them.
flat_words = [w for e in words for w in e]
flat_essay = [w for e in essay for w in e]
# Class labels in the same order as the columns of `probs`.
class_labels = ['control_m', 'control_f', 'aff_m', 'aff_f']
df1, df2, df3, df4 = [
    pd.DataFrame({'feature': flat_words,
                  'prob': probs[:, col],
                  'class': [label] * len(probs),
                  'essay_num': flat_essay})
    for col, label in enumerate(class_labels)
]
df = pd.concat([df1,df2,df3,df4])
df.to_csv('output/nn_probs.csv', index=False, encoding='utf-8')

In [7]:
# Per-word probability shifts for the first sample in the flute_* files,
# written out as one row per word per class.
index_word = np.load('output/flute_iw.npy').item()
word_index = np.load('output/flute_iw_inv.npy').item()
preds = np.load('output/flute_preds.npy')
x_dat = np.load('output/flute_xdat.npy')
word_dists = transitions(x_dat[0], preds[0], index_word)
# One frame per class; the label order follows the columns of the shift array.
df1, df2, df3, df4 = [
    pd.DataFrame({'word': word_dists[0],
                  'prob': word_dists[1][:, column],
                  'class': [label] * len(word_dists[0])})
    for column, label in enumerate(['control_m', 'control_f', 'aff_m', 'aff_f'])
]
df = pd.concat([df1, df2, df3, df4])
df.to_csv('output/flute_probs.csv', index=False, encoding='utf-8')

# Justifications
(with others)

In [16]:
# Load the saved lookups, predictions and inputs from the *_just files.
index_word = np.load('output/testsentiw_just.npy').item()
word_index = np.load('output/testsentiw_inv_just.npy').item()
preds = np.load('output/testsentpreds_just.npy')
x_dat = np.load('output/testsentxdat_just.npy')
# Wrap the saved table directly in a DataFrame with named columns.
df = pd.DataFrame(np.load('output/testdat_just.npy'),
                  columns=['justification', 'text', 'value'])

In [17]:
# Build a long-format table with one row per word occurrence per class,
# carrying both the model output at the word (`prob`) and the change from the
# previous position (`prob_shift`), plus the sentence's metadata from `df`.
words = []
essay = []
val = []
just = []
prob_chunks = []
shift_chunks = []
for i in range(len(x_dat)):
    word_dists = transitions(x_dat[i], preds[i], index_word)
    word_probs = prob_reshape(x_dat[i], preds[i], index_word)
    n_words = len(word_dists[0])
    words.append(word_dists[0])
    essay.append([i] * n_words)
    val.append([df.value.loc[i]] * n_words)
    just.append([df.justification.loc[i]] * n_words)
    shift_chunks.append(word_dists[1])
    prob_chunks.append(word_probs[1])
# Join the per-sentence arrays once instead of np.append-ing inside the loop,
# which re-copied the whole accumulated array on every iteration. The empty
# (0, 4) float64 seed keeps the original result when x_dat is empty.
prob_shifts = np.concatenate([np.empty((0, 4))] + shift_chunks, axis=0)
probs = np.concatenate([np.empty((0, 4))] + prob_chunks, axis=0)

# Flatten the per-sentence lists once; all four class frames share them.
flat_words = [w for e in words for w in e]
flat_sentence = [w for e in essay for w in e]
flat_just = [w for e in just for w in e]
flat_val = [p for w in val for p in w]
# Class labels in the same order as the columns of `probs` / `prob_shifts`.
class_labels = ['control_m', 'control_f', 'aff_m', 'aff_f']
df1, df2, df3, df4 = [
    pd.DataFrame({'word': flat_words,
                  'prob': probs[:, col],
                  'prob_shift': prob_shifts[:, col],
                  'class': [label] * len(probs),
                  'sentence_num': flat_sentence,
                  'just': flat_just,
                  'val': flat_val})
    for col, label in enumerate(class_labels)
]
df_w = pd.concat([df1,df2,df3,df4])

In [18]:
# Write the per-word table `df_w` to CSV without the row index.
df_w.to_csv('output/nnprobs_just.csv', index=False, encoding='utf-8')

# Justifications
(with self)

In [19]:
# Load the saved lookups, predictions and inputs from the *_justself files.
index_word = np.load('output/testsentiw_justself.npy').item()
word_index = np.load('output/testsentiw_inv_justself.npy').item()
preds = np.load('output/testsentpreds_justself.npy')
x_dat = np.load('output/testsentxdat_justself.npy')
# Wrap the saved table directly in a DataFrame with named columns.
df = pd.DataFrame(np.load('output/testdat_justself.npy'),
                  columns=['justification', 'text', 'value'])

In [20]:
# Build a long-format table with one row per word occurrence per class,
# carrying both the model output at the word (`prob`) and the change from the
# previous position (`prob_shift`), plus the sentence's metadata from `df`.
words = []
essay = []
val = []
just = []
prob_chunks = []
shift_chunks = []
for i in range(len(x_dat)):
    word_dists = transitions(x_dat[i], preds[i], index_word)
    word_probs = prob_reshape(x_dat[i], preds[i], index_word)
    n_words = len(word_dists[0])
    words.append(word_dists[0])
    essay.append([i] * n_words)
    val.append([df.value.loc[i]] * n_words)
    just.append([df.justification.loc[i]] * n_words)
    shift_chunks.append(word_dists[1])
    prob_chunks.append(word_probs[1])
# Join the per-sentence arrays once instead of np.append-ing inside the loop,
# which re-copied the whole accumulated array on every iteration. The empty
# (0, 4) float64 seed keeps the original result when x_dat is empty.
prob_shifts = np.concatenate([np.empty((0, 4))] + shift_chunks, axis=0)
probs = np.concatenate([np.empty((0, 4))] + prob_chunks, axis=0)

# Flatten the per-sentence lists once; all four class frames share them.
flat_words = [w for e in words for w in e]
flat_sentence = [w for e in essay for w in e]
flat_just = [w for e in just for w in e]
flat_val = [p for w in val for p in w]
# Class labels in the same order as the columns of `probs` / `prob_shifts`.
class_labels = ['control_m', 'control_f', 'aff_m', 'aff_f']
df1, df2, df3, df4 = [
    pd.DataFrame({'word': flat_words,
                  'prob': probs[:, col],
                  'prob_shift': prob_shifts[:, col],
                  'class': [label] * len(probs),
                  'sentence_num': flat_sentence,
                  'just': flat_just,
                  'val': flat_val})
    for col, label in enumerate(class_labels)
]
df_w = pd.concat([df1,df2,df3,df4])

In [21]:
# Write the per-word table `df_w` to CSV without the row index.
df_w.to_csv('output/nnprobs_justself.csv', index=False, encoding='utf-8')