In [95]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)
__docformat__ = 'restructedtext en'
import timeit
import numpy
import scipy.io

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys
BASE_DIR = '/Users/triddle/Documents/Gits/'
GLOVE_DIR = BASE_DIR + '/Data/'
#MAX_SEQUENCE_LENGTH = 210
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


In [96]:
# for custom metrics
import keras.backend as K
from keras.utils.generic_utils import get_from_module

def categorical_accuracy(y_true, y_pred):
    """Categorical accuracy metric.

    Computes the mean accuracy rate across all predictions for
    multiclass classification problems.
    """
    return K.mean(K.equal(K.argmax(y_true, axis=-1),
                          K.argmax(y_pred, axis=-1)))


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can be
    classified as sets of labels. By only using accuracy (precision) a model
    would achieve a perfect score by simply assigning every class to every
    input. In order to avoid this, a metric should penalize incorrect class
    assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0)
    computes this, as a weighted mean of the proportion of correct class
    assignments vs. the proportion of incorrect class assignments.

    With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning
    correct classes becomes more important, and with beta > 1 the metric is
    instead weighted towards penalizing incorrect class assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.

    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)


In [97]:
def create_dat(df_ess, df_dem):
    df_ess = df_ess[df_ess.Study=='Connecticut']
    df_ess.Condition.replace(['c', 'c2', 'c1', 'c3', 'ca', 'cb', '3'], 'Control', inplace=True)
    df_ess.Condition.replace(['t', 't2', 't3', 't1', '1', '2', 'ta', 'tb'], 'Treatment', inplace=True)
    df_ess.Condition.replace(['c/t'], np.nan, inplace=True)
    
    df_dem = df_dem[df_dem.Study=='Connecticut']
    df_dem.Ethnicity.replace('Asian', 'Asian American', inplace=True)
    df_dem.Ethnicity.replace('Other/Mixed', 'Other', inplace=True)
    df_dem = df_dem[['ID', 'Ethnicity', 'Gender']].dropna()
    
    outdat = pd.merge(df_ess[['ID', 'Intervention_number', 'Essay', 'Condition', 'Intervention_Date', 'corrected']], 
                    df_dem, how='left', on='ID').drop_duplicates()
    
    return(outdat)

# Here, we're preparing the data

In [124]:
import pandas as pd
import matplotlib
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import numpy as np
# second, prepare text samples and their labels
print('Processing text dataset')
df_ess = pd.read_csv('../../Data/3 CSV Files/essays1.23.16.csv', sep='|')
df_dem = pd.read_csv('../../Data/3 CSV Files/demog3.2.16.csv')
df = create_dat(df_ess, df_dem)
df.dropna(axis=0, subset=['corrected', 'Condition'], inplace=True)
df['word_count'] = df.corrected.apply(lambda x: len(x.split()))
print('Found %s total students' % len(df.ID.value_counts()))
print(df.Gender.value_counts())
print(df.Condition.value_counts())
print('mean essay length = %s' % np.mean(df.word_count))
print('sd essay length = %s' % np.std(df.word_count))
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
labels_index['conm']=0
labels_index['conf']=1
labels_index['affm']=2
labels_index['afff']=3
texts = df["corrected"].tolist()
labelsType = df["Condition"].tolist()
labelsRace = df["Gender"].tolist()
labelsCombined=[]
for i in range(len(labelsType)):
    if labelsType[i]=='Control' and labelsRace[i]=='m':
        labelsCombined.append(0)
    elif labelsType[i]=='Control' and labelsRace[i]=='f':
        labelsCombined.append(1)
    elif labelsType[i]=='Treatment' and labelsRace[i]=='m':
        labelsCombined.append(2)
    elif labelsType[i]=='Treatment' and labelsRace[i]=='f':
        labelsCombined.append(3)
        
print('Found %s texts.' % len(texts))

print(df.groupby(['Gender', 'Condition']).size())

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxseqval=0
for i in range(len(sequences)):
    if len(sequences[i])>maxseqval:
        maxseqval=len(sequences[i])
        
MAX_SEQUENCE_LENGTH = maxseqval

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labelsCombined))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
#VALIDATION_SPLIT = 0.15
TEST_SPLIT = 0.15
nb_test_samples = int(TEST_SPLIT * data.shape[0])
#nb_validation_samples = int(VALIDATION_SPLIT * (data.shape[0]-nb_test_samples))

x_train = data[:-nb_test_samples]
y_train = labels[:-nb_test_samples]
#x_val = data[-nb_validation_samples:]
#y_val = labels[-nb_validation_samples:]
#y_val=np.swapaxes(np.tile(y_val, (MAX_SEQUENCE_LENGTH, 1, 1)),0,1)
#y_train=np.swapaxes(np.tile(y_train, (MAX_SEQUENCE_LENGTH, 1, 1)),0,1)
x_test = data[:nb_test_samples]
y_test = labels[:nb_test_samples]

textstest=[]
textsshuff=[]
for i in indices:
  textsshuff.append(texts[i])
index_word={}
for i,x in word_index.items():
    index_word[x]=i
textstest=textsshuff[:nb_test_samples]

thefile = open('../output/gendercleantimedistmodeltesttexts.txt', 'w')
for item in textstest:
  thefile.write("%s\n" % item)  

np.save('../output/gendercleantimedistmodelxtest.npy', x_test)
np.save('../output/gendercleantimedistmodelytest.npy', y_test)

print('Shape of xtrain tensor:', x_train.shape)
print('Shape of ytrain tensor:', y_train.shape)
#print('Shape of xval tensor:', x_val.shape)
#print('Shape of yval tensor:', y_val.shape)
print('Shape of xtest tensor:', x_test.shape)
print('Shape of ytest tensor:', y_test.shape)

print('Preparing embedding matrix.')
# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)


Processing text dataset
Found 1232 total students
f    3315
m    2909
Name: Gender, dtype: int64
Control      3473
Treatment    2751
Name: Condition, dtype: int64
mean essay length = 42.8125
sd essay length = 23.8684140013
Found 6224 texts.
Gender  Condition
f       Control      1851
        Treatment    1464
m       Control      1622
        Treatment    1287
dtype: int64
Found 5816 unique tokens.
Shape of data tensor: (6224, 208)
Shape of label tensor: (6224, 4)
Shape of xtrain tensor: (5291, 208)
Shape of ytrain tensor: (5291, 4)
Shape of xtest tensor: (933, 208)
Shape of ytest tensor: (933, 4)
Preparing embedding matrix.


# Define and fit the model

In [99]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import load_model

model = Sequential()
model.add(Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True,dropout=0.2))
model.add(Dropout(0.5))
model.add(LSTM(50))
model.add(Dropout(0.5))
model.add(Dense(len(labels_index), activation='softmax'))
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_24 (Embedding)         (None, 208, 300)      1745100     embedding_input_16[0][0]         
____________________________________________________________________________________________________
dropout_35 (Dropout)             (None, 208, 300)      0           embedding_24[0][0]               
____________________________________________________________________________________________________
lstm_20 (LSTM)                   (None, 50)            70200       dropout_35[0][0]                 
____________________________________________________________________________________________________
dropout_36 (Dropout)             (None, 50)            0           lstm_20[0][0]                    
___________________________________________________________________________________________

In [100]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=[precision,recall,fmeasure,categorical_accuracy])
bz=128
model.fit(x_train, y_train, 
          nb_epoch=35, batch_size=bz)
model.save_weights('../output/model_compfinalgendermodel.h5')

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


# Adjust model and get predictions

In [101]:
from keras.layers import TimeDistributed
model.load_weights('../output/model_compfinalgendermodel.h5')
layer_dict = dict([(layer.name, layer) for layer in model.layers])
print(layer_dict.keys())
model.summary()

['lstm_20', 'dense_20', 'embedding_24', 'dropout_36', 'dropout_35']
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_24 (Embedding)         (None, 208, 300)      1745100     embedding_input_16[0][0]         
____________________________________________________________________________________________________
dropout_35 (Dropout)             (None, 208, 300)      0           embedding_24[0][0]               
____________________________________________________________________________________________________
lstm_20 (LSTM)                   (None, 50)            70200       dropout_35[0][0]                 
____________________________________________________________________________________________________
dropout_36 (Dropout)             (None, 50)            0           lstm_20[0][0]                    
_______________________

In [102]:
dens=layer_dict['dense_20'].get_weights()
lstmw=layer_dict['lstm_20'].get_weights()

In [103]:
#layer_dict['lstm_2'].get_weights()
model = Sequential()
model.add(Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True,dropout=0.2))
model.add(Dropout(0.5))
lstmout = LSTM(50,
               return_sequences=True,
               stateful=False,weights=lstmw)
model.add(lstmout)
model.add(Dropout(0.5))
templayer=TimeDistributed(Dense(len(labels_index), activation='softmax',weights=dens))
model.add(templayer)
model.load_weights('../output/model_compfinalgendermodel.h5')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_25 (Embedding)         (None, 208, 300)      1745100     embedding_input_17[0][0]         
____________________________________________________________________________________________________
dropout_37 (Dropout)             (None, 208, 300)      0           embedding_25[0][0]               
____________________________________________________________________________________________________
lstm_21 (LSTM)                   (None, 208, 50)       70200       dropout_37[0][0]                 
____________________________________________________________________________________________________
dropout_38 (Dropout)             (None, 208, 50)       0           lstm_21[0][0]                    
___________________________________________________________________________________________

In [104]:
outgendernewviz=model.predict(x_test,verbose=1)
np.save('../output/model_compfinalgenderdict.npy', index_word) 
np.save('../output/model_compfinalgenderdictinv.npy', word_index) 
np.save('../output/model_compfinalgenderoutput.npy', outgendernewviz) 
np.save('../output/model_compfinalgenderxtestdata.npy', x_test) 
np.save('../output/model_compfinalgenderytestdata.npy', y_test) 
thefile = open('../output/model_compfinalgendertextsinput.txt', 'w')
for item in textstest:
  thefile.write("%s\n" % item) 



In [105]:
from sklearn.metrics import f1_score
f1_score(np.argmax(y_test, axis=1), np.argmax(outgendernewviz[:, 207, :], axis=1), average='macro', pos_label=None)

0.85138832960220401

In [106]:
#flute prediction
essay = ['They are important because I like listening to music and playing my flute. I like having friends and I also enjoy being funny sometimes.']
test_seq = tokenizer.texts_to_sequences(essay)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('../output/flute_iw.npy', index_word)
np.save('../output/flute_iw_inv.npy', word_index)
np.save('../output/flute_preds.npy', preds)
np.save('../output/flute_xdat.npy', full_dat)
#np.save('../output/testdat_just.npy', df)



# Refit the model on the whole dataset

In [108]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data
y_train = labels

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
#VALIDATION_SPLIT = 0.15
#nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data
y_train = labels

textstest=[]
textsshuff=[]
for i in indices:
  textsshuff.append(texts[i])
index_word={}
for i,x in word_index.items():
    index_word[x]=i
textstest=textsshuff

thefile = open('gendercleantimedistmodeltesttexts.txt', 'w')
for item in textstest:
  thefile.write("%s\n" % item)  

#np.save('gendercleantimedistmodelxtest.npy', x_test)
#np.save('gendercleantimedistmodelytest.npy', y_test)

print('Shape of xtrain tensor:', x_train.shape)
print('Shape of ytrain tensor:', y_train.shape)
#print('Shape of xval tensor:', x_val.shape)
#print('Shape of yval tensor:', y_val.shape)
#print('Shape of xtest tensor:', x_test.shape)
#print('Shape of ytest tensor:', y_test.shape)

print('Preparing embedding matrix.')
# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

Shape of xtrain tensor: (6224, 208)
Shape of ytrain tensor: (6224, 4)
Preparing embedding matrix.


# Next, we define the model

In [109]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import load_model

model = Sequential()
model.add(Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True,dropout=0.2))
model.add(Dropout(0.5))
model.add(LSTM(50))
model.add(Dropout(0.5))
model.add(Dense(len(labels_index), activation='softmax'))
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_27 (Embedding)         (None, 208, 300)      1745100     embedding_input_18[0][0]         
____________________________________________________________________________________________________
dropout_39 (Dropout)             (None, 208, 300)      0           embedding_27[0][0]               
____________________________________________________________________________________________________
lstm_22 (LSTM)                   (None, 50)            70200       dropout_39[0][0]                 
____________________________________________________________________________________________________
dropout_40 (Dropout)             (None, 50)            0           lstm_22[0][0]                    
___________________________________________________________________________________________

# Fit the model

I'm fitting it to all the data here, so as to make predictions using novel sentences

In [110]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=[precision,recall,fmeasure,categorical_accuracy])
print(model.summary())
bz=128
model.fit(x_train, y_train, #validation_data=(x_val, y_val),
          nb_epoch=35, batch_size=bz)
model.save_weights('../output/full_finalgendermodel.h5')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_27 (Embedding)         (None, 208, 300)      1745100     embedding_input_18[0][0]         
____________________________________________________________________________________________________
dropout_39 (Dropout)             (None, 208, 300)      0           embedding_27[0][0]               
____________________________________________________________________________________________________
lstm_22 (LSTM)                   (None, 50)            70200       dropout_39[0][0]                 
____________________________________________________________________________________________________
dropout_40 (Dropout)             (None, 50)            0           lstm_22[0][0]                    
___________________________________________________________________________________________

# Next, restructure the model to make predictions

In [111]:
from keras.layers import TimeDistributed
model.load_weights('../output/full_finalgendermodel.h5')
model.summary()
layer_dict = dict([(layer.name, layer) for layer in model.layers])
layer_dict.keys()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_27 (Embedding)         (None, 208, 300)      1745100     embedding_input_18[0][0]         
____________________________________________________________________________________________________
dropout_39 (Dropout)             (None, 208, 300)      0           embedding_27[0][0]               
____________________________________________________________________________________________________
lstm_22 (LSTM)                   (None, 50)            70200       dropout_39[0][0]                 
____________________________________________________________________________________________________
dropout_40 (Dropout)             (None, 50)            0           lstm_22[0][0]                    
___________________________________________________________________________________________

['dense_22', 'dropout_39', 'lstm_22', 'dropout_40', 'embedding_27']

In [112]:
dens=layer_dict['dense_22'].get_weights()
lstmw=layer_dict['lstm_22'].get_weights()

In [113]:
model = Sequential()
model.add(Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True,dropout=0.2))
model.add(Dropout(0.5))
lstmout = LSTM(50,
               return_sequences=True,
               stateful=False,weights=lstmw)
model.add(lstmout)
model.add(Dropout(0.5))
templayer=TimeDistributed(Dense(len(labels_index), activation='softmax',weights=dens))
model.add(templayer)
model.load_weights('../output/full_finalgendermodel.h5')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_28 (Embedding)         (None, 208, 300)      1745100     embedding_input_19[0][0]         
____________________________________________________________________________________________________
dropout_41 (Dropout)             (None, 208, 300)      0           embedding_28[0][0]               
____________________________________________________________________________________________________
lstm_23 (LSTM)                   (None, 208, 50)       70200       dropout_41[0][0]                 
____________________________________________________________________________________________________
dropout_42 (Dropout)             (None, 208, 50)       0           lstm_23[0][0]                    
___________________________________________________________________________________________

# Here, I make some novel sentences to pass to the network
(Justification with relationships)

In [118]:
import itertools
texts = ['Athletic ability is important to me because ', 
         'Art is important to me because ', 
         'Being smart is important to me because ', 
         'Getting good grades is important to me because ', 
         'Creativity is important to me because ', 
         'Independence is important to me because ', 
         'Social groups are important to me because ', 
         'Music is important to me because ', 
         'Politics is important to me because ', 
         'Relationships is important to me because ', 
         'Religion is important to me because ', 
         'Sense of humor is important to me because ', 
         'Living in the moment is important to me because ']

justification = ['my friends', 'my family', 'my grandparents', 'my parents', 'my teacher', 
                 'my friend', 'my dad', 'my mom', 'my sister', 'my brother', 
                 'my mother', 'my father', 'my cousin', 'my aunt', 'my uncle']


a = list(itertools.product(texts, justification))
combos = []
value = []
just = []
for i in a:
    combos.append(i[0]+i[1])
    value.append(i[0].split()[0])
    just.append(i[1].split()[1])
df = pd.DataFrame({'text':combos,
                  'value':value,
                  'justification':just})
df.shape

(195, 3)

# Make predictions and save important bits

In [119]:
test_seq = tokenizer.texts_to_sequences(combos)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('../output/testsentiw_just.npy', index_word)
np.save('../output/testsentiw_inv_just.npy', word_index)
np.save('../output/testsentpreds_just.npy', preds)
np.save('../output/testsentxdat_just.npy', full_dat)
np.save('../output/testdat_just.npy', df)



# Here, I make some novel sentences to pass to the network
(Justification with the self)

In [120]:
texts = ['Athletic ability is important to me because ', 
         'Art is important to me because ', 
         'Being smart is important to me because ', 
         'Getting good grades is important to me because ', 
         'Creativity is important to me because ', 
         'Independence is important to me because ', 
         'Social groups are important to me because ', 
         'Music is important to me because ', 
         'Politics is important to me because ', 
         'Relationships is important to me because ', 
         'Religion is important to me because ', 
         'Sense of humor is important to me because ', 
         'Living in the moment is important to me because ']

justification = ['I want', 'I need', 'I will', 'I have', 'I can', 
                 'I feel', 'I should', 'I would', 'I hope', 'I am', 
                 'I get', 'I might', 'I use', 'I like', 'I take']


a = list(itertools.product(texts, justification))
combos = []
value = []
just = []
for i in a:
    combos.append(i[0]+i[1])
    value.append(i[0].split()[0])
    just.append(i[1].split()[1])
df = pd.DataFrame({'text':combos,
                  'value':value,
                  'justification':just})

# Make predictions and save important bits

In [121]:
test_seq = tokenizer.texts_to_sequences(combos)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('../output/testsentiw_justself.npy', index_word)
np.save('../output/testsentiw_inv_justself.npy', word_index)
np.save('../output/testsentpreds_justself.npy', preds)
np.save('../output/testsentxdat_justself.npy', full_dat)
np.save('../output/testdat_justself.npy', df)



# Below here is other stuff not in the paper

## Here, I make some novel sentences to pass to the network
(social words)

In [17]:
import spacy
nlp = spacy.load('en')

In [69]:
df = pd.read_csv('../../liwcpy/liwc-dict/Social.csv', header=None, names=['word'], encoding='UTF-8')
df = df[~df.word.str.contains('\*')]

In [70]:
df['pos'] = df['word'].apply(lambda x: nlp(x)[0].pos_)
df['lemma'] = df['word'].apply(lambda x: nlp(x)[0].lemma_)

df['sent'] = 'These values are important because my'
df.sent.loc[df.pos=='VERB'] = 'These values'
df.sent.loc[df.pos=='PRON'] = 'These values are important to'
df.sent.loc[df.pos=='ADJ'] = 'These values are important because my'
df.sent.loc[df.pos=='INTJ'] = ''
df.sent.loc[df.pos=='ADV'] = 'These values are important because my'
df['test_sent'] = df.sent+' '+df.word
df['sent_wo'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='VERB'] = 'These values don\'t' 
df.sent_wo.loc[df.pos=='PRON'] = 'These values are not important to'
df.sent_wo.loc[df.pos=='ADJ'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='INTJ'] = ''
df.sent_wo.loc[df.pos=='ADV'] = 'These values are important because without my'
df['test_sent_wo'] = df.sent_wo+' '+df.lemma
df.test_sent_wo.loc[df.pos=='PRON'] = df.sent_wo + ' ' + df.word

## Make predictions and save important bits

In [71]:
subdat = df.drop_duplicates(subset='test_sent')
subdat['test_sent'] = subdat['test_sent'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('testsentiw_soc.npy', index_word)
np.save('testsentiw_inv_soc.npy', word_index)
np.save('testsentpreds_soc.npy', preds)
np.save('testsentxdat_soc.npy', full_dat)
np.save('testdat_soc.npy', subdat)

subdat['test_sent_wo'] = subdat['test_sent_wo'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent_wo'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds_wo = model.predict(full_dat, verbose=1)
word_index_wo = tokenizer.word_index
index_word_wo={}
for i,x in word_index_wo.items():
    index_word_wo[x]=i
np.save('testsentiw_wo_soc.npy', index_word_wo)
np.save('testsentiw_inv_wo_soc.npy', word_index_wo)
np.save('testsentpreds_wo_soc.npy', preds_wo)
np.save('testsentxdat_wo_soc.npy', full_dat)



## Here, I make some novel sentences to pass to the network
(positive emotion words)

In [73]:
df = pd.read_csv('../../liwcpy/liwc-dict/Posemo.csv', header=None, names=['word'], encoding='UTF-8')

In [74]:
df = df[~df.word.str.contains('\*')]

In [75]:
#df[df.word.str.contains('\*')]
df['pos'] = df['word'].apply(lambda x: nlp(x)[0].pos_)
df['lemma'] = df['word'].apply(lambda x: nlp(x)[0].lemma_)

df['sent'] = 'These values are important because my'
df.sent.loc[df.pos=='VERB'] = 'These values'
df.sent.loc[df.pos=='PRON'] = 'These values are important to'
df.sent.loc[df.pos=='ADJ'] = 'These values are important because my'
df.sent.loc[df.pos=='INTJ'] = ''
df.sent.loc[df.pos=='ADV'] = 'These values are important because my'
df['test_sent'] = df.sent+' '+df.word
df['sent_wo'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='VERB'] = 'These values don\'t' 
df.sent_wo.loc[df.pos=='PRON'] = 'These values are not important to'
df.sent_wo.loc[df.pos=='ADJ'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='INTJ'] = ''
df.sent_wo.loc[df.pos=='ADV'] = 'These values are important because without my'
df['test_sent_wo'] = df.sent_wo+' '+df.lemma
df.test_sent_wo.loc[df.pos=='PRON'] = df.sent_wo + ' ' + df.word

## Make predictions and save important bits

In [76]:
subdat = df.drop_duplicates(subset='test_sent')
subdat['test_sent'] = subdat['test_sent'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('testsentiw_posem.npy', index_word)
np.save('testsentiw_inv_posem.npy', word_index)
np.save('testsentpreds_posem.npy', preds)
np.save('testsentxdat_posem.npy', full_dat)
np.save('testdat_posem.npy', subdat)

subdat['test_sent_wo'] = subdat['test_sent_wo'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent_wo'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds_wo = model.predict(full_dat, verbose=1)
word_index_wo = tokenizer.word_index
index_word_wo={}
for i,x in word_index_wo.items():
    index_word_wo[x]=i
np.save('testsentiw_wo_posem.npy', index_word_wo)
np.save('testsentiw_inv_wo_posem.npy', word_index_wo)
np.save('testsentpreds_wo_posem.npy', preds_wo)
np.save('testsentxdat_wo_posem.npy', full_dat)



## Here, I make some novel sentences to pass to the network
(Negative emotion words)

In [77]:
df = pd.read_csv('../../liwcpy/liwc-dict/Negemo.csv', header=None, names=['word'], encoding='UTF-8')

In [78]:
df = df[~df.word.str.contains('\*')]

In [79]:
#df[df.word.str.contains('\*')]
df['pos'] = df['word'].apply(lambda x: nlp(x)[0].pos_)
df['lemma'] = df['word'].apply(lambda x: nlp(x)[0].lemma_)

df['sent'] = 'These values are important because my'
df.sent.loc[df.pos=='VERB'] = 'These values'
df.sent.loc[df.pos=='PRON'] = 'These values are important to'
df.sent.loc[df.pos=='ADJ'] = 'These values are important because my'
df.sent.loc[df.pos=='INTJ'] = ''
df.sent.loc[df.pos=='ADV'] = 'These values are important because my'
df['test_sent'] = df.sent+' '+df.word
df['sent_wo'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='VERB'] = 'These values don\'t' 
df.sent_wo.loc[df.pos=='PRON'] = 'These values are not important to'
df.sent_wo.loc[df.pos=='ADJ'] = 'These values are important because without my'
df.sent_wo.loc[df.pos=='INTJ'] = ''
df.sent_wo.loc[df.pos=='ADV'] = 'These values are important because without my'
df['test_sent_wo'] = df.sent_wo+' '+df.lemma
df.test_sent_wo.loc[df.pos=='PRON'] = df.sent_wo + ' ' + df.word

## Make predictions and save important bits

In [80]:
subdat = df.drop_duplicates(subset='test_sent')
subdat['test_sent'] = subdat['test_sent'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('testsentiw_negem.npy', index_word)
np.save('testsentiw_inv_negem.npy', word_index)
np.save('testsentpreds_negem.npy', preds)
np.save('testsentxdat_negem.npy', full_dat)
np.save('testdat_negem.npy', subdat)

subdat['test_sent_wo'] = subdat['test_sent_wo'].apply(lambda x: str(x))
test_seq = tokenizer.texts_to_sequences(subdat['test_sent_wo'])
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds_wo = model.predict(full_dat, verbose=1)
word_index_wo = tokenizer.word_index
index_word_wo={}
for i,x in word_index_wo.items():
    index_word_wo[x]=i
np.save('testsentiw_wo_negem.npy', index_word_wo)
np.save('testsentiw_inv_wo_negem.npy', word_index_wo)
np.save('testsentpreds_wo_negem.npy', preds_wo)
np.save('testsentxdat_wo_negem.npy', full_dat)



## Here, I make some novel sentences to pass to the network
(Value-specific sentences)

In [81]:
texts = ['Athletic ability is important to me', 
         'Art is important to me', 
         'Being smart is important to me', 
         'Getting good grades is important to me', 
         'Creativity is important to me', 
         'Independence is important to me', 
         'Social groups are important to me', 
         'Music is important to me', 
         'Politics is important to me', 
         'Relationships is important to me', 
         'Religion is important to me', 
         'Sense of humor is important to me', 
         'Living in the moment is important to me']

val = ['Athletics', 'Art', 'Being Smart', 'Good Grades', 'Creativity', 
      'Independence', 'Social Groups', 'Music', 'Politics', 'Relationships',
      'Religion', 'Sense of Humor', 'Living in the Moment']

texts_not = ['Athletic ability is not important to me', 
         'Art is not important to me', 
         'Being smart is not important to me', 
         'Getting good grades is not important to me', 
         'Creativity is not important to me', 
         'Independence is not important to me', 
         'Social groups are not important to me', 
         'Music is not important to me', 
         'Politics is not important to me', 
         'Relationships is not important to me', 
         'Religion is not important to me', 
         'Sense of humor is not important to me', 
         'Living in the moment is not important to me']

df = pd.DataFrame({'text':texts,
                  'value':val})

## Make predictions and save important bits

In [82]:
test_seq = tokenizer.texts_to_sequences(texts)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
index_word={}
for i,x in word_index.items():
    index_word[x]=i
np.save('testsentiw_val.npy', index_word)
np.save('testsentiw_inv_val.npy', word_index)
np.save('testsentpreds_val.npy', preds)
np.save('testsentxdat_val.npy', full_dat)
np.save('testdat_val.npy', df)

test_seq = tokenizer.texts_to_sequences(texts_not)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds_not = model.predict(full_dat, verbose=1)
word_index_not = tokenizer.word_index
index_word_not={}
for i,x in word_index_not.items():
    index_word_not[x]=i
np.save('testsentiw_wo_val.npy', index_word_not)
np.save('testsentiw_inv_wo_val.npy', word_index_not)
np.save('testsentpreds_wo_val.npy', preds_not)
np.save('testsentxdat_wo_val.npy', full_dat)



In [42]:
texts = ['Athletic ability is important to me', 
         'Art is important to me', 
         'Being smart is important to me', 
         'Getting good grades is important to me', 
         'Creativity is important to me', 
         'Independence is important to me', 
         'Social groups are important to me', 
         'Music is important to me', 
         'Politics is important to me', 
         'Relationships is important to me', 
         'Religion is important to me', 
         'Sense of humor is important to me', 
         'Living in the moment is important to me']
test_seq = tokenizer.texts_to_sequences(texts)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)



overleaf create project, add acl style file and start editing!