In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)
__docformat__ = 'restructedtext en'
import timeit
import numpy
import scipy.io

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys
BASE_DIR = '/Users/travis/Documents/Gits/'
GLOVE_DIR = BASE_DIR + '/Data/'
#MAX_SEQUENCE_LENGTH = 210
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
# `with` guarantees the (multi-gigabyte) file is closed even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt')) as f:
    for line in f:
        # Peel exactly EMBEDDING_DIM numeric fields off the right-hand side.
        # A bare split() also splits on any whitespace inside the token
        # itself, which shifts the fields and makes the float conversion fail.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # blank or truncated line: nothing usable to index
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Using Theano backend.


Indexing word vectors.
Found 2196016 word vectors.


In [2]:
# for custom metrics
import keras.backend as K
from keras.utils.generic_utils import get_from_module

def categorical_accuracy(y_true, y_pred):
    """Mean rate at which the arg-max class of `y_pred` equals that of `y_true`.

    Both arguments are reduced with an arg-max over their last axis before
    being compared element-wise; the mean of the matches is returned.
    """
    true_class = K.argmax(y_true, axis=-1)
    predicted_class = K.argmax(y_pred, axis=-1)
    matches = K.equal(true_class, predicted_class)
    return K.mean(matches)


def precision(y_true, y_pred):
    """Batch-wise precision: rounded true positives over rounded predicted positives.

    The value is computed over whatever batch is passed in, not accumulated
    globally. K.epsilon() is added to the denominator so a batch with no
    predicted positives yields 0 instead of a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    selected = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(selected) + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall: rounded true positives over rounded actual positives.

    The value is computed over whatever batch is passed in, not accumulated
    globally. K.epsilon() is added to the denominator so a batch with no
    positive labels yields 0 instead of a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F-beta score from the batch-wise precision and recall.

    The score is (1 + beta**2) * p * r / (beta**2 * p + r), i.e. the weighted
    harmonic mean of precision `p` and recall `r`. It is only computed as a
    batch-wise average, not globally.

    With beta = 1 this is the usual F-measure. beta = 0 reduces to precision,
    beta < 1 weights precision more heavily and beta > 1 weights recall more
    heavily.

    Raises:
        ValueError: if `beta` is negative.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # No explicit "no positives -> 0" guard is needed (or possible in plain
    # Python): y_true is a symbolic backend tensor, so an `if K.sum(...) == 0`
    # test is evaluated once while the graph is built and never reflects the
    # batch values. The arithmetic already gives 0 in that case, because the
    # true-positive count is 0 and K.epsilon() keeps every denominator
    # non-zero.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())


def fmeasure(y_true, y_pred):
    """F-measure: the F-beta score with beta fixed at 1.

    Delegates to fbeta_score, so it is likewise a batch-wise value rather
    than a global one.
    """
    return fbeta_score(y_true, y_pred, 1)


In [3]:
def create_dat(df_ess, df_dem):
    """Join Connecticut essays with the writers' ethnicity and gender.

    Args:
        df_ess: essay-level frame with at least the columns Study, ID,
            Intervention_number, Essay, Condition, Intervention_Date and
            corrected.
        df_dem: demographics frame with at least the columns Study, ID,
            Ethnicity and Gender.

    Returns:
        A de-duplicated left merge (on ID) of the Connecticut essays with the
        Connecticut demographics. Condition codes are collapsed to
        'Control' / 'Treatment' ('c/t' becomes NaN) and the Ethnicity labels
        'Asian' and 'Other/Mixed' are renamed. Demographic rows missing ID,
        Ethnicity or Gender are dropped before the merge, so the affected
        essays carry NaN in those columns. Neither input frame is modified.
    """
    # Work on explicit copies and assign the replaced columns back. Calling
    # replace(..., inplace=True) on a column of a filtered slice triggers
    # SettingWithCopyWarning and may not write through to the frame at all.
    essays = df_ess[df_ess.Study == 'Connecticut'].copy()
    condition = essays['Condition']
    condition = condition.replace(['c', 'c2', 'c1', 'c3', 'ca', 'cb', '3'], 'Control')
    condition = condition.replace(['t', 't2', 't3', 't1', '1', '2', 'ta', 'tb'], 'Treatment')
    condition = condition.replace(['c/t'], np.nan)
    essays['Condition'] = condition

    demog = df_dem[df_dem.Study == 'Connecticut'].copy()
    ethnicity = demog['Ethnicity']
    ethnicity = ethnicity.replace('Asian', 'Asian American')
    ethnicity = ethnicity.replace('Other/Mixed', 'Other')
    demog['Ethnicity'] = ethnicity
    demog = demog[['ID', 'Ethnicity', 'Gender']].dropna()

    essay_cols = ['ID', 'Intervention_number', 'Essay', 'Condition',
                  'Intervention_Date', 'corrected']
    outdat = pd.merge(essays[essay_cols], demog, how='left', on='ID').drop_duplicates()

    return outdat

# Here, we're preparing the data

In [4]:
import pandas as pd
import matplotlib
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import numpy as np
from sklearn.cross_validation import train_test_split
# second, prepare text samples and their labels
print('Processing text dataset')
df_ess = pd.read_csv('../../Data/3 CSV Files/essays1.23.16.csv', sep='|')
df_dem = pd.read_csv('../../Data/3 CSV Files/demog3.2.16.csv')
df = create_dat(df_ess, df_dem)
df = df.dropna(axis=0, subset=['corrected', 'Condition'])

# dictionary mapping label name to numeric id
labels_index = {'conm': 0, 'conf': 1, 'affm': 2, 'afff': 3}
# (Condition, Gender) pair -> numeric class id
label_codes = {
    ('Control', 'm'): labels_index['conm'],
    ('Control', 'f'): labels_index['conf'],
    ('Treatment', 'm'): labels_index['affm'],
    ('Treatment', 'f'): labels_index['afff'],
}

# Keep only essays that fall into one of the four classes. Without this
# filter a row with a missing or unexpected Gender/Condition would yield a
# text but no label, leaving `labelsCombined` shorter than (and misaligned
# with) `texts`.
has_label = df.Condition.isin(['Control', 'Treatment']) & df.Gender.isin(['m', 'f'])
if not has_label.all():
    print('Dropping %s essays without a usable Condition/Gender label'
          % (~has_label).sum())
df = df[has_label].copy()

df['word_count'] = df.corrected.apply(lambda x: len(x.split()))
print('Found %s total students' % len(df.ID.value_counts()))
print(df.Gender.value_counts())
print(df.Condition.value_counts())
print('mean essay length = %s' % np.mean(df.word_count))
print('sd essay length = %s' % np.std(df.word_count))

labels = []  # list of label ids
texts = df["corrected"].tolist()  # list of text samples
labelsType = df["Condition"].tolist()
labelsRace = df["Gender"].tolist()  # holds Gender values despite the name
# one class id per essay, in the same order as `texts`
labelsCombined = [label_codes[(cond, gender)]
                  for cond, gender in zip(labelsType, labelsRace)]

print('Found %s texts.' % len(texts))

print(df.groupby(['Gender', 'Condition']).size())

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# pad every essay to the length of the longest one (0 if there are none)
maxseqval = max([len(seq) for seq in sequences] + [0])
MAX_SEQUENCE_LENGTH = maxseqval

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# one-hot encode the class ids
label_ids = np.asarray(labelsCombined)
labels = to_categorical(label_ids)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Hold out 15% of the essays as a test set. The raw texts go through the same
# call so that `texts_test` stays row-aligned with x_test / y_test.
x_train, x_test, y_train, y_test, texts_train, texts_test = train_test_split(
    data, labels, texts, test_size=.15)

# split the remaining data into a training set and a validation set
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
data = x_train[indices]
labels = y_train[indices]
# essays in the same shuffled order as `data`; `indices` addresses the
# training partition, so it must index texts_train, not the full `texts` list
textsshuff = [texts_train[i] for i in indices]
VALIDATION_SPLIT = 0.15
nb_val_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary: data[-0:] would select every row if
# nb_val_samples ever came out as 0.
split_at = data.shape[0] - nb_val_samples
x_val = data[split_at:]
y_val = labels[split_at:]
x_train = data[:split_at]
y_train = labels[:split_at]

# inverse vocabulary: integer id -> word
index_word = dict((idx, word) for word, idx in word_index.items())

# the texts written next to x_test / y_test are the essays of the test split
textstest = texts_test
with open('../output/gendercleantimedistmodeltesttexts.txt', 'w') as thefile:
    for item in textstest:
        thefile.write("%s\n" % item)

np.save('../output/gendercleantimedistmodelxtest.npy', x_test)
np.save('../output/gendercleantimedistmodelytest.npy', y_test)

print('Shape of xtrain tensor:', x_train.shape)
print('Shape of ytrain tensor:', y_train.shape)
print('Shape of xval tensor:', x_val.shape)
print('Shape of yval tensor:', y_val.shape)
print('Shape of xtest tensor:', x_test.shape)
print('Shape of ytest tensor:', y_test.shape)

print('Preparing embedding matrix.')
# One row per vocabulary index (plus one), EMBEDDING_DIM columns. Rows start
# as zeros and are overwritten for every word that has a pre-trained vector.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # indices above MAX_NB_WORDS are skipped; words without a pre-trained
    # vector keep their all-zero row
    if i <= MAX_NB_WORDS and embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that trainable=True here, so the embeddings are not kept fixed
embedding_layer = Embedding(
    nb_words + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)





Processing text dataset


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Found 1232 total students
f    3315
m    2909
Name: Gender, dtype: int64
Control      3473
Treatment    2751
Name: Condition, dtype: int64
mean essay length = 42.8125
sd essay length = 23.8684140013
Found 6224 texts.
Gender  Condition
f       Control      1851
        Treatment    1464
m       Control      1622
        Treatment    1287
dtype: int64
Found 5816 unique tokens.
Shape of data tensor: (6224, 208)
Shape of label tensor: (6224, 4)
Shape of xtrain tensor: (4497, 208)
Shape of ytrain tensor: (4497, 4)
Shape of xval tensor: (793, 208)
Shape of yval tensor: (793, 4)
Shape of xtest tensor: (934, 208)
Shape of ytest tensor: (934, 4)
Preparing embedding matrix.


# Define and fit the model

In [5]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import load_model

# Embedding -> dropout -> LSTM(50) -> dropout -> softmax over the classes.
# The embedding starts from the pre-trained matrix and stays trainable.
model = Sequential([
    Embedding(nb_words + 1,
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True, dropout=0.2),
    Dropout(0.5),
    LSTM(50),
    Dropout(0.5),
    Dense(len(labels_index), activation='softmax'),
])
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 208, 300)      1745100     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 208, 300)      0           embedding_2[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 50)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [6]:
# batch size used for fitting
bz = 128
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=[precision, recall, fmeasure, categorical_accuracy])
# 35 epochs, with the validation split evaluated after each epoch
model.fit(x_train, y_train,
          batch_size=bz, nb_epoch=35,
          validation_data=(x_val, y_val))
model.save_weights('../output/model_compfinalgendermodel.h5')

Train on 4497 samples, validate on 793 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


# Adjust model and get predictions

In [7]:
from keras.layers import TimeDistributed
# restore the weights saved after fitting and index the layers by name
model.load_weights('../output/model_compfinalgendermodel.h5')
layer_dict = dict((lyr.name, lyr) for lyr in model.layers)
print(layer_dict.keys())
model.summary()

['dropout_1', 'lstm_1', 'dense_1', 'dropout_2', 'embedding_2']
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 208, 300)      1745100     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 208, 300)      0           embedding_2[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 50)            0           lstm_1[0][0]                     
____________________________

In [8]:
# trained weights of the output layer and of the LSTM, in that order
dens, lstmw = [layer_dict[layer_name].get_weights()
               for layer_name in ('dense_1', 'lstm_1')]

In [9]:
# Same architecture as the fitted model, except that the LSTM returns its
# output at every timestep and the softmax layer is applied per timestep.
embedded = Embedding(nb_words + 1,
                     EMBEDDING_DIM,
                     weights=[embedding_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=True, dropout=0.2)
lstmout = LSTM(50,
               return_sequences=True,
               stateful=False, weights=lstmw)
templayer = TimeDistributed(Dense(len(labels_index), activation='softmax', weights=dens))
model = Sequential([embedded, Dropout(0.5), lstmout, Dropout(0.5), templayer])
model.load_weights('../output/model_compfinalgendermodel.h5')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_3 (Embedding)          (None, 208, 300)      1745100     embedding_input_2[0][0]          
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 208, 300)      0           embedding_3[0][0]                
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 208, 50)       70200       dropout_3[0][0]                  
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 208, 50)       0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [10]:
# per-timestep class probabilities for the held-out test essays
outgendernewviz = model.predict(x_test, verbose=1)
np.save('../output/model_compfinalgenderdict.npy', index_word)
np.save('../output/model_compfinalgenderdictinv.npy', word_index)
np.save('../output/model_compfinalgenderoutput.npy', outgendernewviz)
np.save('../output/model_compfinalgenderxtestdata.npy', x_test)
np.save('../output/model_compfinalgenderytestdata.npy', y_test)
# `with` closes (and therefore flushes) the text file; the bare open() used
# before never closed the handle
with open('../output/model_compfinalgendertextsinput.txt', 'w') as thefile:
    for item in textstest:
        thefile.write("%s\n" % item)



In [11]:
from sklearn.metrics import f1_score
# Score the prediction at the final timestep. Index -1 replaces the
# hard-coded 207, which was only correct while MAX_SEQUENCE_LENGTH == 208.
f1_score(np.argmax(y_test, axis=1), np.argmax(outgendernewviz[:, -1, :], axis=1), average='macro', pos_label=None)

0.60437813906452131

In [106]:
# flute prediction: run one hand-written essay through the per-timestep model
essay = ['They are important because I like listening to music and playing my flute. I like having friends and I also enjoy being funny sometimes.']
test_seq = tokenizer.texts_to_sequences(essay)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
# inverse vocabulary: integer id -> word
index_word = dict((idx, token) for token, idx in word_index.items())
for out_path, payload in [('../output/flute_iw.npy', index_word),
                          ('../output/flute_iw_inv.npy', word_index),
                          ('../output/flute_preds.npy', preds),
                          ('../output/flute_xdat.npy', full_dat)]:
    np.save(out_path, payload)



# Refit the model on the whole dataset

In [12]:
# `data` was last rebound in the preparation cell to the shuffled training
# partition (x_train[indices]), so this is not the shape of the full corpus.
data.shape

(5290, 208)

In [13]:
# rebuild the full padded corpus and its one-hot labels
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labelsCombined))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# a fresh 85/15 split of the whole corpus into training and validation data
x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=.15)

# The copy-pasted block that used to follow rebuilt textsshuff / index_word /
# textstest from the unchanged `indices`, `texts` and `nb_val_samples`, and
# rewrote the gendercleantimedistmodel* files from the unchanged x_test /
# y_test. It reproduced the artifacts the preparation cell had already
# written (through a file handle that was never closed), so it is omitted.

print('Shape of xtrain tensor:', x_train.shape)
print('Shape of ytrain tensor:', y_train.shape)
print('Shape of xval tensor:', x_val.shape)
print('Shape of yval tensor:', y_val.shape)

print('Preparing embedding matrix.')
# One row per vocabulary index (plus one), EMBEDDING_DIM columns. Rows start
# as zeros and are overwritten for every word that has a pre-trained vector.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # indices above MAX_NB_WORDS are skipped; words without a pre-trained
    # vector keep their all-zero row
    if i <= MAX_NB_WORDS and embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that trainable=True here, so the embeddings are not kept fixed
embedding_layer = Embedding(
    nb_words + 1,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)



Shape of data tensor: (6224, 208)
Shape of label tensor: (6224, 4)
Shape of xtrain tensor: (5290, 208)
Shape of ytrain tensor: (5290, 4)
Shape of xval tensor: (934, 208)
Shape of yval tensor: (934, 4)
Preparing embedding matrix.


# Next, we define the model

In [14]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import load_model

# Embedding -> dropout -> LSTM(50) -> dropout -> softmax over the classes.
# The embedding starts from the pre-trained matrix and stays trainable.
model = Sequential([
    Embedding(nb_words + 1,
              EMBEDDING_DIM,
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=True, dropout=0.2),
    Dropout(0.5),
    LSTM(50),
    Dropout(0.5),
    Dense(len(labels_index), activation='softmax'),
])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 208, 300)      1745100     embedding_input_3[0][0]          
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 208, 300)      0           embedding_5[0][0]                
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 50)            70200       dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 50)            0           lstm_3[0][0]                     
___________________________________________________________________________________________

# Fit the model

I'm fitting it to the re-split full dataset here (85% used for training, 15% held out for validation), so as to make predictions on novel sentences.

In [15]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=[precision, recall, fmeasure, categorical_accuracy])
print(model.summary())
# batch size used for fitting
bz = 128
# 35 epochs, with the validation split evaluated after each epoch
model.fit(x_train, y_train,
          batch_size=bz, nb_epoch=35,
          validation_data=(x_val, y_val))
model.save_weights('../output/full_finalgendermodel.h5')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 208, 300)      1745100     embedding_input_3[0][0]          
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 208, 300)      0           embedding_5[0][0]                
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 50)            70200       dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 50)            0           lstm_3[0][0]                     
___________________________________________________________________________________________

# Next, restructure the model to make predictions

In [16]:
from keras.layers import TimeDistributed
# restore the weights saved after fitting and index the layers by name
model.load_weights('../output/full_finalgendermodel.h5')
model.summary()
layer_dict = dict((lyr.name, lyr) for lyr in model.layers)
layer_dict.keys()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 208, 300)      1745100     embedding_input_3[0][0]          
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 208, 300)      0           embedding_5[0][0]                
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 50)            70200       dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 50)            0           lstm_3[0][0]                     
___________________________________________________________________________________________

['embedding_5', 'dense_3', 'lstm_3', 'dropout_5', 'dropout_6']

In [17]:
# trained weights of the output layer and of the LSTM, in that order
dens, lstmw = [layer_dict[layer_name].get_weights()
               for layer_name in ('dense_3', 'lstm_3')]

In [18]:
# Same architecture as the fitted model, except that the LSTM returns its
# output at every timestep and the softmax layer is applied per timestep.
embedded = Embedding(nb_words + 1,
                     EMBEDDING_DIM,
                     weights=[embedding_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=True, dropout=0.2)
lstmout = LSTM(50,
               return_sequences=True,
               stateful=False, weights=lstmw)
templayer = TimeDistributed(Dense(len(labels_index), activation='softmax', weights=dens))
model = Sequential([embedded, Dropout(0.5), lstmout, Dropout(0.5), templayer])
model.load_weights('../output/full_finalgendermodel.h5')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 208, 300)      1745100     embedding_input_4[0][0]          
____________________________________________________________________________________________________
dropout_7 (Dropout)              (None, 208, 300)      0           embedding_6[0][0]                
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 208, 50)       70200       dropout_7[0][0]                  
____________________________________________________________________________________________________
dropout_8 (Dropout)              (None, 208, 50)       0           lstm_4[0][0]                     
___________________________________________________________________________________________

# Here, I make some novel sentences to pass to the network
(Justification with relationships)

In [23]:
import itertools
# sentence stems, one per value
texts = ['Athletic ability is important to me because ', 
         'Art is important to me because ', 
         'Being smart is important to me because ', 
         'Getting good grades is important to me because ', 
         'Creativity is important to me because ', 
         'Independence is important to me because ', 
         'Social groups are important to me because ', 
         'Music is important to me because ', 
         'Politics is important to me because ', 
         'Relationships is important to me because ', 
         'Religion is important to me because ', 
         'Sense of humor is important to me because ', 
         'Living in the moment is important to me because ']

# sentence endings naming a relationship
justification = ['my friends', 'my family', 'my grandparents', 'my parents', 'my teacher', 
                 'my friend', 'my dad', 'my mom', 'my sister', 'my brother', 
                 'my mother', 'my father', 'my cousin', 'my aunt', 'my uncle']

# every (stem, ending) pairing, stems varying slowest
a = list(itertools.product(texts, justification))
# full sentence, first word of the stem, second word of the ending
combos = [stem + ending for stem, ending in a]
value = [stem.split()[0] for stem, ending in a]
just = [ending.split()[1] for stem, ending in a]
df = pd.DataFrame({'text': combos,
                   'value': value,
                   'justification': just})
df.shape

(195, 3)

# Make predictions and save important bits

In [24]:
# encode the generated sentences exactly like the training essays and predict
test_seq = tokenizer.texts_to_sequences(combos)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
# inverse vocabulary: integer id -> word
index_word = dict((idx, token) for token, idx in word_index.items())
for out_path, payload in [('../output/testsentiw_just.npy', index_word),
                          ('../output/testsentiw_inv_just.npy', word_index),
                          ('../output/testsentpreds_just.npy', preds),
                          ('../output/testsentxdat_just.npy', full_dat),
                          ('../output/testdat_just.npy', df)]:
    np.save(out_path, payload)



# Here, I make some novel sentences to pass to the network
(Justification with the self)

In [25]:
# sentence stems, one per value
texts = ['Athletic ability is important to me because ', 
         'Art is important to me because ', 
         'Being smart is important to me because ', 
         'Getting good grades is important to me because ', 
         'Creativity is important to me because ', 
         'Independence is important to me because ', 
         'Social groups are important to me because ', 
         'Music is important to me because ', 
         'Politics is important to me because ', 
         'Relationships is important to me because ', 
         'Religion is important to me because ', 
         'Sense of humor is important to me because ', 
         'Living in the moment is important to me because ']

# sentence endings that refer to the self
justification = ['I want', 'I need', 'I will', 'I have', 'I can', 
                 'I feel', 'I should', 'I would', 'I hope', 'I am', 
                 'I get', 'I might', 'I use', 'I like', 'I take']

# every (stem, ending) pairing, stems varying slowest
a = list(itertools.product(texts, justification))
# full sentence, first word of the stem, second word of the ending
combos = [stem + ending for stem, ending in a]
value = [stem.split()[0] for stem, ending in a]
just = [ending.split()[1] for stem, ending in a]
df = pd.DataFrame({'text': combos,
                   'value': value,
                   'justification': just})

# Make predictions and save important bits

In [26]:
# encode the generated sentences exactly like the training essays and predict
test_seq = tokenizer.texts_to_sequences(combos)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
word_index = tokenizer.word_index
# inverse vocabulary: integer id -> word
index_word = dict((idx, token) for token, idx in word_index.items())
for out_path, payload in [('../output/testsentiw_justself.npy', index_word),
                          ('../output/testsentiw_inv_justself.npy', word_index),
                          ('../output/testsentpreds_justself.npy', preds),
                          ('../output/testsentxdat_justself.npy', full_dat),
                          ('../output/testdat_justself.npy', df)]:
    np.save(out_path, payload)

