In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)
__docformat__ = 'restructedtext en'
import timeit
import numpy
import scipy.io

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys
BASE_DIR = '/Users/triddle/Documents/Gits'
GLOVE_DIR = BASE_DIR + '/Data/'
#MAX_SEQUENCE_LENGTH = 210
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))



Using Theano backend.


Indexing word vectors.
Found 2196016 word vectors.


In [2]:
# for custom metrics
import keras.backend as K
from keras.utils.generic_utils import get_from_module

def categorical_accuracy(y_true, y_pred):
    """Categorical accuracy metric.

    Computes the mean accuracy rate across all predictions for
    multiclass classification problems.
    """
    return K.mean(K.equal(K.argmax(y_true, axis=-1),
                          K.argmax(y_pred, axis=-1)))


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall.

    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can be
    classified as sets of labels. By only using accuracy (precision) a model
    would achieve a perfect score by simply assigning every class to every
    input. In order to avoid this, a metric should penalize incorrect class
    assignments as well (recall). The F-beta score (ranged from 0.0 to 1.0)
    computes this, as a weighted mean of the proportion of correct class
    assignments vs. the proportion of incorrect class assignments.

    With beta = 1, this is equivalent to a F-measure. With beta < 1, assigning
    correct classes becomes more important, and with beta > 1 the metric is
    instead weighted towards penalizing incorrect class assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score


def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.

    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)


In [3]:
def create_dat(df_ess, df_dem):
    df_ess = df_ess[df_ess.Study=='Connecticut']
    df_ess.Condition.replace(['c', 'c2', 'c1', 'c3', 'ca', 'cb', '3'], 'Control', inplace=True)
    df_ess.Condition.replace(['t', 't2', 't3', 't1', '1', '2', 'ta', 'tb'], 'Treatment', inplace=True)
    df_ess.Condition.replace(['c/t'], np.nan, inplace=True)
    
    df_dem = df_dem[df_dem.Study=='Connecticut']
    df_dem.Ethnicity.replace('Asian', 'Asian American', inplace=True)
    df_dem.Ethnicity.replace('Other/Mixed', 'Other', inplace=True)
    df_dem = df_dem[['ID', 'Ethnicity', 'Gender']].dropna()
    
    outdat = pd.merge(df_ess[['ID', 'Intervention_number', 'Essay', 'Condition', 'Intervention_Date', 'corrected']], 
                    df_dem, how='left', on='ID').drop_duplicates()
    
    return(outdat)

In [4]:
import pandas as pd
import matplotlib
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib #only needed to determine Matplotlib version number
import numpy as np
# second, prepare text samples and their labels
print('Processing text dataset')
df_ess = pd.read_csv('../../Data/3 CSV Files/essays1.23.16.csv', sep='|')
df_dem = pd.read_csv('../../Data/3 CSV Files/demog3.2.16.csv')
df = create_dat(df_ess, df_dem)
df.dropna(axis=0, subset=['corrected', 'Condition'], inplace=True)
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
labels_index['conm']=0
labels_index['conf']=1
labels_index['affm']=2
labels_index['afff']=3
texts = df["corrected"].tolist()
labelsType = df["Condition"].tolist()
labelsRace = df["Gender"].tolist()
labelsCombined=[]
for i in range(len(labelsType)):
    if labelsType[i]=='Control' and labelsRace[i]=='m':
        labelsCombined.append(0)
    elif labelsType[i]=='Control' and labelsRace[i]=='f':
        labelsCombined.append(1)
    elif labelsType[i]=='Treatment' and labelsRace[i]=='m':
        labelsCombined.append(2)
    elif labelsType[i]=='Treatment' and labelsRace[i]=='f':
        labelsCombined.append(3)
        
print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

maxseqval=0
for i in range(len(sequences)):
    if len(sequences[i])>maxseqval:
        maxseqval=len(sequences[i])
        
MAX_SEQUENCE_LENGTH = maxseqval

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labelsCombined))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data
y_train = labels

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
VALIDATION_SPLIT = 0.15
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

textstest=[]
textsshuff=[]
for i in indices:
  textsshuff.append(texts[i])
index_word={}
for i,x in word_index.items():
    index_word[x]=i
textstest=textsshuff[-nb_validation_samples:-nb_validation_samples]

thefile = open('gendercleantimedistmodeltesttexts.txt', 'w')
for item in textstest:
  thefile.write("%s\n" % item)  

#np.save('gendercleantimedistmodelxtest.npy', x_test)
#np.save('gendercleantimedistmodelytest.npy', y_test)

print('Shape of xtrain tensor:', x_train.shape)
print('Shape of ytrain tensor:', y_train.shape)
print('Shape of xval tensor:', x_val.shape)
print('Shape of yval tensor:', y_val.shape)
#print('Shape of xtest tensor:', x_test.shape)
#print('Shape of ytest tensor:', y_test.shape)

print('Preparing embedding matrix.')
# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)



Processing text dataset


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Found 6224 texts.
Found 5816 unique tokens.
Shape of data tensor: (6224, 208)
Shape of label tensor: (6224, 4)
Shape of xtrain tensor: (5291, 208)
Shape of ytrain tensor: (5291, 4)
Shape of xval tensor: (933, 208)
Shape of yval tensor: (933, 4)
Preparing embedding matrix.


In [5]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import load_model

model = Sequential()
model.add(Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True,dropout=0.2))
model.add(Dropout(0.5))
model.add(LSTM(50))
model.add(Dropout(0.5))
model.add(Dense(len(labels_index), activation='softmax'))
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 208, 300)      1745100     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 208, 300)      0           embedding_2[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 50)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [75]:
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=[precision,recall,fmeasure,categorical_accuracy])
#print(model.summary())
#bz=128
#model.fit(x_train, y_train, validation_data=(x_val, y_val),
#          nb_epoch=35, batch_size=bz)
#model.save_weights('finalgendermodel.h5')

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 208, 300)      1745100     embedding_input_2[0][0]          
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 208, 300)      0           embedding_4[0][0]                
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 50)            70200       dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 50)            0           lstm_4[0][0]                     
___________________________________________________________________________________________

In [6]:
from keras.layers import TimeDistributed
model.load_weights('finalgendermodel.h5')
model.summary()
layer_dict = dict([(layer.name, layer) for layer in model.layers])
layer_dict.keys()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 208, 300)      1745100     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 208, 300)      0           embedding_2[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 50)            70200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 50)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

['dropout_1', 'lstm_1', 'dense_1', 'dropout_2', 'embedding_2']

In [7]:
#layer_dict['lstm_2'].get_weights()
dens=layer_dict['dense_1'].get_weights()
lstmw=layer_dict['lstm_1'].get_weights()
model.pop()
model.pop()
model.pop()
lstmout = LSTM(50,
               return_sequences=True,
               stateful=False,weights=lstmw)
model.add(lstmout)
model.add(Dropout(0.5))
templayer=TimeDistributed(Dense(len(labels_index), activation='softmax',weights=dens))
model.add(templayer)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 208, 300)      1745100     embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 208, 300)      0           embedding_2[0][0]                
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 208, 50)       70200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 208, 50)       0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [112]:
#pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
#sequences = tokenizer.texts_to_sequences(texts)
#pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
testdat = ['These values might', 'These values are', 'These values will', 'Sports']
test_seq = tokenizer.texts_to_sequences(testdat)
full_dat = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH)
preds = model.predict(full_dat, verbose=1)
preds[4]



array([[ 0.25946808,  0.2427309 ,  0.26948729,  0.22831374],
       [ 0.2520169 ,  0.2447765 ,  0.28034008,  0.22286652],
       [ 0.24652748,  0.24267758,  0.29390815,  0.2168868 ],
       [ 0.24286158,  0.23721558,  0.30969056,  0.21023227],
       [ 0.24073376,  0.22947262,  0.32675067,  0.20304295],
       [ 0.23983349,  0.22049777,  0.34406051,  0.19560823],
       [ 0.23988432,  0.21115462,  0.36070469,  0.18825635],
       [ 0.24066298,  0.20207317,  0.37598497,  0.1812789 ],
       [ 0.24199603,  0.19365996,  0.38945323,  0.17489077],
       [ 0.24374832,  0.18613701,  0.40089476,  0.16921991],
       [ 0.24581099,  0.17958942,  0.41028357,  0.16431604],
       [ 0.24809258,  0.17400992,  0.41772857,  0.16016893],
       [ 0.25051424,  0.1693352 ,  0.42342219,  0.1567284 ],
       [ 0.25300723,  0.16547269,  0.42759848,  0.1539216 ],
       [ 0.25551242,  0.16231835,  0.43050307,  0.15166618],
       [ 0.25797984,  0.15976788,  0.43237305,  0.14987922],
       [ 0.26036873,  0.

In [8]:
import spacy
nlp = spacy.load('en')


In [18]:
df = pd.read_csv('../../Text_dictionaries/LIWC/Social.csv', header=None, names=['word'], encoding='UTF-8')

In [43]:
#df[df.word.str.contains('\*')]
df['pos'] = df['word'].apply(lambda x: nlp(x)[0].pos_)
df['lemma'] = df['word'].apply(lambda x: nlp(x)[0].lemma_)
df.head(10)
#doc = nlp(df.word.loc[0])
#doc[0].pos_
#df[df.pos=='NOUN']['sent'] = 'These values are important because my'
#df[df.pos=='VERB']['sent'] = 'These values'
df[df.pos=='PRON'] = 'These values are importnat '
#ADJ
#INTJ
#ADV

Unnamed: 0,word,pos,lemma
169,he,PRON,he
174,hed,PRON,-PRON-
175,he'd,PRON,-PRON-
176,he'll,PRON,-PRON-
185,herself,PRON,herself
186,hes,PRON,-PRON-
187,he's,PRON,-PRON-
190,him,PRON,him
191,himself,PRON,himself
287,ourselves,PRON,ourselves


In [None]:
outgendernewviz=model.predict(x_test,verbose=1)
np.save('finalgenderdict.npy', index_word) 
np.save('finalgenderdictinv.npy', word_index) 
np.save('finalgenderoutput.npy', outgendernewviz) 
np.save('finalgenderxtestdata.npy', x_test) 
np.save('finalgenderytestdata.npy', y_test) 
thefile = open('finalgendertextsinput.txt', 'w')
for item in textstest:
  thefile.write("%s\n" % item) 