In [11]:
import sys
sys.path.append("/Users/g/git/pan17/src_gronup/")
from pipeline import *

emojis = [e for e in emoji.UNICODE_EMOJI]


feats = FeatureUnion([
               ('EmoticonNoses', features.EmoticonNoses()),
               # ('EmoticonReverse', features.EmoticonReverse()),
               # ('EmoticonCount', features.EmoticonCount()),
               # ('EmoticonEmotion', features.EmoticonEmotion()),
               # ('StartsWithCapital', features.StartsWithCapital()),
               # ('EndsWithPunctuation', features.EndsWithPunctuation()),
               # ('AverageWordLength', features.AverageWordLength()),
               # ('AverageSentenceLength', features.AverageSentenceLength()),
               # ('PunctuationByTweet', features.PunctuationByTweet()),
               # ('CapitalizedTokens', features.CapitalizedTokens()),
               # ('CapitalLetters', features.CapitalLetters()),
               ('VocabularyRichness', features.VocabularyRichness()),
               ('wordvec', TfidfVectorizer(ngram_range = (1,3), preprocessor = lambda x: x, tokenizer = identity)),
               ('charvec', TfidfVectorizer(analyzer = 'char', ngram_range = (2,5), preprocessor = lambda x: ' '.join(identity(x)))),
               ('emojivec', TfidfVectorizer(analyzer = 'char', preprocessor = lambda x: ' '.join(identity(x)), vocabulary=emojis)),
               ('FunctionWords', features.FunctionWords(2500))
    ])

In [284]:
# Helper functions for preprocessing raw text before feeding it into a Neural Net
import string
import numpy as np

# Map printable characters to ints and vice-versa
ALPHABET = string.printable

char2int = dict((c, i) for i, c in enumerate(ALPHABET))
int2char = dict((i, c) for i, c in enumerate(ALPHABET))


def c2i(char):
    return char2int[char]

def i2c(num):
    return int2char[num]


def vectorize_text(text):
    """Convert a text into integers"""
    X = []
    text = ''.join(list(filter(lambda x: x in ALPHABET, text)))
    X = [c2i(char) for char in text]
    X = np.array(X)
    X = X / float(len(ALPHABET))
    return X

In [285]:
print(vectorize_text([ALPHABET]).shape)
# print(len(ALPHABET))
# print(ALPHABET[-1])

(100,)


In [1]:
from datasets import load_pan17

gender_data = load_pan17("/data/pan17/pan17-author-profiling-training-dataset-2017-03-10/")

In [2]:
males = gender_data.corpus[(gender_data.corpus["gender"]=="male") & (gender_data.corpus["lang"]=="en")]
females = gender_data.corpus[(gender_data.corpus["gender"]=="female") & (gender_data.corpus["lang"]=="en")]

mtexts = [" " .join(males.iloc[i].text) for i in range(len(males))]
ftexts = [" " .join(females.iloc[i].text) for i in range(len(females))]

In [3]:
from spacy.en import English
nlp = English()

In [4]:
mvecs = [nlp(t).vector for t in mtexts]
fvecs = [nlp(t).vector for t in ftexts]

In [7]:
len(mvecs)

1800

In [92]:
mX = tk.texts_to_sequences(mtexts)

In [93]:
fX = tk.texts_to_sequences(ftexts)

In [102]:
from keras.preprocessing import sequence

max_review_length = 5000
mX = sequence.pad_sequences(mX, maxlen=max_review_length)
fX = sequence.pad_sequences(fX, maxlen=max_review_length)

In [286]:
len(fX)

1800

In [5]:
mX = mvecs
fX = fvecs
pairs = []
labels = []
i = 0
while i < len(mX):
    pairs.append([mX[i], mX[i+1]])
    pairs.append([fX[i], fX[i+1]])
    pairs.append([mX[i], fX[i]])
    pairs.append([mX[i+1], fX[i+1]])
    i += 2
labels = [1,1,0,0] * (int(len(mX)/2))

In [8]:
len(labels)

3600

In [12]:
feats.fit(ftexts + mtexts, [0] * len(ftexts) + [1] * len(mtexts))

In [14]:
mX = feats.transform(mtexts)

In [15]:
fX = feats.transform(ftexts)

In [63]:
mX = mX.todense()
fX = fX.todense()

In [137]:
fX2.shape

(1800, 378307)

In [192]:
mX = mXs
fX = fXs
pairs = []
labels = []
i = 0
while i < mX.shape[0]:
    pairs.append([mX[i], mX[i+1]])
    pairs.append([fX[i], fX[i+1]])
    pairs.append([mX[i], fX[i]])
    pairs.append([mX[i+1], fX[i+1]])
    i += 2
labels = [1,1,0,0] * (int(mX.shape[0]/2))

In [195]:
len(pairs)

3600

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2)

In [10]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Embedding, LSTM, Conv1D, Flatten, MaxPooling1D
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network_blah(input_dim):
    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(100000, embedding_vecor_length, input_length=5000))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model

def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(256, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    seq.add(Dense(256, activation='relu'))
    # seq.add(Dropout(0.1))
    seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dropout(0.1))
    # seq.add(Dense(512, activation='relu'))
    return seq

def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

Using TensorFlow backend.


In [11]:
tr_pairs = np.array(X_train)
te_pairs = np.array(X_test)
tr_y = np.array(y_train)
te_y = np.array(y_test)

In [271]:
tr_pairs.shape

(2880, 1000)

In [205]:
tr_pairs.reshape(2880, 300, 2, 300)

ValueError: cannot reshape array of size 1728000 into shape (2880,300,2,300)

In [50]:
tr_pairs_s = tr_pairs.squeeze()

In [72]:
tr_pairs_s.reshape(3600, 2, mX.shape[1])

ValueError: cannot reshape array of size 3600 into shape (3600,2,378307)

mX.shape

In [73]:
tr_pairs = tr_pairs.reshape(1800, 2, mX.shape[1])
te_pairs = te_pairs.reshape(mX.shape[0], 2, mX.shape[1])

In [191]:
from keras.preprocessing.sequence import pad_sequences
mXs = pad_sequences(mX)
fXs = pad_sequences(fX)

In [292]:
mXs.shape

(1800, 300)

In [12]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(inputs=[input_a, input_b], outputs=distance)

In [283]:
np.array([tr_pairs[:, 0], tr_pairs[:, 1]]).shape

(2, 2880)

In [16]:
# train
from keras.optimizers import Adam

'''tr_pairs1 = tr_pairs[:250]
te_pairs1 = tr_pairs[250:]
tr_y1 = tr_y[:250]
te_y1 = tr_y[250:]

tr_pairs = tr_pairs1
te_pairs = te_pairs1
tr_y = tr_y1
te_y = te_y1
'''
# tr_pairs = np.vstack([tr_pairs, te_pairs])
# print(tr_pairs.shape)

rms = RMSprop()
adam = Adam(decay=0.0001)
model.compile(loss=contrastive_loss, optimizer='adam', metrics=['acc'])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          validation_split=0.2,
          batch_size=30,
          epochs=20)



Train on 2304 samples, validate on 576 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x17bcd8588>

In [None]:
cross_val_score(Xs, ys)

In [17]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 73.78%
* Accuracy on test set: 64.72%


In [18]:
from sklearn.metrics import accuracy_score
print(accuracy_score(te_y, [1 if x < 0.5 else 0 for x in pred]))

0.647222222222


In [506]:
np.vstack([tr_pairs, te_pairs]).shape

(600, 2, 1200)

In [388]:
tr_pairs.shape

(250, 2, 1200)

In [389]:
te_pairs[:,0].shape

(250, 1200)

In [390]:
np.hstack([tr_y,te_y])

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0,

In [240]:

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open("/data/glove/glove.6B.100d.txt")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

Indexing word vectors.
Found 400000 word vectors.
Processing text dataset


In [311]:
texts = ftexts + mtexts # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = ([0] * 1800) + ([1] * 1800)  # list of label ids

In [318]:
MAX_NB_WORDS

20000

In [321]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# x_train = data[:-num_validation_samples]
# y_train = labels[:-num_validation_samples]
# x_val = data[-num_validation_samples:]
# y_val = labels[-num_validation_samples:]



Found 410153 unique tokens.
Shape of data tensor: (3600, 1000)
Shape of label tensor: (7200, 2)
Preparing embedding matrix.


In [324]:
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Preparing embedding matrix.


In [250]:
mX = data[:1800]
fX = data[1800:]
pairs = []
labels = []
i = 0
while i < mX.shape[0]:
    pairs.append([mX[i], mX[i+1]])
    pairs.append([fX[i], fX[i+1]])
    pairs.append([mX[i], fX[i]])
    pairs.append([mX[i+1], fX[i+1]])
    i += 2
labels = [1,1,0,0] * (int(mX.shape[0]/2))

In [266]:
data = np.array(data)
labels = np.array(labels)

In [317]:
len(word_index)

410153

In [322]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [325]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(data, labels, validation_split=0.33,
          epochs=2, batch_size=128)

ValueError: A target array with shape (3600, 2) was passed for an output of shape (None, 0) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [329]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classication of newsgroup messages into 20 different categories).

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open("/data/glove/glove.6B.100d.txt")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = ftexts + mtexts
labels_index = {'female': 0,
                'male': 1}  # dictionary mapping label name to numeric id
labels = [0] * 1800 + [1] * 1800


print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

print('Training model.')



Indexing word vectors.


KeyboardInterrupt: 

In [331]:

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = LSTM(256, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)
# x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)


In [332]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=5,
          validation_data=(x_val, y_val))

Train on 2880 samples, validate on 720 samples
Epoch 1/5

KeyboardInterrupt: 