In [3]:
import xmltodict
import logging
import glob
import os
import pandas as pd
import sys
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning)



# Main dataset loading utility
class PanDataLoader:
    
    def __init__(self, logger=None):
        if logger is None:
            logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            self.log = logging.getLogger(__name__)
        else:
            self.log = logger
                               
    def load_17(self, directory):
            
        """Load and return the pan17 gender and variation twitter dataset.
        ==============                                      ==============
        Samples total                                                10800
        Targets            nominal [{male, female},
                                    {ar, pt, es, en},
                                    {'brazil', 'australia', 'venezuela',
                                     'portugal', 'great britain', 'chile',
                                     'levantine', 'egypt', 'colombia',
                                     'peru', 'ireland', 'argentina',
                                     'maghrebi', 'mexico', 'new zealand',
                                     'spain', 'canada', 'gulf'}]
        ==============                                      ==============
        Parameters
        ----------
        inputdir
        The directory containing the training data, i.e. /data/training.

        Returns
        -------
        data : Pandas dataframe
            The interesting attributes are:
            'text', the data to learn, ['gender','lang', variety],
            the regression targets,
        Examples
        --------
        >>> from datasets import load_pan17
        >>> df_training = load_pan17(inputdir)
        >>> print(df_training.corpus.shape)
        (10800, 5)
        """

        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        # check that the dataset is loaded correctly

        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['documents']['document']
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'variety'],
                             engine='python')

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def load_16(self, directory):
        return self.load_14(directory)
    
    def load_15(self, directory):
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                doc = xmltodict.parse(f.read())
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = doc['author']['document']
            # print(author, lang, text[:100])
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age', '1','2','3','4', '5'],
                             engine='python') 


        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    
    def load_14(self, directory):
        errors = 0
        X_docs = glob.glob(os.path.join(directory, '*.xml'), recursive=True)
        Y_doc = os.path.join(directory, 'truth.txt')
        X_tmp = []
        for t in X_docs:
            with open(t) as f:
                try:
                    doc = xmltodict.parse(f.read())
                except Exception as e:
                    self.log.warning(e)
                    self.log.warning("Skipping: {}".format(t))
                    continue
            author = os.path.splitext(os.path.basename(t))[0]
            lang = doc['author']['@lang']
            text = []
            for td in doc['author']['documents']['document']:
                try:
                    t = BeautifulSoup(td['#text'], "lxml").getText()
                    text.append(t)
                except Exception as e:
                    errors += 1
                    # log.warning(e)
                    # self.log.warning("skipping {}".format(td))
                    continue
            X_tmp.append((author, lang, text))

        text = pd.DataFrame(X_tmp, columns=["author", "lang", "text"])

        Y_tmp = pd.read_csv(Y_doc,
                             sep='\:\:\:',
                             names=['author', 'gender', 'age'],
                             engine='python') 

        self.log.warning("Skipped {}".format(errors))

        corpus = pd.merge(text, Y_tmp, on='author')
        return corpus
    
    def _load_all(self, loader_func, directories):
        """Concatenate across languages"""
        corpora = []
        for dr in directories:
            corpus = loader_func(dr)
            corpora.append(corpus)
        return pd.concat(corpora)
    
    def load_all_17(self, directories):
        return self._load_all(self.load_17, directories)
    
    def load_all_16(self, directories):
        return self._load_all(self.load_16, directories)
    
    def load_all_15(self, directories):
        return self._load_all(self.load_15, directories)
    
    def load_all_14(self, directories):
        return self._load_all(self.load_14, directories)
    
    def clean_and_normalize(self, corpus):
        """Standardize to lowercase for gender and langauge, m/f for gender
           Remove personality scores"""
        # FIXME TODO -- how do you do this in place?
        # FIXME TODO -- normalize age ranges?
        corpus['gender'] = corpus['gender'].apply(lambda s: s[0].lower())
        corpus['lang'] = corpus['lang'].apply(lambda s: s.lower())

        for c in ['1', '2', '3', '4', '5']:
            if c in corpus:
                del corpus[c]
        return corpus


In [4]:
pdl = PanDataLoader()

gender_data = pdl.load_17("/data/pan17/pan17-author-profiling-training-dataset-2017-03-10/en")

In [5]:
males = gender_data[(gender_data["gender"]=="male") & (gender_data["lang"]=="en")]
females = gender_data[(gender_data["gender"]=="female") & (gender_data["lang"]=="en")]

mtexts = [" " .join(males.iloc[i].text) for i in range(len(males))]
ftexts = [" " .join(females.iloc[i].text) for i in range(len(females))]

In [11]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
mtok = tokenizer.tokenize(mtexts)


max_review_length = 5000
mX = sequence.pad_sequences(mtok, maxlen=max_review_length)
fX = sequence.pad_sequences(ftok, maxlen=max_review_length)

AttributeError: 'Tokenizer' object has no attribute 'tokenize'

In [118]:
len(fX)

1800

In [103]:
pairs = []
labels = []
i = 0
while i < len(mX):
    pairs.append([mX[i], mX[i+1]])
    pairs.append([fX[i], fX[i+1]])
    pairs.append([mX[i], fX[i]])
    pairs.append([mX[i+1], fX[i+1]])
    i += 2
labels = [1,1,0,0] * (int(len(mX)/2))

In [104]:
X_train, X_test, y_train, y_test = train_test_split(pairs, labels)

In [132]:
print(X_train[500][0][-100:])

[    9   132  2151    15   151    56    35    24   219     4   189    19
    16    44     8  1008 42712 21060 25282    22   453  4943   535     5
  2615    17    53    96  1855    39   263  2299   610   331   102 31659
   134  3778  2954  8072     9  4066   302     4   369     9     4    85
   538   112     5     4  1140   521     7   134  1872    18  2050  3684
   286  5455    50    13    96    20   237     6  7238    67    32   112
     4   144   359     4   805   868    14  7202   107   150 17183    46
   240    88     5 32392    15    57   569   108     4   189   224    17
    12     4   948   102]


In [12]:
feats.fit(ftexts + mtexts, [0] * len(ftexts) + [1] * len(mtexts))

In [14]:
mX = feats.transform(mtexts)

In [15]:
fX = feats.transform(ftexts)

In [63]:
mX = mX.todense()
fX = fX.todense()

In [137]:
fX2.shape

(1800, 378307)

In [65]:
pairs = []
labels = []
i = 0
while i < mX.shape[0]:
    pairs.append([mX[i], mX[i+1]])
    pairs.append([fX[i], fX[i+1]])
    pairs.append([mX[i], fX[i]])
    pairs.append([mX[i+1], fX[i+1]])
    i += 2
labels = [1,1,0,0] * (int(mX.shape[0]/2))

In [1]:
len(y_train)

NameError: name 'y_train' is not defined

In [67]:
from sklearn.model_selection import train_test_split

tr_pairs, te_pairs, tr_y, te_y = train_test_split(pairs, labels, test_size=0.5)

In [1]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

import random
from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Embedding, LSTM
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_pairs(x, digit_indices):
    '''Positive and negative pair creation.
    Alternates between positive and negative pairs.
    '''
    pairs = []
    labels = []
    n = min([len(digit_indices[d]) for d in range(10)]) - 1
    for d in range(10):
        for i in range(n):
            z1, z2 = digit_indices[d][i], digit_indices[d][i + 1]
            pairs += [[x[z1], x[z2]]]
            inc = random.randrange(1, 10)
            dn = (d + inc) % 10
            z1, z2 = digit_indices[d][i], digit_indices[dn][i]
            pairs += [[x[z1], x[z2]]]
            labels += [1, 0]
    return np.array(pairs), np.array(labels)


def create_base_network(input_dim):
    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(100000, embedding_vecor_length, input_length=5000))
    model.add(LSTM(128))
    model.add(Dense(128))

    return model

def create_base_network_old(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    seq = Sequential()
    seq.add(Dense(128, input_shape=(input_dim,), activation='relu'))
    seq.add(Dense(128, activation='relu'))
    # seq.add(Dropout(0.1))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dense(512, activation='relu'))
    # seq.add(Dropout(0.1))
    # seq.add(Dense(512, activation='relu'))
    return seq


def compute_accuracy(predictions, labels):
    return np.mean(np.equal(predictions.ravel() < 0.5, labels))

Using TensorFlow backend.


In [2]:
tr_pairs = np.array(X_train)
te_pairs = np.array(X_test)
tr_y = np.array(y_train)
te_y = np.array(y_test)

NameError: name 'X_train' is not defined

In [141]:
te_pairs.shape

(900, 2, 5000)

In [50]:
tr_pairs_s = tr_pairs.squeeze()

In [72]:
tr_pairs_s.reshape(3600, 2, mX.shape[1])

ValueError: cannot reshape array of size 3600 into shape (3600,2,378307)

mX.shape

In [73]:
tr_pairs = tr_pairs.reshape(1800, 2, mX.shape[1])
te_pairs = te_pairs.reshape(mX.shape[0], 2, mX.shape[1])

In [138]:
X_train.shape

AttributeError: 'list' object has no attribute 'shape'

In [None]:
input_dim = tr_pairs.shape[-1]

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=(input_dim,))
input_b = Input(shape=(input_dim,))

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model(input=[input_a, input_b], output=distance)

In [None]:
# train

'''tr_pairs1 = tr_pairs[:250]
te_pairs1 = tr_pairs[250:]
tr_y1 = tr_y[:250]
te_y1 = tr_y[250:]

tr_pairs = tr_pairs1
te_pairs = te_pairs1
tr_y = tr_y1
te_y = te_y1
'''
# tr_pairs = np.vstack([tr_pairs, te_pairs])
# print(tr_pairs.shape)

rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer='adam', metrics=['acc'])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          validation_split=0.9,
          batch_size=240,
          epochs=1)



Train on 269 samples, validate on 2431 samples
Epoch 1/1


In [81]:
# compute final accuracy on training and test sets
pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(pred, tr_y)
pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(pred, te_y)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

* Accuracy on training set: 50.17%
* Accuracy on test set: 49.83%


In [52]:
from sklearn.metrics import accuracy_score
print(accuracy_score(te_y, [1 if x < 0.5 else 0 for x in pred]))

0.387333333333


In [506]:
np.vstack([tr_pairs, te_pairs]).shape

(600, 2, 1200)

In [388]:
tr_pairs.shape

(250, 2, 1200)

In [389]:
te_pairs[:,0].shape

(250, 1200)

In [390]:
np.hstack([tr_y,te_y])

array([1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0,