In [2]:
from nltk.corpus import stopwords
import re
import pandas as pd
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [21]:
MAX_SEQUENCE_LENGTH = 15
MAX_VOCAB_SIZE = 50000
EMBEDDING_DIM = 50


word2vec = {}
with open('glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

stop = stopwords.words('english')

def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

def remove_stop_words(text):
    return " ".join([item.lower() for item in text.split() if item not in stop])

def remove_non_ascii(text):
    return ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text])

def process_train_data(df, filename):
    
    df = df[df.gold_label != "-"]
    df['gold_label'] = df['gold_label'].map({val: i for i, val in enumerate(df['gold_label'].unique())})
    df['sentence1'] = df['sentence1'].apply(remove_non_ascii)
    df['sentence1'] = df['sentence1'].apply(normalize_text)
    df['sentence1'] = df['sentence1'].apply(remove_stop_words)
    df['sentence1'] = df['sentence1'].str.replace('[^\w\s]','')

    df['sentence2'] = df['sentence2'].apply(remove_non_ascii)
    df['sentence2'] = df['sentence2'].apply(normalize_text)
    df['sentence2'] = df['sentence2'].apply(remove_stop_words)
    df['sentence2'] = df['sentence2'].str.replace('[^\w\s]','')
    texts = df['sentence1'].values + df['sentence2'].values
    
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    
    tokenizer.fit_on_texts(texts)

    word2idx = tokenizer.word_index
    
    sentence1 = tokenizer.texts_to_sequences(df['sentence1'])
    sentence2 = tokenizer.texts_to_sequences(df['sentence2'])
    
    sentence1 = pad_sequences(sentence1, maxlen=MAX_SEQUENCE_LENGTH)
    sentence2 = pad_sequences(sentence2, maxlen=MAX_SEQUENCE_LENGTH)
    
    num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word2idx.items():
          if i < MAX_VOCAB_SIZE:
            embedding_vector = word2vec.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    
    sentence1_embeddings = []
    for sentence in sentence1:
        temp = []
        for val in sentence:
            temp.append(embedding_matrix[val])
        sentence1_embeddings.append(np.asarray(temp))
    sentence1_embeddings = np.asarray(sentence1_embeddings)
    
    sentence2_embeddings = []
    for sentence in sentence2:
        temp = []
        for val in sentence:
            temp.append(embedding_matrix[val])
        sentence2_embeddings.append(np.asarray(temp))
    sentence2_embeddings = np.asarray(sentence2_embeddings)
    
    with open('sentence1_embedd_%s'%filename,'w') as outfile:
        np.save(outfile, sentence1_embeddings.reshape(sentence1_embeddings.shape[0],-1))
        
    with open('sentence2_embedd_%s'%filename,'w') as outfile:
        np.save(outfile, sentence2_embeddings.reshape(sentence1_embeddings.shape[0],-1))
        
    labels = np.asarray(df['gold_label'])
    labels.astype(np.float32)
    
    true_label = np.zeros((labels.shape[0], 3))
    true_label[np.arange(labels.shape[0]), labels] = 1

    with open('label_%s'%filename, 'w') as outfile:
        np.save(outfile, true_label)
        
    return tokenizer, embedding_matrix
        
        
def process_test_data(df, filename, tokenizer, embedding_matrix):
    
    df = df[df.gold_label != "-"]
    df['gold_label'] = df['gold_label'].map({val: i for i, val in enumerate(df['gold_label'].unique())})
    df['sentence1'] = df['sentence1'].apply(remove_non_ascii)
    df['sentence1'] = df['sentence1'].apply(normalize_text)
    df['sentence1'] = df['sentence1'].apply(remove_stop_words)
    df['sentence1'] = df['sentence1'].str.replace('[^\w\s]','')

    df['sentence2'] = df['sentence2'].apply(remove_non_ascii)
    df['sentence2'] = df['sentence2'].apply(normalize_text)
    df['sentence2'] = df['sentence2'].apply(remove_stop_words)
    df['sentence2'] = df['sentence2'].str.replace('[^\w\s]','')
    
    ## tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    # texts = df['sentence1'].values + df['sentence2'].values
    # tokenizer.fit_on_texts(texts)

    # word2idx = tokenizer.word_index
    
    sentence1 = tokenizer.texts_to_sequences(df['sentence1'])
    sentence2 = tokenizer.texts_to_sequences(df['sentence2'])
    
    sentence1 = pad_sequences(sentence1, maxlen=MAX_SEQUENCE_LENGTH)
    sentence2 = pad_sequences(sentence2, maxlen=MAX_SEQUENCE_LENGTH)
    
    """
    num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word2idx.items():
          if i < MAX_VOCAB_SIZE:
            embedding_vector = word2vec.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    """
    
    sentence1_embeddings = []
    for sentence in sentence1:
        temp = []
        for val in sentence:
            temp.append(embedding_matrix[val])
        sentence1_embeddings.append(np.asarray(temp))
    sentence1_embeddings = np.asarray(sentence1_embeddings)
    
    sentence2_embeddings = []
    for sentence in sentence2:
        temp = []
        for val in sentence:
            temp.append(embedding_matrix[val])
        sentence2_embeddings.append(np.asarray(temp))
    sentence2_embeddings = np.asarray(sentence2_embeddings)
    
    with open('sentence1_embedd_%s'%filename,'w') as outfile:
        np.save(outfile, sentence1_embeddings.reshape(sentence1_embeddings.shape[0],-1))
        
    with open('sentence2_embedd_%s'%filename,'w') as outfile:
        np.save(outfile, sentence2_embeddings.reshape(sentence1_embeddings.shape[0],-1))
        
    labels = np.asarray(df['gold_label'])
    labels.astype(np.float32)
    
    true_label = np.zeros((labels.shape[0], 3))
    true_label[np.arange(labels.shape[0]), labels] = 1
    print(true_label.shape)
    with open('label_%s'%filename, 'w') as outfile:
       np.save(outfile, true_label)

In [22]:
filename = 'train'
with open(filename) as f:
    data = pd.DataFrame(json.loads(line) for line in f)
    df = data[['gold_label', 'sentence1', 'sentence2']]
tokenizer, embedding_matrix = process_train_data(df, filename)

In [23]:
filename = 'test'
with open(filename) as f:
    data = pd.DataFrame(json.loads(line) for line in f)
    df = data[['gold_label', 'sentence1', 'sentence2']]
process_test_data(df, filename, tokenizer, embedding_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

(990, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
df['gold_label'].unique()

array([u'neutral', u'entailment', u'contradiction', u'-'], dtype=object)

In [16]:
{val: i for i, val in enumerate(df['gold_label'].unique())}

{u'-': 3, u'contradiction': 2, u'entailment': 1, u'neutral': 0}