In [32]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from nltk.corpus import brown
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re

import codecs
import csv
from gensim.models import KeyedVectors

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential


In [2]:
# Eenmalig downloaden
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('brown')

# Voorkomen onterechte warnings
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Renée\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Renée\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Renée\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
# Functie voor lemmatizing
"""
"Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language."
Lemmatization is waarschijnlijk beter dan stemming in ons project omdat de betekenis van de woorden erg belangrijk is 
"""

def cleanSentence(text):
    
    text = str(text).lower()
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e ?-? ?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # tokenize
    words=text.split()
    
    # lemmatize
    wordnet_lemmatizer = WordNetLemmatizer()
    lem_sentence=[]
    for word in words:   
        lem_sentence.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        lem_sentence.append(" ")
    
    return "".join(lem_sentence)

In [4]:
# Toepassen van lemmatization functie op Train data

train_data = pd.read_csv('train_data.csv')

train_data['q1_clean'] = train_data['question1'].apply(cleanSentence)
train_data['q2_clean'] = train_data['question2'].apply(cleanSentence)


In [5]:
test_data = pd.read_csv('test_data.csv')

test_data['q1_clean'] = test_data['question1'].apply(cleanSentence)
test_data['q2_clean'] = test_data['question2'].apply(cleanSentence)

#### Word index

In [20]:
# The Tokenizer stores everything in the word_index during fit_on_texts
tokenizer = Tokenizer(num_words=200000)

# Omzitten naar list om alle woorden in de fit_on_texts functie te krijgen
train_data_q1 = train_data['q1_clean'].tolist()
train_data_q2 = train_data['q2_clean'].tolist()
test_data_q1 = test_data['q1_clean'].tolist()
test_data_q2 = test_data['q2_clean'].tolist()

tokenizer.fit_on_texts(train_data_q1+
                       train_data_q2+
                       test_data_q1+
                       test_data_q2)

word_index = tokenizer.word_index
print('%s unique words' % len(word_index))

77325 unique words


#### Embedding matrix

In [21]:
# Locatie embedding file (misschien nog andere gebruiken dan deze)
EMB_FILE = 'C:/Renee/aml2018/GoogleNews-vectors-negative300.bin'

word2vec = KeyedVectors.load_word2vec_format(EMB_FILE, binary=True)
print('%s word vectors' % len(word2vec.vocab))


3000000 word vectors


In [22]:
MAX_WORDS = 250000
EMB_DIM = 300

In [23]:
# Embedding matrix maken (300 dimension vectoren voor alle woorden uit de index)
n_words = min(MAX_WORDS, len(word_index))+1

embedding_matrix = np.zeros((n_words, EMB_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
        
print('Words without embedding: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Words without embedding: 37254


In [25]:
embedding_matrix.shape

(77326, 300)

#### Sequences

In [26]:
LENGTH_SEQUENCE = 25

In [27]:
# Train data to sequences

train_seq_1 = tokenizer.texts_to_sequences(train_data['q1_clean'])
train_seq_2 = tokenizer.texts_to_sequences(train_data['q2_clean'])

# Padding in sequences when sentences smaller than 25 words
train_1 = pad_sequences(train_seq_1, maxlen=LENGTH_SEQUENCE)
train_2 = pad_sequences(train_seq_2, maxlen=LENGTH_SEQUENCE)

print('Shape of tensor:', train_1.shape)

Shape of tensor: (404290, 25)


In [28]:
# Test data to sequences

test_seq_1 = tokenizer.texts_to_sequences(test_data['q1_clean'])
test_seq_2 = tokenizer.texts_to_sequences(test_data['q2_clean'])

# Padding in sequences when sentences smaller than 25 words
test_1 = pad_sequences(test_seq_1, maxlen=LENGTH_SEQUENCE)
test_2 = pad_sequences(test_seq_2, maxlen=LENGTH_SEQUENCE)

print('Shape of tensor:', test_1.shape)

Shape of tensor: (81126, 25)


#### Labels

In [30]:
# Labels naar numpy array
labels = train_data['is_duplicate']
labels = labels.as_matrix()

print('Shape of label tensor:', labels.shape)

Shape of label tensor: (404290,)


  This is separate from the ipykernel package so we can avoid doing imports until


#### Validation split

In [None]:
# Make a validation split of 10%

train_1 = 
train_2 =

val_1 = 
val_2 = 


# Inclusief labels
train_lab =
val_lab =

#### Model structure

In [None]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

z1 = Input(shape=(1,), dtype='float32')

a1 = Input(shape=(1,), dtype='float32')
b1 = Input(shape=(1,), dtype='float32')

merged = concatenate([x1, y1, z1, a1, b1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)


In [37]:
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(n_words, EMB_DIM,input_length = LENGTH_SEQUENCE, trainable=False))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 25, 300)           23197800  
_________________________________________________________________
lstm_4 (LSTM)                (None, 200)               400800    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 402       
Total params: 23,599,002
Trainable params: 401,202
Non-trainable params: 23,197,800
_________________________________________________________________
None


#### Train model

In [None]:
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([train_1, train_2], train_lab, \
        validation_data=([val_1, val_2], val_lab), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

#### Submission