In [1]:
import pandas as pd
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import numpy as np
import nltk
import random
import logging
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
from nltk.corpus import stopwords
# nltk.download("stopwords")
# nltk.download('punkt')
stop = set(stopwords.words('english'))

In [4]:
def get_words(question) :
    return list(filter(lambda word: word not in stop and word.isalpha(), nltk.word_tokenize(question.lower())))

getLabeledSentencetrain = lambda given_tuple: [
             LabeledSentence(get_words(given_tuple[0]),["trainquestion1_"+str(given_tuple[2])]),  
             LabeledSentence(get_words(given_tuple[1]),["trainquestion2_"+str(given_tuple[2])])]

getLabeledSentencetest = lambda given_tuple: [
             LabeledSentence(get_words(given_tuple[0]),["testquestion1_"+str(given_tuple[2])]),  
             LabeledSentence(get_words(given_tuple[1]),["testquestion2_"+str(given_tuple[2])])]

flatten = lambda l: [item for sublist in l for item in sublist]

In [97]:
traindf = pd.read_csv('train.csv')
traindf['index1'] = traindf.index
traindf = traindf.fillna("")
traindf = traindf[:2000]
traindf.shape

(2000, 7)

In [98]:
train_df_size = traindf.shape[0]

In [99]:
trainSentences = flatten(list(map(lambda x: getLabeledSentencetrain(x) , traindf[['question1', 'question2', 'id']].values)))

In [100]:
sentences = trainSentences
len(sentences)

4000

In [101]:
model = Doc2Vec(min_count=1, window=10, size=500, sample=1e-4, negative=5, workers=8)

In [102]:
model.build_vocab(sentences)

2017-05-02 14:53:20,792 : INFO : collecting all words and their counts
2017-05-02 14:53:20,795 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-05-02 14:53:20,835 : INFO : collected 5784 word types and 4000 unique tags from a corpus of 4000 examples and 21257 words
2017-05-02 14:53:20,838 : INFO : Loading a fresh vocabulary
2017-05-02 14:53:20,865 : INFO : min_count=1 retains 5784 unique words (100% of original 5784, drops 0)
2017-05-02 14:53:20,867 : INFO : min_count=1 leaves 21257 word corpus (100% of original 21257, drops 0)
2017-05-02 14:53:20,908 : INFO : deleting the raw counts dictionary of 5784 items
2017-05-02 14:53:20,910 : INFO : sample=0.0001 downsamples 798 most-common words
2017-05-02 14:53:20,910 : INFO : downsampling leaves estimated 14966 word corpus (70.4% of prior 21257)
2017-05-02 14:53:20,911 : INFO : estimated required memory for 5784 words and 500 dimensions: 34828000 bytes
2017-05-02 14:53:20,933 : INFO : resetting layer weigh

In [103]:
for epoch in range(10):
    random.shuffle(sentences)
    model.train(sentences)

2017-05-02 14:53:21,811 : INFO : training model with 8 workers on 5784 vocabulary and 500 features, using sg=0 hs=0 sample=0.0001 negative=5 window=10
2017-05-02 14:53:21,813 : INFO : expecting 4000 sentences, matching count from corpus used for vocabulary survey
2017-05-02 14:53:22,772 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-05-02 14:53:22,784 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-05-02 14:53:22,788 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-05-02 14:53:22,795 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-05-02 14:53:22,798 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-05-02 14:53:22,962 : INFO : PROGRESS: at 81.11% examples, 67608 words/s, in_qsize 2, out_qsize 1
2017-05-02 14:53:22,963 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-02 14:53:23,045 : INFO : worker thread finished; awaiting finish of 1 more threa

2017-05-02 14:53:29,585 : INFO : expecting 4000 sentences, matching count from corpus used for vocabulary survey
2017-05-02 14:53:30,487 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-05-02 14:53:30,492 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-05-02 14:53:30,512 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-05-02 14:53:30,515 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-05-02 14:53:30,521 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-05-02 14:53:30,664 : INFO : PROGRESS: at 81.18% examples, 71782 words/s, in_qsize 2, out_qsize 1
2017-05-02 14:53:30,665 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-05-02 14:53:30,764 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-05-02 14:53:30,768 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-05-02 14:53:30,768 : INFO : training on 106285 raw w

In [124]:
print(model.docvecs.similarity("trainquestion1_"+str(i), "trainquestion2_"+str(i)))
print(np.dot(model.docvecs["trainquestion1_"+str(i)], model.docvecs["trainquestion1_"+str(i)]))

0.999450647871
0.279517


In [126]:
train_data = []
train_labels = []
for i in range(0, train_df_size):
    sim = [model.docvecs.similarity("trainquestion1_"+str(i), "trainquestion2_"+str(i)), 
           np.average(model.docvecs["trainquestion1_"+str(i)]),
           np.average(model.docvecs["trainquestion2_"+str(i)]),
           np.dot(model.docvecs["trainquestion1_"+str(i)], model.docvecs["trainquestion1_"+str(i)])
         ]
    train_data.append(sim)
    train_labels.append(traindf.iloc[i].is_duplicate)
    
train_data = np.array(train_data)
train_labels = np.array(train_labels)

print(train_data.shape)
print(train_labels.shape)

(2000, 4)
(2000,)


In [194]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding
from keras.utils import np_utils

In [195]:
modelNN = Sequential()
modelNN.add(Embedding(1000, 64, input_length=10))
modelNN.add(Dense(100, input_dim=4, activation='tanh'))
# modelNN.add(Dropout(0.5))
modelNN.add(Dense(64, activation='relu'))
# modelNN.add(Dropout(0.5))
modelNN.add(Dense(1, activation='sigmoid'))

modelNN.compile(loss='mean_squared_logarithmic_error',
              optimizer='rmsprop',
              metrics=['accuracy'])

modelNN.fit(train_data, train_labels,
          epochs=100,
          batch_size=100)
score = modelNN.evaluate(train_data, train_labels, batch_size=100)

ValueError: Error when checking model input: expected embedding_1_input to have shape (None, 10) but got array with shape (2000, 4)

In [189]:
print(score)

[0.10106739476323127, 0.63700000643730159]


In [190]:
def getDuplicate(row):
    id = row['id']
    sim = np.array([[model.docvecs.similarity("trainquestion1_"+str(id), "trainquestion2_"+str(id)), 
           np.average(model.docvecs["trainquestion1_"+str(id)]),
           np.average(model.docvecs["trainquestion2_"+str(id)]),
           np.dot(model.docvecs["trainquestion1_"+str(id)], model.docvecs["trainquestion1_"+str(id)])
         ]])
    return modelNN.predict_classes(sim)[0][0]

mdf2 = traindf[traindf['is_duplicate'] == 1]
mdf2['predict'] = mdf2.apply(lambda row: getDuplicate(row), axis=1 )









A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [191]:
mdf2.predict.value_counts()

0    715
1     27
Name: predict, dtype: int64