In [290]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [291]:
import tensorflow as tf
from tensorflow import keras

In [292]:
from sklearn.model_selection import train_test_split

In [293]:
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Flatten

In [294]:
quests = pd.read_csv('train.csv')

In [295]:
X = quests[[x for x in quests.columns if x != "is_duplicate"]]
y = quests["is_duplicate"]

y

0         0
1         0
2         0
3         0
4         0
         ..
404285    0
404286    1
404287    0
404288    0
404289    0
Name: is_duplicate, Length: 404290, dtype: int64

In [296]:
quests.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Word2Vec implementation

In [297]:
WINDOW_SIZE = 4 # of the sentences in context
EMBEDDING_DIM = 10

In [298]:
corpus_raw = """A king is a man who rules a country, because of inheritance. A king usually comes to power when the previous monarch dies, who is usually a family member of his. Sometimes a person may become king due to the previous monarch's abdication, for example George VI. For most of history, most countries were ruled in this way, especially in Europe. Some countries, such as France, are no longer monarchies. Some, such as the United Kingdom, still have a royal family. In some countries, people chose a new king from other people to decide from.

The wife of a king is called a queen. A woman who becomes a ruler because of inheritance is also called a queen.

If a country has a king or a queen, that means it is a monarchy. A country which a king or queen rules is called a kingdom.

In the Muslim world a King would be known as Malik or Sultan. """

In [299]:
corpus_raw

"A king is a man who rules a country, because of inheritance. A king usually comes to power when the previous monarch dies, who is usually a family member of his. Sometimes a person may become king due to the previous monarch's abdication, for example George VI. For most of history, most countries were ruled in this way, especially in Europe. Some countries, such as France, are no longer monarchies. Some, such as the United Kingdom, still have a royal family. In some countries, people chose a new king from other people to decide from.\n\nThe wife of a king is called a queen. A woman who becomes a ruler because of inheritance is also called a queen.\n\nIf a country has a king or a queen, that means it is a monarchy. A country which a king or queen rules is called a kingdom.\n\nIn the Muslim world a King would be known as Malik or Sultan. "

In [300]:
corpus_raw = corpus_raw.replace("\n"," ")

In [301]:
corpus_raw

"A king is a man who rules a country, because of inheritance. A king usually comes to power when the previous monarch dies, who is usually a family member of his. Sometimes a person may become king due to the previous monarch's abdication, for example George VI. For most of history, most countries were ruled in this way, especially in Europe. Some countries, such as France, are no longer monarchies. Some, such as the United Kingdom, still have a royal family. In some countries, people chose a new king from other people to decide from.  The wife of a king is called a queen. A woman who becomes a ruler because of inheritance is also called a queen.  If a country has a king or a queen, that means it is a monarchy. A country which a king or queen rules is called a kingdom.  In the Muslim world a King would be known as Malik or Sultan. "

In [302]:
corpus_low = corpus_raw.lower()

In [303]:
def clean_word(w, removeables = "'.,!?/*[]{}"):
    temp = w
    for punct in removeables:
        temp = temp.replace(punct,'')
    return temp

In [304]:
words = set()
removeables = '.,!?/*[]{}'
for w in corpus_low.split(' '):
    if w not in '.,!?/*[]{}':
        w = clean_word(w)
        words.add(w.strip(removeables))

## Word indexing get

In [305]:
vocab_size = len(words)
word2int, int2word = {}, {}

In [306]:
for i,w in enumerate(words):
    word2int[w] = i
    int2word[i] = w

In [307]:
word2int

{'it': 0,
 'way': 1,
 'decide': 2,
 'sometimes': 3,
 'especially': 4,
 'royal': 5,
 'inheritance': 6,
 'or': 7,
 'ruler': 8,
 'this': 9,
 'no': 10,
 'still': 11,
 'of': 12,
 'monarchs': 13,
 'queen': 14,
 'europe': 15,
 'from': 16,
 'be': 17,
 'monarchies': 18,
 'countries': 19,
 'monarchy': 20,
 'ruled': 21,
 'have': 22,
 'family': 23,
 'power': 24,
 'that': 25,
 'malik': 26,
 'which': 27,
 'usually': 28,
 'other': 29,
 'also': 30,
 'example': 31,
 'monarch': 32,
 'george': 33,
 'sultan': 34,
 'comes': 35,
 'kingdom': 36,
 'france': 37,
 'longer': 38,
 'may': 39,
 'country': 40,
 'vi': 41,
 'people': 42,
 'a': 43,
 'member': 44,
 'becomes': 45,
 'history': 46,
 'to': 47,
 'world': 48,
 'means': 49,
 'in': 50,
 'due': 51,
 'person': 52,
 'dies': 53,
 'were': 54,
 'known': 55,
 'his': 56,
 'wife': 57,
 'has': 58,
 'such': 59,
 'united': 60,
 'who': 61,
 'man': 62,
 'previous': 63,
 'muslim': 64,
 'as': 65,
 'chose': 66,
 'are': 67,
 'would': 68,
 'woman': 69,
 'abdication': 70,
 'called

## Sentencing get for training data gen

In [308]:
raw_sentences = corpus_low.split('.')

sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())

In [309]:
training_data = []
for sentence in sentences:
    for word_position, word in enumerate(sentence):
        for considered_word in \
        sentence[max(word_position-WINDOW_SIZE,0) : min(word_position+WINDOW_SIZE,len(sentence)) + 1]:
            if considered_word != word:
                training_data.append( [clean_word(word),clean_word(considered_word)] )
            

In [310]:
def to_one_hot(data_word_index, vocab_size):
    vect = np.zeros(vocab_size)
    vect[data_word_index] = 1
    return vect

In [311]:
X_data, y_data = [], []

In [312]:
for data_item in training_data:
    X_data.append( to_one_hot(word2int[data_item[0]], vocab_size) )
    y_data.append( to_one_hot(word2int[data_item[1]], vocab_size) )

In [313]:
X_data = np.asarray(X_data)
y_data = np.asarray(y_data)

In [314]:
X_data[0].shape

(84,)

In [315]:
vocab_size

84

## Building the Model

In [316]:
word2vec_full = Sequential()

# this is the main layer we require
word2vec_full.add(Dense(EMBEDDING_DIM, input_dim=vocab_size, activation='relu'))

# this is the removeable layer
word2vec_full.add(Dense(vocab_size, input_dim=EMBEDDING_DIM, activation='sigmoid')) 

In [317]:
word2vec_full.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [318]:
X_data[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [319]:
word2vec_full.fit(X_data, y_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1684ddcb908>

In [320]:
n = word2vec_full.pop()

In [321]:
word2vec_full.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 10)                850       
Total params: 850
Trainable params: 850
Non-trainable params: 0
_________________________________________________________________


## Building the embedding function

In [322]:
def embed_word(word, vocab_size=vocab_size):
    return word2vec_full.predict(np.asarray([to_one_hot(word2int[word], vocab_size),]))[0]

In [323]:
embed_word('king')

array([1.1322192 , 0.7729502 , 1.029761  , 0.58355415, 0.8797009 ,
       0.59177446, 0.83155787, 0.9597095 , 0.8869908 , 0.596187  ],
      dtype=float32)

In [324]:
embed_word('queen')

array([0.7740394 , 0.63698715, 1.0393512 , 0.80124885, 0.9750515 ,
       0.68160576, 0.8081225 , 0.8289232 , 0.59306896, 0.65237796],
      dtype=float32)

In [325]:
embed_word('man')

array([0.9786841 , 0.74226093, 0.7655614 , 0.5236539 , 0.92473596,
       0.762135  , 0.5328676 , 0.8943528 , 0.5752293 , 0.660254  ],
      dtype=float32)

In [326]:
embed_word('person')

array([0.786098  , 0.61664385, 0.8000004 , 0.9430488 , 0.8171786 ,
       0.8962017 , 0.6125772 , 0.9324915 , 0.61085266, 0.8672253 ],
      dtype=float32)