### Quora Question Pairs 

Can you identify question pairs that have the same intent?


###### Import libraries

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from nltk.corpus import brown
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [23]:
# Eenmalig downloaden

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('brown')

# Voorkomen onterechte warnings
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wiets\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wiets\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\wiets\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


###### Stemming

In [4]:
# Porter of Lancaster stemmers - tweede is 'aggressiever' dan de eerste.
porter = PorterStemmer()
lancaster=LancasterStemmer()

In [5]:
# Functie voor stemming (met Porter)

def stemSentence(sentence):
   
    # tokenize
    token_words=word_tokenize(sentence)
    
    # stemming
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

###### Lemmatization

In [7]:
# Functie voor lemmatizing
"""
"Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language."
Lemmatization is waarschijnlijk beter dan stemming in ons project omdat de betekenis van de woorden erg belangrijk is 
"""

def lemSentence(sentence):
    
    # tokenize
    token_words=word_tokenize(sentence)
    
    wordnet_lemmatizer = WordNetLemmatizer()
    
    # lemmatize
    lem_sentence=[]
    for word in token_words:   
        lem_sentence.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        lem_sentence.append(" ")
    
    return "".join(lem_sentence)

###### Testen van functies

In [26]:
# Train data inladen en tijdelijk eerste paar regels om functies te checken

train_data = pd.read_csv('train_data.csv')
train_data_head=train_data[:10].copy()

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  


In [28]:
# Toepassen van stemming functie op Train data

train_data_head['question1_stem'] = train_data_head['question1'].apply(stemSentence)
train_data_head['question2_stem'] = train_data_head['question2'].apply(stemSentence)

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  \
0  What is the step by step guide to invest in sh...             0   
1  What would happen if the Indian government sto...             0 

In [29]:
# Toepassen van lemmatization functie op Train data

train_data_head['question1_lem'] = train_data_head['question1'].apply(lemSentence)
train_data_head['question2_lem'] = train_data_head['question2'].apply(lemSentence)

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  \
0  What is the step by step guide to invest in sh...             0   
1  What would happen if the Indian government sto...             0 

######  Word2vec

In [30]:
# https://medium.com/@mishra.thedeepak/word2vec-in-minutes-gensim-nlp-python-6940f4e00980

# create trained model
sentences = brown.sents()
model = gensim.models.Word2Vec(sentences, min_count=1)
print('---- SAVE BROWN CORPUS MODEL ----')
model.save('brown_model')

# create brown_model
model = gensim.models.Word2Vec.load('brown_model')

# find similarity of words
print(model.similarity('woman', 'man'))

---- SAVE BROWN CORPUS MODEL ----
0.912204


  del sys.path[0]


##### Doc2vec

In [40]:
# https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5
# https://datascience.stackexchange.com/questions/23969/sentence-similarity-prediction

# create model

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(question1)]

max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [44]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [ 0.01910738  0.01698834 -0.01945818 -0.0152849   0.00559611  0.01271356
  0.02366393 -0.00546236 -0.00389903 -0.0077853   0.00632725  0.02083503
  0.01776579 -0.00873339  0.00331536  0.0104358  -0.00993257 -0.01187992
 -0.01308167 -0.00776289]
[('0', -0.1929418444633484)]
[ -1.42748412e-02  -1.85315013e-02  -1.62658957e-03   1.52987000e-02
  -8.50031618e-03  -2.54184455e-02  -2.23779939e-02   1.00819319e-02
  -2.08196566e-02  -1.57111138e-02  -8.49933270e-03   1.05077894e-02
  -1.42097082e-02  -2.52882182e-03   1.86181590e-02   9.18155070e-03
   1.12419454e-02   1.83722388e-03  -5.49770375e-05   5.54856192e-03]


In [46]:
from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("d2v.model")

question1 = train_data_head[['qid1', 'question1_stem']].copy()
print(question1)
question2 = train_data_head[['qid2', 'question2_stem']].copy()
print(question2)

# get similarity of two rows


   qid1                                     question1_stem
0     1  what is the step by step guid to invest in sha...
1     3  what is the stori of kohinoor ( koh-i-noor ) d...
2     5  how can I increas the speed of my internet con...
3     7   whi am I mental veri lone ? how can I solv it ? 
4     9  which one dissolv in water quikli sugar , salt...
5    11  astrolog : I am a capricorn sun cap moon and c...
6    13                              should I buy tiago ? 
7    15                   how can I be a good geologist ? 
8    17                  when do you use シ instead of し ? 
9    19  motorola ( compani ) : can I hack my charter m...
   qid2                                     question2_stem
0     2  what is the step by step guid to invest in sha...
1     4  what would happen if the indian govern stole t...
2     6  how can internet speed be increas by hack thro...
3     8  find the remaind when [ math ] 23^ { 24 } [ /m...
4    10           which fish would surviv in salt water 

###### Load the dataset

In [18]:
train_data = pd.read_csv('train_data.csv')
training_labels = pd.read_csv('train_labels.csv')
test_data = pd.read_csv('test_data.csv')

#print(train_data)
#print(training_labels)
#print(test_data)

###### Preprocessing
    
_The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form._

_However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma._

In [50]:
def text_regularize(dataframe):
    print ('***** Regularizing *****')
    # stemming
    
    
    # lematizing
    
    
 
def text_filtering(dataframe):
    print ('***** Filtering *****')
    
    # import regular expression
    import re
    
    # remove stopwords
    
    
    # remove special symbols with regular expression
        


print('Split test data')
test_q1 = test_data['question1']
test_q2 = test_data['question2']       
    
print('========= Clean testing data question 1 ====================')
# Do cleaning
text_regularize(test_q1)
text_filtering(test_q1)

print('========= Clean testing data question 2 ====================')
# Do cleaning
text_regularize(test_q2)
text_filtering(test_q2)

# Save as new file
test_q1.to_csv('test_q1.csv')

print('========= Finish preprocessing of data ==========')

Split test data
***** Regularizing *****
***** Filtering *****
***** Regularizing *****
***** Filtering *****


######  Word2vec