### Quora Question Pairs 

Can you identify question pairs that have the same intent?


###### Import libraries

In [27]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [34]:
# One-time download of the NLTK resources (uncomment on the first run)

#nltk.download('punkt')
#nltk.download('wordnet')

# Suppress unwarranted pandas chained-assignment warnings
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Renée\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


###### Stemming

In [2]:
# Two stemmers are set up: Porter and Lancaster (the latter is the more
# 'aggressive' of the two).
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [23]:
# Stemming helper (uses the Porter stemmer)

def stemSentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is split with ``word_tokenize`` and each token is passed to
    the module-level ``porter`` stemmer. Every stem is followed by a single
    space, so a non-empty result ends with a trailing space.
    """
    stems = (porter.stem(token) for token in word_tokenize(sentence))
    return "".join(f"{stem} " for stem in stems)

###### Lemmatization

In [58]:
# Lemmatization helper.
# "Lemmatization, unlike Stemming, reduces the inflected words properly ensuring
# that the root word belongs to the language."
# Lemmatization is probably a better fit than stemming for this project because
# the meaning of the words is very important.

def lemSentence(sentence, pos="v"):
    """Return `sentence` with every token replaced by its WordNet lemma.

    Parameters
    ----------
    sentence : str
        Text to process; it is split into tokens with ``word_tokenize``.
    pos : str, optional
        Part-of-speech tag handed to ``WordNetLemmatizer.lemmatize`` for every
        token. Defaults to ``"v"`` (verb), the value this function always used.

    Returns
    -------
    str
        The lemmas, each followed by a single space (so a non-empty result
        ends with a trailing space).
    """
    # Created here rather than read from a global: no earlier cell defines a
    # shared lemmatizer, so relying on one raises NameError on a fresh kernel.
    lemmatizer = WordNetLemmatizer()

    return "".join(
        f"{lemmatizer.lemmatize(token, pos=pos)} "
        for token in word_tokenize(sentence)
    )

###### Testing the functions

In [61]:
# Load the training data and keep the first few rows as a small sample for
# checking the preprocessing functions.

train_data = pd.read_csv('train_data.csv')
# Take an explicit copy: later cells add columns to this sample, and assigning
# into a plain slice of train_data is exactly what triggers pandas'
# chained-assignment warning.
train_data_head = train_data.head(10).copy()

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  


In [62]:
# Apply the stemming function to both question columns of the sample

for source_col, stem_col in (('question1', 'question1_stem'),
                             ('question2', 'question2_stem')):
    train_data_head[stem_col] = train_data_head[source_col].apply(stemSentence)

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  \
0  What is the step by step guide to invest in sh...             0   
1  What would happen if the Indian government sto...             0 

In [63]:
# Apply the lemmatization function to both question columns of the sample

for source_col, lem_col in (('question1', 'question1_lem'),
                            ('question2', 'question2_lem')):
    train_data_head[lem_col] = train_data_head[source_col].apply(lemSentence)

print(train_data_head)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14                                Should I buy tiago?   
7   7    15    16                     How can I be a good geologist?   
8   8    17    18                    When do you use シ instead of し?   
9   9    19    20  Motorola (company): Can I hack my Charter Moto...   

                                           question2  is_duplicate  \
0  What is the step by step guide to invest in sh...             0   
1  What would happen if the Indian government sto...             0 

###### Load the dataset

In [11]:
# Read the training data, the training labels and the test data from CSV
# files in the working directory (in that order).
train_data, training_labels, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'train_labels.csv', 'test_data.csv')
)

# To inspect the frames, print train_data, training_labels or test_data.

###### Preprocessing
    
_The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form._

_However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma._

In [50]:
def text_regularize(dataframe):
    """Placeholder for the regularization step (stemming / lemmatizing).

    At the moment this only prints a progress banner: `dataframe` is neither
    read nor modified, and the function returns None.
    """
    # TODO: stemming
    # TODO: lemmatizing
    print('***** Regularizing *****')
 
def text_filtering(dataframe):
    """Placeholder for the filtering step (stopwords / special symbols).

    At the moment this only prints a progress banner and imports ``re``:
    `dataframe` is neither read nor modified, and the function returns None.
    """
    print('***** Filtering *****')

    # Regular expressions, intended for the symbol removal below (unused so far)
    import re

    # TODO: remove stopwords
    # TODO: remove special symbols with a regular expression


# Split the test set into its two question columns
print('Split test data')
test_q1, test_q2 = test_data['question1'], test_data['question2']

# Run both cleaning steps on each question column, announcing each one first
for banner, questions in (
    ('========= Clean testing data question 1 ====================', test_q1),
    ('========= Clean testing data question 2 ====================', test_q2),
):
    print(banner)
    text_regularize(questions)
    text_filtering(questions)

# Save as new file
# NOTE(review): only test_q1 is written, and the return values of the cleaning
# calls are discarded, so the saved column is whatever test_q1 holds here -
# confirm this is intended.
test_q1.to_csv('test_q1.csv')

print('========= Finish preprocessing of data ==========')

Split test data
***** Regularizing *****
***** Filtering *****
***** Regularizing *****
***** Filtering *****


######  Word2vec