In [65]:
document = """NLP is used to analyze text, allowing machines to understand how
human’s speak. This human-computer interaction enables real-world applications
like automatic text summarization, sentiment analysis, topic extraction,
named entity recognition, parts-of-speech tagging, relationship extraction, 
stemming, and more. NLP is commonly used for text mining, machine translation,
and automated question answering."""

### 1. Sentence Segmentation

In [66]:
PYTHONWARNINGS="ignore"

In [67]:
from nltk import sent_tokenize

sentences = sent_tokenize(document)

In [68]:
len(sentences)

3

### 2. Tokenization

In [69]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(document)

tokens[:10]

['NLP',
 'is',
 'used',
 'to',
 'analyze',
 'text',
 ',',
 'allowing',
 'machines',
 'to']

### 3. Stopwords

In [70]:

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [71]:
words = [w for w in tokens if not w in stop_words]
print(words[:20])

['NLP', 'used', 'analyze', 'text', ',', 'allowing', 'machines', 'understand', 'human', '’', 'speak', '.', 'This', 'human-computer', 'interaction', 'enables', 'real-world', 'applications', 'like', 'automatic']


### 4. Lemmatization ( and stemming)

In [73]:
from nltk.stem import PorterStemmer

# the most commonly used stemmer
ps = PorterStemmer()
ps.stem("sang"), ps.stem("singing")

('sang', 'sing')

In [74]:
from nltk.stem import WordNetLemmatizer
# Without a POS tag, it assumes the word to be NOUN, be default.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("sang", pos='v'), lemmatizer.lemmatize("singing", pos='v')

('sing', 'sing')

In [34]:
stemmed = [ps.stem(word) for word in words]
print(stemmed[:20])

['nlp', 'use', 'analyz', 'text', ',', 'allow', 'machin', 'understand', 'human', '’', 'speak', '.', 'thi', 'human-comput', 'interact', 'enabl', 'real-world', 'applic', 'like', 'automat']


In [36]:

lemmas = [lemmatizer.lemmatize(word, pos='v') for word in words]
print(lemmas[:20])

['NLP', 'use', 'analyze', 'text', ',', 'allow', 'machine', 'understand', 'human', '’', 'speak', '.', 'This', 'human-computer', 'interaction', 'enable', 'real-world', 'applications', 'like', 'automatic']


## CHALLENGE : Identifying Duplicate questions in Quora

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

  return f(*args, **kwds)


In [81]:

df = pd.read_csv("train.csv",encoding="ISO-8859-1")
df = df[:2000]

In [42]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier

# train, test = train_test_split(df, train_size=0.8)

In [43]:
# parallel computing 
# classifier = RandomForestClassifier(n_jobs=-1)

# classifier.fit(train[['q1_word_count', 'q2_word_count', 'common_word_count']], train.is_duplicate)

In [44]:
# test.shape

# score = classifier.score(test[['q1_word_count', 'q2_word_count', 'common_word_count']], test.is_duplicate)
# print('score is', score)

In [76]:
df[['question1', 'question2', 'is_duplicate']]

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in share market in,What is the step by step guide to invest in share,0
1,What is the story of Kohinoor,What would happen if the Indian government stole the Kohinoor diamond,0
2,How can I increase the speed of my internet connection while using a,How can Internet speed be increased by hacking through,0
3,Why am I mentally very How can I solve,Find the remainder when is divided by,0
4,Which one dissolve in water quikly methane and carbon di,Which fish would survive in salt,0
5,I am a Capricorn Sun Cap moon and cap does that say about,a triple Capricorn Moon and ascendant in What does this say about,1
6,Should I buy,What keeps childern active and far from phone and video,0
7,How can I be a good,What should I do to be a great,1
8,When do you use instead of,When do you use instead of,0
9,Motorola Can I hack my Charter Motorolla,How do I hack Motorola DCX3400 for free,0


In [9]:
# let us observe the duplicates
df[df['is_duplicate'] == 1].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,common_word_count
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1,16,16,8
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,7,9,4
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,9,8,5
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,7,8,6
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1,7,6,5


###  is there any null entry for questions?

In [77]:

df[df['question1'].isnull()]
df[df['question2'].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,common_word_count,is_special_char1,is_special_char2,q1_removing_stopwords,q2_removing_stopwords,q1_synonyms,synonyms_count,similarity


In [11]:
# cast to str.
df['question1'] = df['question1'].astype(str)
df['question2'] = df['question2'].astype(str)

### is there any special chars?

In [82]:

import string

special_chars = string.punctuation
def check_special_char(text):
    return any(char in string.punctuation for char in text)

In [83]:
# check special chars in column : question1
df['is_special_char1'] = df['question1'].apply(lambda x: check_special_char(x))

# count entries having special chars
df[df['is_special_char1'] == True]['question1'].head(5)

0    What is the step by step guide to invest in share market in india?          
1    What is the story of Kohinoor (Koh-i-Noor) Diamond?                         
2    How can I increase the speed of my internet connection while using a VPN?   
3    Why am I mentally very lonely? How can I solve it?                          
4    Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?
Name: question1, dtype: object

In [14]:
# check special chars in column : question2
df['is_special_char2'] = df['question2'].apply(lambda x: check_special_char(x))

# count entries having special chars
df[df['is_special_char2'] == True]['question2'].head(5)

0    What is the step by step guide to invest in share market?                               
1    What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
2    How can Internet speed be increased by hacking through DNS?                             
3    Find the remainder when [math]23^{24}[/math] is divided by 24,23?                       
4    Which fish would survive in salt water?                                                 
Name: question2, dtype: object

### DATA CLEANING

#### 1. Fill Null with empty string
#### 2. Remove special characters ('#', '~', '|'...)
#### 3. Convert mathematical symbols to text value
#### 4. Convert abbreviation to whole : ( what's -> what is)

In [15]:
# Fill null values with empty string
df = df.fillna({"question2": ''})

df[df['question2'].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,common_word_count,is_special_char1,is_special_char2


In [16]:

def remove_special_chars(text):
    return " ".join(e for e in text.split() if e.isalnum())


import re
def remove_abbr_and_math_symbol(text):
    # Mathematical symbols
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\-", " minus ", text)
    text = re.sub(r"\*", " multiply ", text)
    text = re.sub(r"\=", "equal", text)
    
    # abbrv.
    text = re.sub(r"What's", "What is ", text)
    text = re.sub(r"Who's", "Who is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"'m", " am ", text)
    
    text = text.strip()
    return text


In [84]:
# Remove special chars.
df['question1'] = df['question1'].apply(lambda x: remove_special_chars(x))
df['question2'] = df['question2'].map(lambda x: remove_special_chars(x))

In [18]:
df['is_special_char2'] = df['question2'].apply(lambda x: check_special_char(x))

In [19]:
# Check special chars
df[df['is_special_char2'] == True]['question2'].head(5)

Series([], Name: question2, dtype: object)

In [20]:
# Pre-processing cases (mathematical symbols, abbrevations and )
df['question1'] = df['question1'].apply(lambda x: remove_abbr_and_math_symbol(x))
df['question2'] = df['question2'].map(lambda x: remove_abbr_and_math_symbol(x))

## FEATURE ENGINEERING

### 1. Removing stopwords
### 2. Features
 i. total words count
 ii. common words count
 iii. synonym count
 iv. similarity (cosine)

In [45]:
from sklearn.feature_extraction import text

In [46]:
stopwords = text.ENGLISH_STOP_WORDS

In [47]:
def remove_stopwords(text):
    words = [word for word in text.split()]
    words_without_stopwords = [word for word in words if word not in stopwords]
    return " ".join(words_without_stopwords)

In [48]:
output = remove_stopwords('This is a test')
output

'This test'

In [49]:
df['q1_removing_stopwords'] = df['question1'].apply(lambda x: remove_stopwords(x))
df['q2_removing_stopwords'] = df['question2'].apply(lambda x: remove_stopwords(x))

In [50]:
def common_words_count(x):
    q1, q2 = x
    return len(set(str(q1).lower().split()) & set(str(q2).lower().split()))

def total_words_count(question):
    return len(str(question).split())


In [51]:

df['q1_word_count'] = df['question1'].apply(total_words_count)
df['q2_word_count'] = df['question2'].apply(total_words_count)
df['common_word_count'] = df[['question1', 'question2']].apply(common_words_count, axis=1)

In [52]:
import nltk
from nltk.corpus import wordnet as wn

In [53]:
wn.synsets('change')[0].lemma_names()

['change', 'alteration', 'modification']

In [54]:
def get_lemmas(text):
    synsets = wn.synsets(text)
    lemmas = []
    if synsets:
        lemmas = synsets[0].lemma_names()
    return lemmas
    
def get_synonyms_list(text):
    tokens = nltk.word_tokenize(text)
    synonyms = []
    for token in tokens:
        synonyms.append(get_lemmas(token))
    synonyms_set = set([y for x in synonyms for y in x if y])
    return synonyms_set
    

def get_synonyms_count(row):
    q1_synonyms = row['q1_synonyms']
    q2 = row['q2_removing_stopwords']
    q2_tokens = nltk.word_tokenize(q2)
    common_words = q1_synonyms.intersection(q2_tokens)
    return len(common_words)
    

In [55]:
df['q1_synonyms'] = df.apply(lambda row: get_synonyms_list(row['q1_removing_stopwords']), axis=1)

In [56]:

df['synonyms_count'] = df.apply(lambda row: get_synonyms_count(row), axis=1)

In [57]:
df[['synonyms_count','q1_removing_stopwords', 'q2_removing_stopwords']][:5]

Unnamed: 0,synonyms_count,q1_removing_stopwords,q2_removing_stopwords
0,4,What step step guide invest share market,What step step guide invest share
1,0,What story Kohinoor,What happen Indian government stole Kohinoor diamond
2,1,How I increase speed internet connection using,How Internet speed increased hacking
3,0,Why I mentally How I solve,Find remainder divided
4,0,Which dissolve water quikly methane carbon di,Which fish survive salt


In [58]:
import spacy
import wmd

nlp = spacy.load('en_core_web_md')

In [59]:
def get_cosine_distance(question1, question2):
    doc1 = nlp(question1)
    doc2 = nlp(question2)
    return doc1.similarity(doc2)

In [60]:
get_cosine_distance('I speak three languages', 'I know Nepali, English, German')

0.6850770621257415

In [61]:
def get_similarity(row):
    q1 = row['q1_removing_stopwords']
    q2 = row['q2_removing_stopwords']
    return get_cosine_distance(q1, q2)

In [62]:
df['similarity'] = df.apply(lambda row: get_similarity(row), axis=1)

In [63]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_word_count,q2_word_count,common_word_count,is_special_char1,is_special_char2,q1_removing_stopwords,q2_removing_stopwords,q1_synonyms,synonyms_count,similarity
0,0,1,2,What is the step by step guide to invest in share market in,What is the step by step guide to invest in share,0,13,11,10,True,False,What step step guide invest share market,What step step guide invest share,"{portion, marketplace, market_place, invest, measure, guide, commit, market, share, percentage, place, step, put, part, usher}",4,0.982112
1,1,3,4,What is the story of Kohinoor,What would happen if the Indian government stole the Kohinoor diamond,0,6,11,3,True,False,What story Kohinoor,What happen Indian government stole Kohinoor diamond,"{narrative, tale, story, narration}",0,0.622239
2,2,5,6,How can I increase the speed of my internet connection while using a,How can Internet speed be increased by hacking through,0,13,9,4,True,False,How I increase speed internet connection using,How Internet speed increased hacking,"{addition, gain, net, connection, increase, I, speed, victimization, internet, connectedness, cyberspace, using, iodine, connexion, victimisation, exploitation, velocity, iodin, atomic_number_53}",1,0.910691
3,3,7,8,Why am I mentally very How can I solve,Find the remainder when is divided by,0,9,7,0,True,False,Why I mentally How I solve,Find remainder divided,"{puzzle_out, lick, I, work, figure_out, solve, iodine, work_out, why, mentally, wherefore, iodin, atomic_number_53}",0,0.530618
4,4,9,10,Which one dissolve in water quikly methane and carbon di,Which fish would survive in salt,0,10,6,2,True,False,Which dissolve water quikly methane carbon di,Which fish survive salt,"{methane, carbon, water, C, H2O, atomic_number_6, dissolve}",0,0.631401


In [64]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

train, test = train_test_split(df, train_size=0.8)

classifier.fit(train[['synonyms_count', 'similarity', 'q1_word_count', 'q2_word_count', 'common_word_count']], train.is_duplicate)
score = classifier.score(test[['synonyms_count', 'similarity', 'q1_word_count', 'q2_word_count', 'common_word_count']], test.is_duplicate)
print('score is', score)



score is 0.6425
