# intent detection with vector similarity

code based on https://www.kaggle.com/tj2552/similarity-techniques-nlp

In [1]:
%matplotlib inline

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
from sklearn import model_selection
import nltk
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import operator
print (sklearn.__version__)

0.19.0


In [3]:
from subprocess import check_output
# print(check_output(["ls", "../data/"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [14]:
from sklearn.model_selection import train_test_split

def read_data():
    df = pd.read_csv("data/train.csv", nrows=20000)
    print ("Shape of base training File = ", df.shape)
    # Remove missing values and duplicates from training data
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    print("Shape of base training data after cleaning = ", df.shape)
    return df

df = read_data()
df_train, df_test = train_test_split(df, test_size = 0.02)
print ('Shape of test data = ', df_test.shape)
print
print 'training data head'
print '------------------'
print (df_train.head(2))

('Shape of base training File = ', (20000, 6))
('Shape of base training data after cleaning = ', (20000, 6))
('Shape of test data = ', (400, 6))

training data head
------------------
        id  qid1  qid2                                          question1  \
88      88   177   178     Which is the best gaming laptop under 60k INR?   
1482  1482  2951  2952  What is more beneficial to learn, SolidWorks o...   

                                              question2  is_duplicate  
88      Which is the best gaming laptop under Rs 60000?             1  
1482  Are softwares like SolidWorks, CATIA and Pro E...             0  


Some Exploratory Data Analysis (EDA) on the data to get a look and feel about the data. Here we are trying to see the distribution of output data. Duplicate questions available etc.

In [17]:
def eda(df):
    print ("Duplicate Count = %s , Non Duplicate Count = %s" 
           %(df.is_duplicate.value_counts()[1],df.is_duplicate.value_counts()[0]))
    
    question_ids_combined = df.qid1.tolist() + df.qid2.tolist()
    
    print ("Unique Questions = %s" %(len(np.unique(question_ids_combined))))
    
    question_ids_counter = Counter(question_ids_combined)
    sorted_question_ids_counter = sorted(question_ids_counter.items(), key=operator.itemgetter(1))
    question_appearing_more_than_once = [i for i in question_ids_counter.values() if i > 1]
    print ("Count of Quesitons appearing more than once = %s" %(len(question_appearing_more_than_once)))

In [18]:
eda(df_train)

Duplicate Count = 7334 , Non Duplicate Count = 12266
Unique Questions = 37045
Count of Quesitons appearing more than once = 1756


### Train the dictionary

In [28]:
import re                           # regular expressions
import gensim                       # vector space modeling and topic modeling toolkit 
from gensim import corpora
from nltk.corpus import stopwords   # natural language toolkit
from nltk.stem.porter import *
nltk.__path__

['/home/ubuntu/anaconda2/lib/python2.7/site-packages/nltk']

In [40]:
# import nltk
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
words = re.compile(r"\w+",re.I)
stopword = stopwords.words('english')
stemmer = PorterStemmer()

In [42]:
def tokenize_questions(df):
    question_1_tokenized = []
    question_2_tokenized = []

    for q in df.question1.tolist():
        question_1_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    for q in df.question2.tolist():
        question_2_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    df["Question_1_tok"] = question_1_tokenized
    df["Question_2_tok"] = question_2_tokenized
    
    return df


In [43]:
def train_dictionary(df):
    
    questions_tokenized = df.Question_1_tok.tolist() + df.Question_2_tok.tolist()
    
    dictionary = corpora.Dictionary(questions_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary

In [44]:
df_train = tokenize_questions(df_train)
dictionary = train_dictionary(df_train)
print ("No of words in the dictionary = %s" %len(dictionary.token2id))

df_test = tokenize_questions(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


No of words in the dictionary = 4823


## create sparse vector matrices
Use Bag Of Words Technique to convert sentences into vectors. 

There are two vector matrices thus created where each of the matrix is a sparse matrix to save memory in the system.

In [45]:
def get_vectors(df, dictionary):
    
    question1_vec = [dictionary.doc2bow(text) for text in df.Question_1_tok.tolist()]
    question2_vec = [dictionary.doc2bow(text) for text in df.Question_2_tok.tolist()]
    
    question1_csc = gensim.matutils.corpus2csc(question1_vec, num_terms=len(dictionary.token2id))
    question2_csc = gensim.matutils.corpus2csc(question2_vec, num_terms=len(dictionary.token2id))
    
    return question1_csc.transpose(),question2_csc.transpose()

In [50]:
q1_csc, q2_csc = get_vectors(df_train, dictionary)
print '(size of the training data) X (no of words in the dictionary)'
print '-------------------------------------------------------------'
print ('q1_csc shape', q1_csc.shape)
print ('q2_csc shape', q2_csc.shape)

(size of the training data) X (no of words in the dictionary)
-------------------------------------------------------------
('q1_csc shape', (19600, 4823))
('q2_csc shape', (19600, 4823))
