In [971]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
%matplotlib inline
nltk.download('stopwords')
from sklearn import naive_bayes
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
import gensim


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [972]:
df= pd.read_csv('nlpdata.txt', sep=',,,', header= None,names=['Question','type'])#read the data

  """Entry point for launching an IPython kernel.


In [973]:
df.head()# displays top 5 rows 

Unnamed: 0,Question,type
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [974]:
df.shape# total 1483 observations that represent question and its type

(1483, 2)

In [0]:
df['type']=df['type'].str.strip()


In [976]:
df['type'].unique() #type of the qyestions present in a dataset

array(['unknown', 'what', 'when', 'who', 'affirmation'], dtype=object)

In [0]:
def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
   
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

In [0]:
df['clean_Question'] = df['Question'].apply(clean_str)

In [979]:
df.head()

Unnamed: 0,Question,type,clean_Question
0,how did serfdom develop in and then leave russ...,unknown,how did serfdom develop in and then leave russia
1,what films featured the character popeye doyle ?,what,what films featured the character popeye doyle
2,how can i find a list of celebrities ' real na...,unknown,how can i find a list of celebrities real names
3,what fowl grabs the spotlight after the chines...,what,what fowl grabs the spotlight after the chines...
4,what is the full form of .com ?,what,what is the full form of com


In [980]:
df.drop(['Question'],inplace=True, axis=1)
df.head()

Unnamed: 0,type,clean_Question
0,unknown,how did serfdom develop in and then leave russia
1,what,what films featured the character popeye doyle
2,unknown,how can i find a list of celebrities real names
3,what,what fowl grabs the spotlight after the chines...
4,what,what is the full form of com


Remove stopwords that don't have any significance.

TFIDF Vectorizer - Convert a collection of raw documents to a matrix of TF-IDF features. All stopwords are removed.

TF-IDF stands for “term frequency-inverse document frequency”, meaning the weight assigned to each token not only depends on its frequency in a document but also how recurrent that term is in the entire corpora. TF-IDF says how important that word is to that document with respect to the corpus.

In [981]:
#TFIDF Vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)
vectorizer.fit(df) #Learn vocabulary and idf from training set.

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={"you'll", 're', "haven't", "couldn't", 'above', 'about', 'all', 'not', 'your', 'this', 'their', 'once', 'won', 'ma', 'were', 'both', 'yourself', "you've", 'here', 'ain', 'just', 'i', 'these', "isn't", 'until', 'too', 'you', 'they', 'by', 'should', 'mustn', 'theirs', 'it', 'those', 'who',...'into', 'then', 'will', 'ourselves', 'against', 'couldn', 's', 'and', 'now', 'why', "aren't", 'but'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [982]:
X = vectorizer.fit_transform(df.clean_Question)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y=le.fit_transform(df['type'])
print(list(le.classes_))
print(Y.shape)

Y= pd.DataFrame(data=Y, columns=["Y"])
df1 = pd.concat([df,Y],axis=1)

df1.drop('type',axis=1,inplace=True)
print(df1.head())
df1.head()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state = 123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['affirmation', 'unknown', 'what', 'when', 'who']
(1483,)
                                      clean_Question  Y
0   how did serfdom develop in and then leave russia  1
1     what films featured the character popeye doyle  2
2    how can i find a list of celebrities real names  1
3  what fowl grabs the spotlight after the chines...  2
4                       what is the full form of com  2
(1038, 3420)
(445, 3420)
(1038, 1)
(445, 1)


**Naive Bayes Classifier**- We can train a classifier to try to predict the type of a question. We will start with a Naive Bayes classifier, which provides a nice baseline for this task. Scikit-learn includes several variants of this classifier, the one most suitable for text is the multinomial variant.

In [983]:
clf = naive_bayes.MultinomialNB()
model=clf.fit(X_train, y_train)
predicted_class=model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [984]:
print(classification_report(predicted_class,y_test))
print("Accuracy :",clf.score(X_test,y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.13      1.00      0.23        11
           2       0.93      0.47      0.62       366
           3       0.00      0.00      0.00         0
           4       0.37      0.63      0.47        68

   micro avg       0.51      0.51      0.51       445
   macro avg       0.29      0.42      0.27       445
weighted avg       0.83      0.51      0.59       445

Accuracy : 0.5078651685393258


  'recall', 'true', average, warn_for)


In [985]:

np.random.seed(42) # set numpy seed before importing keras.
df.head()

Unnamed: 0,type,clean_Question
0,unknown,how did serfdom develop in and then leave russia
1,what,what films featured the character popeye doyle
2,unknown,how can i find a list of celebrities real names
3,what,what fowl grabs the spotlight after the chines...
4,what,what is the full form of com


# **Convert Each Question to a Word List before feeding to Word2Vec**

In [0]:
documents = []


In [0]:
for doc in df['clean_Question']:
    documents.append(doc.split(' '))

# Build the Model

In [0]:
#Build the model

model = gensim.models.Word2Vec(documents, #Word list
                               min_count=3, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPUs, no of CPU cores
                               size=50,  #Embedding size,no of neurons
                               window=4, #Maximum Distance between current and predicted word, consider 5 neighbours on the left and 5 neighbours on the right
                               iter=10   #Number of iterations over the text corpus, how many times the network should run
                              )  

# Exploring the model
How many words in the model and how many features

In [989]:
model.wv.syn0.shape 

  """Entry point for launching an IPython kernel.


(622, 50)

In [990]:
model.wv.vocab

{'a': <gensim.models.keyedvectors.Vocab at 0x7f81b05d4550>,
 'abbreviation': <gensim.models.keyedvectors.Vocab at 0x7f81b05c84a8>,
 'about': <gensim.models.keyedvectors.Vocab at 0x7f81b060a518>,
 'academy': <gensim.models.keyedvectors.Vocab at 0x7f81b060ab00>,
 'actor': <gensim.models.keyedvectors.Vocab at 0x7f81b05af8d0>,
 'actress': <gensim.models.keyedvectors.Vocab at 0x7f81b05a6780>,
 'address': <gensim.models.keyedvectors.Vocab at 0x7f81b060ae10>,
 'africa': <gensim.models.keyedvectors.Vocab at 0x7f81b05af668>,
 'after': <gensim.models.keyedvectors.Vocab at 0x7f81b05d4668>,
 'air': <gensim.models.keyedvectors.Vocab at 0x7f81b05a6dd8>,
 'airplane': <gensim.models.keyedvectors.Vocab at 0x7f81b05fe9e8>,
 'airport': <gensim.models.keyedvectors.Vocab at 0x7f81b05f56d8>,
 'al': <gensim.models.keyedvectors.Vocab at 0x7f81b05fe0f0>,
 'all': <gensim.models.keyedvectors.Vocab at 0x7f81b05f5d68>,
 'amendment': <gensim.models.keyedvectors.Vocab at 0x7f81b05d4eb8>,
 'america': <gensim.models.k

Get an embedding for a word

In [991]:
model.wv['who']

array([ 0.17393889,  0.0788199 , -0.3677306 ,  0.61115015,  0.67875206,
        0.02729714, -0.25921696, -0.14539659,  0.30043313,  0.42497548,
       -0.34613478,  0.43910974,  0.22716029, -0.47182712, -0.1194673 ,
       -0.1190585 ,  0.18031687,  0.07993813, -0.31380054, -0.16885352,
       -0.7373604 , -0.10150864,  0.12240679,  0.27477455, -0.03392056,
        0.19255532,  0.4028848 , -0.17500515, -0.21162508, -0.29269925,
        0.21099329,  0.60459834,  0.18188244,  0.13941582, -0.4705176 ,
        0.24291776, -0.29422086,  0.17454384, -0.38761583, -0.9329524 ,
        0.26916343, -0.06634206, -0.08006347,  0.40433732, -0.07506761,
       -0.22201298,  0.39587796,  0.55937016, -0.42336994,  0.23442756],
      dtype=float32)


Saving the model

In [0]:
model.save('word2vec-question-50')

Finding Words which have similar meaning

In [993]:
model.wv.most_similar('your')

  if np.issubdtype(vec.dtype, np.int):


[('a', 0.999478816986084),
 ('have', 0.9994713068008423),
 ('to', 0.9994654655456543),
 ('with', 0.9994490146636963),
 ('for', 0.9994267225265503),
 ('me', 0.9994257688522339),
 ('that', 0.9994202852249146),
 ('called', 0.9994165897369385),
 ('has', 0.999410092830658),
 ('this', 0.999403715133667)]

In [994]:


#['who','when','what','affirmation','unknown'])

from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
le = preprocessing.LabelEncoder()
Y=le.fit_transform(df['type'])
print(list(le.classes_))
print(a)
labels=to_categorical(np.asarray(Y))
labels.shape



#labels= pd.DataFrame(data=a, columns=["labels"])
#df = pd.concat([df,labels],axis=1)

#df.drop('type',axis=1,inplace=True)
#print(df.head())
#Y=df['labels']
#Y.shape

['affirmation', 'unknown', 'what', 'when', 'who']
[3 4 3 ... 2 2 2]


(1483, 5)

Split Data into Training and Test Data

In [0]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_Question'],
    labels,
    test_size=0.2, 
    random_state=42
)

**Build the Tokenizer**

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [0]:
top_words = 100

In [0]:
t = Tokenizer(num_words=top_words) # num_words -> Vocablury size

In [0]:
t.fit_on_texts(X_train.tolist())

Prepare Training and Test Data

Get the word index for each of the word in the question

In [0]:
X_train = t.texts_to_sequences(X_train.tolist())
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each question?

Pad Sequences 

In [1001]:
df['word_count'] = df['clean_Question'].apply(lambda x: len(str(x).split(" ")))
df[['clean_Question','word_count']].head()
df.groupby(['clean_Question'])['word_count'].max()

clean_Question
are the last two numbers the gal                                                                                      7
are the lights dimmable                                                                                               4
are there six filters for the price shown                                                                             8
are these drip pans dishwasher safe                                                                                   6
are these lead free                                                                                                   4
are they made in the usa                                                                                              6
can an extended warranty be purchased for this product                                                                9
can i get it in india                                                                                                 6
can i install on wall    

In [0]:
from tensorflow.python.keras.preprocessing import sequence
max_question_length = 30
X_train = sequence.pad_sequences(X_train,maxlen=max_question_length, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=max_question_length, padding='post')


Build Embedding Matrix from Pre-Trained Word2Vec

Load pre-trained Gensim Embeddings

In [0]:
word2vec = gensim.models.Word2Vec.load('word2vec-question-50')

Embedding Size

In [1004]:
embedding_vector_length = word2vec.wv.syn0.shape[1]

  """Entry point for launching an IPython kernel.


Build matrix for current data

In [0]:
embedding_matrix = np.zeros((top_words + 1, embedding_vector_length))

In [0]:
for word, i in sorted(t.word_index.items(),key=lambda x:x[1]):
    if i > top_words:
        break
    if word in word2vec.wv.vocab:
        embedding_vector = word2vec.wv[word]
        embedding_matrix[i] = embedding_vector

Build the Graph

In [0]:
from tensorflow.python.keras.models import Sequential

In [0]:
from tensorflow.python.keras.layers import Dropout, Dense, Embedding, Flatten, LSTM

In [0]:
model = Sequential()

Add Embedding layer



In [0]:
model.add(Embedding(top_words + 1,
                    embedding_vector_length,
                    input_length=max_question_length,
                   weights=[embedding_matrix],
                   trainable=False)
         )

Add Layer with 100 LSTM Memory Units

In [1011]:
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 30, 50)            5050      
_________________________________________________________________
lstm_31 (LSTM)               (None, 100)               60400     
_________________________________________________________________
dense_22 (Dense)             (None, 5)                 505       
Total params: 65,955
Trainable params: 60,905
Non-trainable params: 5,050
_________________________________________________________________
None


Execute the graph

In [1012]:
model.fit(X_train,y_train,
          epochs=20,
          batch_size=128,          
          validation_data=(X_test, y_test),
         )

Train on 1186 samples, validate on 297 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f81af2f5470>

In [0]:
example = t.texts_to_sequences(["what is a nanometer?"])

example = sequence.pad_sequences(example,maxlen=max_question_length, padding='post')

In [1014]:
example.shape

(1, 30)

In [1015]:
np.argmax(model.predict(example))

2