Chapter 6 Deep Learning for NLP

# Information Retrieval System

Usages:<br>
1. Search Engines
2. Document Retrieval systems 
3. Passage Retrieval systems 
4. Question Answering Systems

In [1]:
Doc1 = ['''With the Union cabinet approving the amendments to the
Motor Vehicles Act, 2016, those caught for drunken driving will
have to have really deep pockets, as the fine payable in court
has been enhanced to Rs 10,000 for first-time offenders.''']

In [2]:
Doc2 = ['''"Natural language processing (NLP) is an area of
computer science and artificial intelligence concerned with the
interactions between computers and human (natural) languages,
in particular how to program computers to process and analyze
large amounts of natural language data.''']

In [3]:
Doc3 = ['''"He points out that public transport is very good in
Mumbai and New Delhi, where there is a good network of suburban
and metro rail systems.''']

In [4]:
Doc4 = ['''But the man behind the wickets at the other end was
watching just as keenly. With an affirmative nod from Dhoni,
India captain Rohit Sharma promptly asked for a review. Sure
enough, the ball would have clipped the top of middle and leg.''']

In [5]:
import re
from nltk.corpus import stopwords

In [6]:
stp = stopwords.words('english')

In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    #text = re.sub(r'[^\w\s]+',' ', text)
    text = ' '.join([word for word in text.split() if word not in stp])
    return text

In [8]:
path = r'E:\GoogleNews-vectors-negative300.bin'

In [9]:
import gensim
import numpy as np

In [10]:
w2vec = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [11]:
def get_embeddings(word):
    if word in w2vec.vocab:
        return w2vec[word]
    else:
        return np.zeros(300)

In [12]:
# Average embeddings

In [13]:
fin = Doc1+Doc2+Doc3+Doc4

In [14]:
out_dict = {}

In [15]:
import nltk

In [16]:
for se in fin:
    average_vector = np.mean(np.array([get_embeddings(word) for word in nltk.word_tokenize(preprocess(se))]),axis=0)
    d = {se:average_vector}
    out_dict.update(d)

In [17]:
# Get similarity between query and documents vectorsout_dict

In [18]:
import scipy

In [19]:
def get_similarity(query, doc):
    cos_sim = np.dot(query, doc)/(np.linalg.norm(query)*np.linalg.norm(doc))
    return cos_sim

In [20]:
# Function to generate ranked documents

In [23]:
def Rank_text(query):
    query_vector = np.mean(np.array([get_embeddings(word) for word in nltk.word_tokenize(preprocess(query))]),axis=0)
    rank = []
    for k,v in out_dict.items():
        rank.append((k,get_similarity(query_vector, v)))
    rank = sorted(rank, key=lambda x:x[1], reverse=True)
    return rank

In [25]:
Rank_text('who was playing cricket')

[('But the man behind the wickets at the other end was\nwatching just as keenly. With an affirmative nod from Dhoni,\nIndia captain Rohit Sharma promptly asked for a review. Sure\nenough, the ball would have clipped the top of middle and leg.',
  0.47628248),
 ('With the Union cabinet approving the amendments to the\nMotor Vehicles Act, 2016, those caught for drunken driving will\nhave to have really deep pockets, as the fine payable in court\nhas been enhanced to Rs 10,000 for first-time offenders.',
  0.2899310253890867),
 ('"He points out that public transport is very good in\nMumbai and New Delhi, where there is a good network of suburban\nand metro rail systems.',
  0.24180555),
 ('"Natural language processing (NLP) is an area of\ncomputer science and artificial intelligence concerned with the\ninteractions between computers and human (natural) languages,\nin particular how to program computers to process and analyze\nlarge amounts of natural language data.',
  0.19640841197922515

# Text Classification Using Deep Learning

Spam ham classification from sms dataset

In [5]:
import pandas as pd

In [6]:
file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [7]:
dataset = file_content[['v2','v1']]

In [22]:
dataset.columns = ['features', 'target']

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [97]:
X_train, X_test, y_train, y_test = train_test_split(dataset['features'],dataset['target'],test_size=0.2)

In [98]:
MAX_SEQUENCE_LENGTH = 300

In [99]:
MAX_NB_WORDS = 20000

In [100]:
EMB_DIM = 100

In [101]:
# Tokenize for word Embeddings

In [102]:
import tensorflow as tf
from tensorflow import keras

In [103]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [104]:
# Tokenize features

In [105]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [106]:
tokenizer.fit_on_texts(X_train)

In [107]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [108]:
# Label encode targets and convert to categorical

In [109]:
lbl = LabelEncoder()
lbl.fit(y_train)

LabelEncoder()

In [110]:
y_train.shape

(4457,)

In [111]:
y_train = lbl.transform(y_train)
y_test = lbl.transform(y_test)

In [112]:
y_train.shape

(4457,)

In [113]:
y_train = to_categorical(np.asarray(y_train))

In [114]:
y_train.shape

(4457, 2)

In [115]:
y_test = to_categorical(np.asarray(y_test))

In [116]:
# Padding

In [117]:
X_train = pad_sequences(X_train,maxlen=MAX_SEQUENCE_LENGTH)

In [118]:
X_test = pad_sequences(X_test,maxlen=MAX_SEQUENCE_LENGTH)

In [119]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457, 300), (1115, 300), (4457, 2), (1115, 2))

# Model Building

1. CNN

In [123]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.Dropout(0.5),
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.MaxPooling1D(5),
    keras.layers.Dropout(0.5),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
    keras.layers.MaxPooling1D(5),
    keras.layers.Dropout(0.5),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax')
])

In [124]:
model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [125]:
history = model.fit(X_train, y_train, batch_size=64 ,epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [126]:
y_pred = model.predict(X_test)

In [129]:
y_pred

array([[0.3016559 , 0.69834405],
       [0.84343934, 0.15656064],
       [0.825979  , 0.17402096],
       ...,
       [0.79840803, 0.20159195],
       [0.7990837 , 0.2009163 ],
       [0.83477   , 0.16523004]], dtype=float32)

In [130]:
y_pred.round()

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [132]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96       972
           1       1.00      0.45      0.62       143

   micro avg       0.93      0.93      0.93      1115
   macro avg       0.96      0.72      0.79      1115
weighted avg       0.93      0.93      0.92      1115
 samples avg       0.93      0.93      0.93      1115



2. RNN

In [135]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.SimpleRNN(units=2, input_shape=(None,1)),
    keras.layers.Dense(units=2,activation='softmax')
])

In [137]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [138]:
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [140]:
y_pred = model.predict(X_test)

In [144]:
model.evaluate(X_test,y_test)



[0.0764990895986557, 0.9847533702850342]

In [143]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       972
           1       1.00      0.88      0.94       143

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.99      0.98      0.98      1115
 samples avg       0.98      0.98      0.98      1115



3. LSTM

In [150]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=MAX_NB_WORDS, output_dim=EMB_DIM, input_length=MAX_SEQUENCE_LENGTH),
    keras.layers.LSTM(units=2, activation='relu', return_sequences=True),
    keras.layers.Dropout(0.2),
    keras.layers.BatchNormalization(),
    keras.layers.Flatten(),
    keras.layers.Dense(units=2,activation='softmax')
])

In [151]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [152]:
history = model.fit(X_train,y_train, batch_size=16, epochs=5, validation_data=(X_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [153]:
y_pred = model.predict(X_test)

In [160]:
print(classification_report(y_test,y_pred.round()))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       972
           1       0.97      0.80      0.88       143

   micro avg       0.97      0.97      0.97      1115
   macro avg       0.97      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115
 samples avg       0.97      0.97      0.97      1115



# Next Word Prediction

In [5]:
import pandas as pd

In [16]:
dataset = pd.read_csv(r'spam.csv',encoding='latin1')