<a href="https://colab.research.google.com/github/sandipanbasu/aiml-capstone/blob/master/mrc_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Set Up Google Drive

In [1]:
import tensorflow as tf
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('wordnet')
import re
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
import pickle
# we will store the params as we go along in this object
params_svm = {}
# Google Drive locations (available once Drive is mounted at /content/drive):
# project_path -> directory holding the SQuAD 2.0 training CSV
project_path = "/content/drive/My Drive/AIML-MRC-Capstone/datasets/Squad2.0/TrainingDataset/"
# model_path -> directory where fitted vectorizers, encoders and models are pickled
model_path = "/content/drive/My Drive/AIML-MRC-Capstone/models/"
# tensorboard_logpath -> TensorBoard log directory (not referenced elsewhere in the visible cells)
tensorboard_logpath  = "/content/drive/My Drive/AIML-MRC-Capstone/models/tensorboard-logs/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Mount Google Drive at /content/drive; project_path and model_path point below this mount.
# This prompts for an authorization code when run interactively in Colab.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Objective - SVM

> Model 1
  1.   Inputs: a context paragraph p = {p1, ..., pP } of length P
  2.   Output: A question q = {q1, ..., qQ} of length Q 

> Model 2
  1.   Inputs: A question q = {q1, ..., qQ} of length Q 
  2.   Output: An answer

**Model1.predict(new context) --> new question<br>
Model2.predict(new question) --> new answer**





## 1 Common Functions 

#### 1.1 Custom function for preprocessing of context and question

In [0]:
# Cleaning steps provided by the helper functions in this cell:
#   - remove unwanted characters
#   - convert to lowercase
#   - remove unwanted spaces
#   - remove stop words
# English stop-word set consulted by remove_stopwords(); this rebuilds the same
# set that the imports cell already assigns to `stop_words`.
stop_words = set(stopwords.words('english')) 

## reference 
def decontracted(phrase):
    """
    Expand common English contractions and normalise a sentence.

    Steps, in order:
      1. expand contractions ("won't" -> "will not", "it's" -> "it is", ...);
      2. replace the literal two-character sequences backslash-r, backslash-quote
         and backslash-n with a space;
      3. lowercase the text and collapse every run of non-alphanumeric
         characters into a single space.

    Parameters
    ----------
    phrase : str or float
        Sentence to clean. A NaN float (a missing value in a pandas column)
        is treated as "no text".

    Returns
    -------
    str
        The cleaned sentence, or the placeholder 'impossible' when `phrase`
        is NaN. Any other non-string input is printed and returned unchanged.
    """
    # Missing values: `phrase is np.nan` only matches the np.nan singleton, so
    # also catch any other NaN float (NaN is the only value unequal to itself).
    if phrase is np.nan or (isinstance(phrase, float) and phrase != phrase):
        return 'impossible'

    # Non-text input cannot be cleaned; report it and hand it back untouched
    # instead of hiding the failure behind a bare `except:`.
    if not isinstance(phrase, str):
        print(phrase)
        return phrase

    # Order matters: the specific forms must be expanded before the general
    # "n't" rule, and "n't" before the bare "'t" rule.
    contraction_rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in contraction_rules:
        phrase = re.sub(pattern, expansion, phrase)

    # Literal escape sequences left in the raw text (backslash followed by a
    # letter/quote), not real control characters.
    for escaped in ('\\r', '\\"', '\\n'):
        phrase = phrase.replace(escaped, ' ')

    return re.sub('[^A-Za-z0-9]+', ' ', phrase.lower())

def preprocess_text(corpus, text_lower_case=True, 
                      special_char_removal=True, stopword_removal=True, remove_digits=False):    
    """
    Normalise every document of a corpus and return the results as a list.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalise.
    text_lower_case : bool
        Lowercase each document first.
    special_char_removal : bool
        Pad selected punctuation with spaces, then strip special characters
        through remove_special_characters().
    stopword_removal : bool
        Drop stop words through remove_stopwords().
    remove_digits : bool
        Forwarded to remove_special_characters().

    Returns
    -------
    list of str
        One normalised string per input document, in input order.
    """
    # Pads the matched characters with spaces so they become separate tokens.
    # NOTE: inside the character class, "(-)" is parsed as the range "(" to ")",
    # so a literal hyphen is NOT matched by this pattern.
    isolate_pattern = re.compile(r'([{.(-)!}])')

    def _normalize(document):
        # Apply the enabled steps in a fixed order: lowercase, specials, stop words.
        if text_lower_case:
            document = document.lower()
        if special_char_removal:
            document = isolate_pattern.sub(" \\1 ", document)
            document = remove_special_characters(document, remove_digits=remove_digits)
        if stopword_removal:
            document = remove_stopwords(document)
        return document

    return [_normalize(document) for document in corpus]

def remove_special_characters(text, remove_digits=False):
    """
    Strip every character that is not a letter, a digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool
        When True, digits are removed as well.

    Returns
    -------
    str
        `text` with the disallowed characters deleted (not replaced by spaces).
    """
    # The upper-case range must be A-Z: the previous "A-z" also spans the ASCII
    # characters between "Z" and "a" ([ \ ] ^ _ `), so those were never removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def remove_stopwords(text):
    """
    Remove stop words from a text.

    The text is tokenised with nltk's word_tokenize, every token found in the
    module-level `stop_words` set is dropped, and the remaining tokens are
    joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens joined by a single space.
    """
    # Filter once; the earlier version built this list twice and discarded the first.
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

## 2 Load Squad Data

### 2.1 Load 

In [4]:
from sklearn.model_selection import train_test_split

# Read the prepared SQuAD table and discard the index column that was saved
# into the CSV as 'Unnamed: 0'.
squad_df = pd.read_csv(project_path + 'squad_data_final.csv').drop(columns=['Unnamed: 0'])
# Show the last five rows as a quick sanity check.
squad_df.tail()

Unnamed: 0,title,context,question,id,answer_start,answer,plausible_answer_start,plausible_answer,is_impossible,clean_context,clean_question,clean_answer,answer_len,answer_end,answer_span,answer_word_span
130301,Matter,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,5a7e070b70df9f001a875439,-1,,485.0,matter,True,the term matter is used throughout physics in ...,physics has broadly agreed on the definition o...,impossible,0,-1,"(-1, -1)","(-1, -1)"
130302,Matter,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,5a7e070b70df9f001a87543a,-1,,327.0,Alfvén,True,the term matter is used throughout physics in ...,who coined the term partonic matter,impossible,0,-1,"(-1, -1)","(-1, -1)"
130303,Matter,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,5a7e070b70df9f001a87543b,-1,,350.0,Gk. common matter,True,the term matter is used throughout physics in ...,what is another name for anti matter,impossible,0,-1,"(-1, -1)","(-1, -1)"
130304,Matter,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,5a7e070b70df9f001a87543c,-1,,529.0,a specifying modifier,True,the term matter is used throughout physics in ...,matter usually does not need to be used in con...,impossible,0,-1,"(-1, -1)","(-1, -1)"
130305,Matter,"The term ""matter"" is used throughout physics i...",What field of study has a variety of unusual c...,5a7e070b70df9f001a87543d,-1,,37.0,physics,True,the term matter is used throughout physics in ...,what field of study has a variety of unusual c...,impossible,0,-1,"(-1, -1)","(-1, -1)"


### 2.2 Create Train, Validation and Test data

In [0]:
# Work on the first 13000 rows only.
squad_df_cleaned = squad_df.iloc[:13000]

In [0]:
# Model 1 data: context -> question.
Xcontext_train, Xcontext_test, yquestion_train, yquestion_test = train_test_split(
    squad_df_cleaned['clean_context'],
    squad_df_cleaned['clean_question'],
    test_size=0.33,
    random_state=42,
)
# Model 2 data: question -> answer. Same test_size and random_state as above.
Xquestion_train, Xquestion_test, yanswer_train, yanswer_test = train_test_split(
    squad_df_cleaned['clean_question'],
    squad_df_cleaned['clean_answer'],
    test_size=0.33,
    random_state=42,
)

In [7]:
# Print the shape of each part of the context -> question split, one per line.
for split_part in (Xcontext_train, Xcontext_test, yquestion_train, yquestion_test):
    print(split_part.shape)

(8710,)
(4290,)
(8710,)
(4290,)


### 2.3 Build Tokenizer

In [8]:
# Define the TF-IDF vectorizer and the label encoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# One TF-IDF vocabulary (capped at 5000 terms) shared by contexts and questions:
# it is fitted on each row's context and question joined by a space.
xvectorizer = TfidfVectorizer(max_features=5000)
xvectorizer.fit(squad_df_cleaned['clean_context'] + " " + squad_df_cleaned['clean_question'])
ylblencoder = LabelEncoder()
# One label space covering both targets: every question string and every answer
# string becomes a class. pd.concat replaces Series.append, which was removed in
# pandas 2.0; ignore_index=True gives the same fresh 0..n-1 index that
# reset_index(drop=True) produced.
label_corpus = pd.concat(
    [squad_df_cleaned['clean_question'], squad_df_cleaned['clean_answer']],
    ignore_index=True,
).astype(str)
ylblencoder.fit(label_corpus)

LabelEncoder()

In [9]:
# Number of distinct labels known to the label encoder.
print(len(ylblencoder.classes_))
# Size of the TF-IDF vocabulary (the vectorizer was created with max_features=5000).
len(xvectorizer.vocabulary_)

20064


5000

## 3 Vectorization / Encoding

### 3.1 Context, Question, Answer vectorized

In [0]:
# Features: TF-IDF matrices for the training contexts and training questions.
Xcontext_vectorized = xvectorizer.transform(Xcontext_train)
Xquestion_vectorized = xvectorizer.transform(Xquestion_train)

# Targets: encoded labels for the training questions and training answers.
yquestion_vectorized = ylblencoder.transform(yquestion_train)
yanswer_vectorized = ylblencoder.transform(yanswer_train)

In [0]:
# Held-out data for model 1 (context -> question).
Xcontext_test_vectorized = xvectorizer.transform(Xcontext_test)
# The question is the *target* of model 1, so it must go through the label
# encoder exactly like the training target (yquestion_vectorized); the TF-IDF
# vectorizer would produce a sparse feature matrix that cannot be scored
# against the model's predictions.
yquestion_test_vectorized = ylblencoder.transform(yquestion_test)

# Held-out data for model 2 (question -> answer).
Xquestion_test_vectorized = xvectorizer.transform(Xquestion_test)
yanswer_test_vectorized = ylblencoder.transform(yanswer_test)

### 3.2 Check 1 Value

In [16]:
# Sanity check: the first vectorized training context should be a single-row
# sparse matrix produced by the TF-IDF vectorizer.
Xcontext_vectorized[0]

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [24]:
# Sanity check: encoded label of the first training question, as produced by ylblencoder.
yquestion_vectorized[0]

42016

### 3.3 Persist Models, Tokenizers and Encoders

In [0]:
# Save the fitted TF-IDF vectorizer and label encoder under model_path.
for pickle_name, fitted_object in (
    ("svm_xvectorizer.pkl", xvectorizer),
    ("svm_ylblencoder.pkl", ylblencoder),
):
    with open(model_path + pickle_name, "wb") as f:
        pickle.dump(fitted_object, f)

## 4 Model

### 4.1 Building Context - Question Model 

In [18]:
from sklearn import svm

# SVM model 1
# Context -> question classifier: an SVC with scikit-learn's default settings,
# trained on the TF-IDF context matrix with encoded questions as class labels.
try:
  context2question = svm.SVC()    
  context2question.fit(Xcontext_vectorized,yquestion_vectorized)
except RuntimeError as e:
  # NOTE(review): a RuntimeError raised by fit() is only printed; execution then
  # continues with the estimator assigned above, which has not finished fitting.
  print(e)

# Echo the estimator and its hyper-parameters.
print('Model context2question', context2question)

Model context2question SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


### 4.2 Persist Context - Question Model

In [0]:
# Save the context -> question model under model_path.
context2question_file = model_path + "svm_context2question.pkl"
with open(context2question_file, "wb") as model_file:
    pickle.dump(context2question, model_file)

### 4.3 Building Question - Answer Model 

In [20]:
# SVM model 2
# Question -> answer classifier: an SVC with gamma=0.025 and C=10, trained on the
# TF-IDF question matrix with encoded answers as class labels.
try: 
  question2answer = svm.SVC(gamma=0.025, C=10)    
  question2answer.fit(Xquestion_vectorized,yanswer_vectorized)
except RuntimeError as e:
  # NOTE(review): a RuntimeError raised by fit() is only printed; execution then
  # continues with the estimator assigned above, which has not finished fitting.
  print(e)

# Echo the estimator and its hyper-parameters.
print('Model question2answer', question2answer)

Model question2answer SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.025, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


### 4.4 Persist Question - Answer Model

In [0]:
# Save the question -> answer model under model_path.
question2answer_file = model_path + "svm_question2answer.pkl"
with open(question2answer_file, "wb") as model_file:
    pickle.dump(question2answer, model_file)

### 4.4 Load Existing Models

In [0]:
# Reload both persisted models. The files are opened in `with` blocks so the
# handles are closed deterministically instead of being left to the garbage collector.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself, never pickles from an untrusted source.
with open(model_path + "svm_context2question.pkl", 'rb') as model_file:
    context2question = pickle.load(model_file)
with open(model_path + "svm_question2answer.pkl", 'rb') as model_file:
    question2answer = pickle.load(model_file)

### 4.5 Accuracy Metrics

In [0]:
# Accuracy of the context -> question model on the data it was trained on.
train_acc = context2question.score(Xcontext_vectorized,yquestion_vectorized)
# NOTE(review): the disabled line below passes the *training* matrices again, so
# enabling it as written would not measure held-out accuracy.
# test_acc = context2question.score(Xcontext_vectorized,yquestion_vectorized)

print('Train Accuracy', train_acc)
# print('Test Accuracy', test_acc)