# Reading QuestionSet to build the model

In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from random import sample
import string

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation, Dropout,  Conv1D, MaxPooling2D, MaxPooling1D
from keras.optimizers import SGD

import pickle

In [177]:
#Read in the Chat Corpus
chat = pd.read_csv('./data/ChatCorpus.csv')

In [178]:
#Read in the Answer Corpus
ans_file = pd.read_csv('./data/IntentAnswers.csv')

In [179]:
chat.head(2)

Unnamed: 0,family,question_main,intent,QuestionWords,q_perm
0,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information need
1,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information card


In [180]:
chat[['q_perm', 'intent']].sample(n=5)

Unnamed: 0,q_perm,intent
21137,answer increase line rent question,clrent
35754,redeem travel than miles for,rewards9
1237,credit update card find application,CardAppStatus
33545,miss don’t earn rewards pay,rewards2
21655,credit line question increase limit,clspend


In [181]:
ans_file.head()

Unnamed: 0,intent,answer
0,Cardapply,"You’ll need to provide personal information, i..."
1,Authuser,Yes. You can add authorized users online after...
2,CardAppStatus,"If you apply by phone or online, you will ofte..."
3,NewCCReceive,"If you’re approved, you’ll receive your Banco ..."
4,ApplyPayments,We generally apply payments up to your minimum...


In [182]:
#set Intent as key for answers
ans = ans_file.set_index('intent')
ans.head()

Unnamed: 0_level_0,answer
intent,Unnamed: 1_level_1
Cardapply,"You’ll need to provide personal information, i..."
Authuser,Yes. You can add authorized users online after...
CardAppStatus,"If you apply by phone or online, you will ofte..."
NewCCReceive,"If you’re approved, you’ll receive your Banco ..."
ApplyPayments,We generally apply payments up to your minimum...


In [183]:
ans.loc['Cardapply'][0]

'You’ll need to provide personal information, including your:Full name, Social Security number, Date of birth, Physical address, (No P.O. Boxes) Estimated gross annual income, Checking and/or savings account information'

In [184]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', string.digits)
chat['processed_q'] =chat['q_perm'].apply(lambda x: x.translate(remove_digits))

# Remove all the special characters
exclude = set(string.punctuation) # Set of all special characters

chat['processed_q'] = chat['processed_q'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))


# removing dashes 
chat['processed_q'] = [row.replace('-', '') for row in chat['q_perm']]
#chat['processed_q'] = chat['q_perm']

In [185]:
# tokenizing all rows
tokenizer = RegexpTokenizer('\w+|\$[\d.]+|S+')
chat['processed_q'] = [tokenizer.tokenize(row.lower()) for row in chat['processed_q']] 

In [186]:
# rejoining list of words in each row
chat['processed_q'] = [' '.join(row) for row in chat['processed_q']]

In [187]:
chat.head(5)

Unnamed: 0,family,question_main,intent,QuestionWords,q_perm,processed_q
0,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information need,what how apply information need
1,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information card,what how apply information card
2,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply need information,what how apply need information
3,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply need card,what how apply need card
4,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply card information,what how apply card information


### Modeling processed_text

In [188]:
#Tokenizing, lemmatizing and joining the words again to feed into the pipeline for modelling

def tok_lem_remstop_join(dataset, var):
    lemmatizer = WordNetLemmatizer()
    dataset_copy = dataset.copy()
    new_text = []
    for i in dataset_copy[var]:
        tokens = word_tokenize(i.lower())
        tokens_lem = [lemmatizer.lemmatize(token) for token in tokens]
        tokens_filtered= [word for word in tokens_lem if not word in stopwords.words('english')]
        #tokens_filtered = tokens_lem
        new_review = " ".join(tokens_filtered)
        new_text.append(new_review)
    dataset_copy[var] = new_text
    return dataset_copy


In [189]:
#Creating a new column without stopwords
chat = tok_lem_remstop_join(chat, 'processed_q')

In [190]:
chat.head(2)

Unnamed: 0,family,question_main,intent,QuestionWords,q_perm,processed_q
0,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information need,apply information need
1,Applying and account information,What information does Banco Uno® require when ...,Cardapply,what how apply information need card,what how apply information card,apply information card


In [191]:
#Creating multiple rows so that X/y splits can happen and model can train
#chat_dup = pd.concat([chat]*10, ignore_index=True)

In [192]:
X = chat['processed_q']
y = chat['intent']

In [193]:
X.shape

(36615,)

In [194]:
### BELOW ROWS ARE TO GET DIFF PROCESSED QUEUES

## MODELLING STARTS HERE

In [195]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [196]:
# vectorizing tags w/default params
tvec = TfidfVectorizer()
X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [197]:
X_train_tvec

<27461x125 sparse matrix of type '<class 'numpy.float64'>'
	with 112652 stored elements in Compressed Sparse Row format>

In [198]:
# vectorizing w/different tuning to compare models to default
tv = TfidfVectorizer(ngram_range=(1,2), max_df=2.0, min_df=5)
X_train_tv = tv.fit_transform(X_train)
X_test_tv = tv.transform(X_test)

#### Logistic Regression

In [199]:
# logistic regression w/tuned vectorizer
lr = LogisticRegression(max_iter = 1000, solver= 'liblinear', C=100)
lr.fit(X_train_tv, y_train)

LogisticRegression(C=100, max_iter=1000, solver='liblinear')

In [200]:
# scoring logistic regression
print(f'Train score: {lr.score(X_train_tv, y_train)}')
print(f'Test score: {lr.score(X_test_tv, y_test)}')

Train score: 0.9968682859327774
Test score: 0.9954118418177845


In [137]:
X_train_tv.shape[1]

875

## Preparing Model for Streamlit

In [159]:
## Preparing Model for Streamlit
# creating pipeline for model
nlp_pipe = Pipeline([('tvec', TfidfVectorizer(ngram_range=(1,2), max_df=2.0, min_df=5)), 
                ('logreg', LogisticRegression(max_iter=1000, random_state=42, solver='liblinear', C=100))])
# fitting pipeline
nlp_pipe.fit(X_train, y_train)


Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_df=2.0, min_df=5, ngram_range=(1, 2))),
                ('logreg',
                 LogisticRegression(C=100, max_iter=1000, random_state=42,
                                    solver='liblinear'))])

In [160]:
# scoring pipeline to make sure scores still accurate
nlp_pipe.score(X_train, y_train), nlp_pipe.score(X_test, y_test)




(0.9968682859327774, 0.9954118418177845)

In [161]:
# testing model on function output and it's performing well
input_text='How do i apply for a card?'
nlp_pipe.predict([input_text])

array(['Cardapply'], dtype=object)

In [163]:
# testing model on function output and it's performing well
input_text='hello?'
nlp_pipe.predict([input_text])

array(['greeting'], dtype=object)

In [164]:
# loading model to pickle for usage in streamlit
pickle.dump(nlp_pipe, open('./models/cs_model.p', 'wb'))

In [165]:
## DOING SOME PREDICTIONS with raw model
# function to preprocess user inputs for nlp model
def preprocess_nlp(question):
    input_list = []
    processed_question = question.replace('-', '')
    tokenizer = RegexpTokenizer('\w+|\$[\d.]+|S+')
    token = tokenizer.tokenize(processed_question.lower())
    lemmatizer = WordNetLemmatizer()
    lem_token = [lemmatizer.lemmatize(word) for word in token]
    joined_text = ' '.join(lem_token)
    input_list.append(joined_text)
    return input_list

In [168]:
# processing inputs for nlp model
question_text='how do i apply for a card'
input_text = preprocess_nlp(question_text)
answer_nlp = nlp_pipe.predict(input_text)
ans.loc[answer_nlp[0]][0]

'You’ll need to provide personal information, including your:Full name, Social Security number, Date of birth, Physical address, (No P.O. Boxes) Estimated gross annual income, Checking and/or savings account information'

In [169]:
# Doing predictions with pipe
question_text='hello'
input_text = preprocess_nlp(question_text)
ip_series = pd.Series(input_text)
print(ip_series)
answer_nlp = nlp_pipe.predict(input_text)
ans.loc[answer_nlp[0]][0]
#ans

0    hello
dtype: object


'Hello from Banco Uno.  How can I help you?'

## Convolution Neural Network model

In [201]:
X_train_tv = np.array(X_train_tv)
X_test_tv = np.array(X_test_tv)
# create train and test lists. X - patterns, Y - intents
#X_train = list(X_train_arr)
#y_train = list(y_train)
#print("Training data created")

In [204]:
X_train_tv

array(<27461x875 sparse matrix of type '<class 'numpy.float64'>'
	with 198187 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [206]:
model = Sequential()

#Perform Convolution
model.add(Conv1D(64, 2, activation = 'relu', input_shape=(27461,1)))
#Perform maxpooling
model.add(MaxPooling1D())

#Perform 2nd COnv and pool
model.add(Conv1D(64, 2, activation='relu'))
model.add(MaxPooling1D())

#Conv connected to dense layer that is fully connected
model.add(Flatten())
model.add(Dense(64, activation='relu'))

#Output layer
model.add(Dense(1, activation='softmax'))#sigmoid since binary classification

In [207]:

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])


In [None]:
#fitting and saving the model 
hist = model.fit((X_train_tv), (y_train), 
                 validation_data=(X_test_tv, y_test),
                 epochs=20, batch_size=20, verbose=1)

In [68]:
type(X_train_tv), type(y_train)

(scipy.sparse.csr.csr_matrix, numpy.ndarray)

In [None]:
from keras import layers
embedding_dim = 100

#Source: https://medium.com/saarthi-ai/sentence-classification-using-convolutional-neural-networks-ddad72c7048c#:~:text=Just%20like%20sentence%20classification%20%2C%20CNN,Textual%20Summarization%2C%20Answer%20Selection%20etc.&text=Just%20like%20sentence%20classification%20%2C%20CNN,Textual%20Summarization%2C%20Answer%20Selection%20etc.
#X_train = tokenizer.texts_to_sequences(sentences_train)
#vocab_size = len(tokenizer.word_index) + 1                          
vocab_size=10000
maxlen=100
y_train = np.array(y_train)

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='softmax'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train_tv, y_train,
                    epochs=10,
                    validation_data=(X_test, y_test),
                    batch_size=10)

In [129]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(m, open(filename, 'wb'))

In [130]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

This was the best logistic regression score with all variations of hypertuning including vectorizer. The score only improved by 1 point over the null model.

In [131]:
## DOING SOME PREDICTIONS

In [23]:
# function to preprocess user inputs for nlp model
def preprocess_nlp(question):
    input_list = []
    processed_question = question.replace('-', '')
    tokenizer = RegexpTokenizer('\w+|\$[\d.]+|S+')
    token = tokenizer.tokenize(processed_question.lower())
    lemmatizer = WordNetLemmatizer()
    lem_token = [lemmatizer.lemmatize(word) for word in token]
    joined_text = ' '.join(lem_token)
    input_list.append(joined_text)
    return input_list

In [75]:
# processing inputs for nlp model
question_text='hello, how do i  apply for a card'
input_text = preprocess_nlp(question_text)
ip_series = pd.Series(input_text)
print(ip_series)
ip_tvec = tv.transform(ip_series)
print(ip_tvec.shape)
predicted_status_nlp = lr.predict(ip_tvec)
ans.loc[predicted_status_nlp[0]][0]

#ans.loc['Cardapply'][0]

0    hello how do i apply for a card
dtype: object
(1, 902)


'You’ll need to provide personal information, including your:Full name, Social Security number, Date of birth, Physical address, (No P.O. Boxes) Estimated gross annual income, Checking and/or savings account information'