In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [3]:
# Read the question/answer pairs from the CSV file and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was saved into the file — confirm whether it should be kept.
dataset_path = 'QnA_Dataset.csv'
QnA = pd.read_csv(dataset_path)
QnA.head()

Unnamed: 0,Question,Answer
0,What is a loan?,A loan is a sum of money borrowed from a lende...
1,What types of loans are available?,"Common types include personal loans, auto loan..."
2,How do I apply for a loan?,"You can apply for a loan online, at a bank, or..."
3,What is the difference between secured and uns...,"Secured loans require collateral, while unsecu..."
4,What is collateral?,Collateral is an asset pledged by a borrower t...


In [7]:
def convert_to_lowercase(QnA):
    """Return a lowercased copy of the given text.

    Args:
        QnA: The text to normalise; it must provide a ``lower()`` method
            (for example a ``str``).

    Returns:
        The result of calling ``QnA.lower()``.
    """
    return QnA.lower()


def remove_punctuation(QnA):
    """Delete every character listed in ``string.punctuation`` from the text.

    Args:
        QnA: The string to clean.

    Returns:
        A copy of the string with all ASCII punctuation characters removed;
        every other character (including whitespace) is left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return QnA.translate(deletion_table)

_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Run the download/verification a single time instead of once per row
        # when this module's functions are used through Series.apply.
        nltk.download('stopwords', quiet=True)
        # A frozenset gives constant-time membership tests, unlike the list
        # returned by stopwords.words().
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(QnA):
    """Remove English stop words from a string.

    The text is split on whitespace and every token that exactly matches an
    entry of NLTK's English stop-word list is dropped. Matching is
    case-sensitive and is done on the raw token, so a token that still carries
    punctuation (e.g. ``"is,"``) is not treated as a stop word.

    Args:
        QnA: The string to filter.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing remains).
    """
    stoplist = _english_stopwords()
    return ' '.join(word for word in QnA.split() if word not in stoplist)

In [8]:
# Cleaning steps, applied in this order to both text columns:
#   1. convert_to_lowercase
#   2. remove_stopwords
#   3. remove_punctuation
# NOTE(review): remove_stopwords compares whitespace-separated tokens, and
# punctuation is still attached when it runs (e.g. "loan?"), so a stop word
# directly followed by punctuation would survive step 2 — confirm this order
# is intended.
text_columns = ['Question', 'Answer']
cleaning_steps = [convert_to_lowercase, remove_stopwords, remove_punctuation]
last_step = cleaning_steps[-1]

for clean in cleaning_steps:
    for column in text_columns:
        QnA[column] = QnA[column].apply(clean)
    # Preview the result of each step; a divider separates consecutive previews.
    print(QnA[text_columns].head())
    if clean is not last_step:
        print("------------------------------------------------------------------")

                                            Question  \
0                                    what is a loan?   
1                 what types of loans are available?   
2                         how do i apply for a loan?   
3  what is the difference between secured and uns...   
4                                what is collateral?   

                                              Answer  
0  a loan is a sum of money borrowed from a lende...  
1  common types include personal loans, auto loan...  
2  you can apply for a loan online, at a bank, or...  
3  secured loans require collateral, while unsecu...  
4  collateral is an asset pledged by a borrower t...  
------------------------------------------------------------------
                              Question  \
0                                loan?   
1               types loans available?   
2                          apply loan?   
3  difference secured unsecured loans?   
4                          collateral?   

             

In [11]:
_LEMMATIZER = None  # shared WordNetLemmatizer, created on first use


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, ensuring the WordNet corpus is installed."""
    global _LEMMATIZER
    if _LEMMATIZER is None:
        try:
            nltk.data.find('corpora/wordnet')
        except LookupError:
            # The corpus is missing on a fresh environment; fetch it once so
            # lemmatize() does not fail with LookupError on the first row.
            nltk.download('wordnet', quiet=True)
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER


def lemmatization(QnA):
    """Lemmatise every whitespace-separated token of a string.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` without a
    part-of-speech argument, so NLTK's default part of speech is used
    (noun, according to the NLTK documentation).

    Args:
        QnA: The string to lemmatise.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    lemmatizer = _get_lemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in QnA.split())

In [12]:
# Lemmatise both text columns in place, then preview the result.
text_columns = ['Question', 'Answer']
for column in text_columns:
    QnA[column] = QnA[column].apply(lemmatization)
print(QnA[text_columns].head())

                            Question  \
0                               loan   
1                type loan available   
2                         apply loan   
3  difference secured unsecured loan   
4                         collateral   

                                              Answer  
0  loan sum money borrowed lender expected paid b...  
1  common type include personal loan auto loan mo...  
2  apply loan online bank financial institution p...  
3  secured loan require collateral unsecured loan...  
4  collateral asset pledged borrower secure loan ...  


In [14]:
def tokenization(QnA):
    """Split a string into tokens with NLTK's ``word_tokenize``.

    Args:
        QnA: The text to tokenise.

    Returns:
        Whatever ``word_tokenize`` returns for the text (a list of token
        strings).
    """
    tokens = word_tokenize(QnA)
    return tokens

# Tokenise a copy so that QnA keeps the cleaned text as plain strings.
Tokenised_QnA = QnA.copy()

text_columns = ['Question', 'Answer']
for column in text_columns:
    Tokenised_QnA[column] = Tokenised_QnA[column].apply(tokenization)

print(Tokenised_QnA[text_columns].head())


                                 Question  \
0                                  [loan]   
1                 [type, loan, available]   
2                           [apply, loan]   
3  [difference, secured, unsecured, loan]   
4                            [collateral]   

                                              Answer  
0  [loan, sum, money, borrowed, lender, expected,...  
1  [common, type, include, personal, loan, auto, ...  
2  [apply, loan, online, bank, financial, institu...  
3  [secured, loan, require, collateral, unsecured...  
4  [collateral, asset, pledged, borrower, secure,...  


In [15]:
# Write the tokenised dataframe to disk without the row index.
# NOTE(review): 'Question' and 'Answer' hold Python lists at this point; CSV
# stores them as text, so a reader will presumably need to parse them back
# into lists — confirm that consumers expect this.
output_path = 'Tokenised_QnA.csv'
Tokenised_QnA.to_csv(output_path, index=False)