In [1]:
import nltk
import numpy as np
import random
import string # to process standard python strings
import os, json
import pandas as pd
import re
from nltk.stem import wordnet # to perform lemmatization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech 
from sklearn.metrics import pairwise_distances # tor perform cosine similarity 
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words 
# Download the NLTK data packages requested by this notebook: the 'punkt'
# tokenizer data, the 'wordnet' corpus, the 'averaged_perceptron_tagger' model
# and the 'stopwords' corpus. The last call's return value is the cell output.
nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nazmul Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Get question and nested answer
# Read with an explicit UTF-8 encoding (the same one the directory loader
# further down uses) so decoding does not depend on the platform's default
# code page; the answers contain non-ASCII punctuation.
with open('data/climatology-1.json', encoding="utf8") as json_file:
    data = json.load(json_file)

def traverseAnswerObjects(answersObjects):
    """Print the text of an answer node and of every answer nested below it.

    Nodes are visited depth-first, parent before children, children in list
    order. Each node must provide a "text" string and an "answers" list of
    child nodes (possibly empty).
    """
    pending = [answersObjects]  # explicit stack instead of recursion
    while pending:
        node = pending.pop()
        print("answer: " + node["text"])
        print("\n")
        # push children reversed so the first child is popped (visited) first
        pending.extend(reversed(node["answers"]))

# Show each top-level question followed by its whole tree of answers.
for questionObject in data:
    # the two appended newlines plus print's own reproduce the blank-line spacing
    print("Question: " + questionObject["question"] + "\n\n")
    traverseAnswerObjects(questionObject)

In [23]:
# Get question and first answer
# Get all json file from a directory
path_to_json = 'data/'
columns = ['question', 'answer']

# Collect the rows in a plain list and build the frame once at the end.
# Writing to questionAndAnswerDf.loc[index] with a per-file enumerate() index
# made every file after the first overwrite the rows of the previous ones.
records = []
# sorted() makes the row order reproducible; os.listdir() order is arbitrary
for pos_json in sorted(os.listdir(path_to_json)):
    if not pos_json.endswith('.json'):
        continue
    with open(os.path.join(path_to_json, pos_json), encoding="utf8") as json_file:
        data = json.load(json_file)
    for questionObject in data:
        # use the first nested answer when there is one, otherwise the
        # entry's own text
        if questionObject["answers"]:
            answer = questionObject["answers"][0]["text"]
        else:
            answer = questionObject["text"]
        records.append([questionObject["question"], answer])

questionAndAnswerDf = pd.DataFrame(records, columns=columns)

In [29]:
# Forward-fill down the rows: a null cell takes the value of the row above it.
questionAndAnswerDf = questionAndAnswerDf.ffill(axis=0)
questionAndAnswerDf.head()

Unnamed: 0,question,answer
0,Climate's changed before,"Greenhouse gasses – mainly CO2, but also metha..."
1,It's the sun,Over the last 35 years the sun has shown a coo...
2,It's not bad,Here’s a list of cause and effect relationship...
3,There is no consensus,Science achieves a consensus when scientists s...
4,It's cooling,"When looking for evidence of global warming, t..."


In [30]:
# number of question/answer rows
len(questionAndAnswerDf)

197

In [39]:
# function that performs text normalization steps
def text_normalization(text):
    """Lower-case, strip, tokenize and lemmatize ``text``.

    Steps: convert to ``str`` and lower-case; replace every character outside
    ``a-z0-9`` with a space; word-tokenize; POS-tag the tokens; lemmatize each
    token as a verb, adjective or adverb when its tag starts with V, J or R,
    and as a noun otherwise.

    Returns the lemmatized tokens joined by single spaces.
    """
    lowered = str(text).lower()
    cleaned = re.sub(r'[^a-z0-9]', ' ', lowered)
    words = nltk.word_tokenize(cleaned)
    lemmatizer = wordnet.WordNetLemmatizer()
    tagged_words = pos_tag(words, tagset = None)

    # first letter of the tag -> WordNet part of speech; anything else is a noun
    wordnet_pos_by_prefix = {'V': 'v', 'J': 'a', 'R': 'r'}

    return " ".join(
        lemmatizer.lemmatize(word, wordnet_pos_by_prefix.get(tag[:1], 'n'))
        for word, tag in tagged_words
    )
# quick check of the normalizer on a short phrase
text_normalization("going to play football")

'go to play football'

In [40]:
# Normalize every question and store the result in a "lemmatized_text" column.
questionAndAnswerDf["lemmatized_text"] = questionAndAnswerDf["question"].map(text_normalization)
questionAndAnswerDf.head()

Unnamed: 0,question,answer,lemmatized_text
0,Climate's changed before,"Greenhouse gasses – mainly CO2, but also metha...",climate s change before
1,It's the sun,Over the last 35 years the sun has shown a coo...,it s the sun
2,It's not bad,Here’s a list of cause and effect relationship...,it s not bad
3,There is no consensus,Science achieves a consensus when scientists s...,there be no consensus
4,It's cooling,"When looking for evidence of global warming, t...",it s cool


In [41]:
# bag of words
cv = CountVectorizer() # initializing the count vectorizer
X = cv.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray()

# returns all the unique words from the data
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    features = list(cv.get_feature_names_out())
else:
    features = cv.get_feature_names()
questionAndAnswerDf_bou = pd.DataFrame(X, columns = features)
questionAndAnswerDf_bou.head()

Unnamed: 0,000,10,1500,16,1860,1880,1910,1934,1940,1960,...,weather,west,when,will,win,winter,with,would,wrong,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# all the stop words we have
stop = stopwords.words("english")
stop_lookup = set(stop) # set gives constant-time membership tests

# an example to remove stop words 
question = "Should total ground heat flux be"

# checking for stop words
# The stop-word list is lower-case, so compare the lower-cased token;
# otherwise capitalized words such as "Should" are never filtered out.
a = question.split()
Q = [i for i in a if i.lower() not in stop_lookup]
# join once, after filtering, so b is defined even when the question is empty
b = " ".join(Q)

In [36]:
questionLemma = text_normalization(b) # normalize the stop-word-filtered example question
questionBow = cv.transform([questionLemma]).toarray() # encode it with the fitted bag-of-words vectorizer

In [37]:
# cosine similarity (1 - cosine distance) between every bag-of-words row
# and the example question considered above
cosineValue = 1 - pairwise_distances(questionAndAnswerDf_bou, questionBow, metric = "cosine")
(cosineValue)

array([[0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.4472136 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [38]:
# the text at the best-matching row becomes the response for the question;
# take the row position from the similarity scores instead of a hard-coded
# index, and use .iloc because argmax() yields a position, not a label
questionAndAnswerDf["answer"].iloc[cosineValue.argmax()]

'Previous major global climate changes were glacial cycles that happened long before human civilization developed. The human species evolved during the last 2.5 million years. Our far distant ancestors survived through multiple gradual cycles of cold ice ages, but did not experience any previous "hot ages."  We homo sapiens in our current form appeared only about 200,000 years ago. So our species has survived two ice ages. In each ice age global temperatures were colder by 4 °C. The warmest period ever experienced by early humans was about 1 °C warmer (global average) than today. That period occured between the two most recent ice ages, 120,000 years ago (Eemian). Over the next 100,000 years temperatures gradually decreased into a new ice age. During that colder period humans began to expand out of Africa and across the globe. Ever since the Eemian much cooler temperatures have been the norm. Image by John Garrett. Human civilization is roughly 12,000 years old, as defined by the start

In [None]:
# using tf-idf
tfidf = TfidfVectorizer() # initializing tf-idf
x_tfidf = tfidf.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray() # fitting on the lemmatized questions and transforming them into an array

In [None]:
# returns all the unique words from the data with a score for each word
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_features = list(tfidf.get_feature_names_out())
else:
    tfidf_features = tfidf.get_feature_names()
questionAndAnswerDf_tfidf = pd.DataFrame(x_tfidf, columns = tfidf_features)
questionAndAnswerDf_tfidf.head()

In [None]:
question_tfidf = tfidf.transform([questionLemma]).toarray() # applying tf-idf to the normalized example question
cos = 1 - pairwise_distances(questionAndAnswerDf_tfidf, question_tfidf, metric = "cosine") # cosine similarity = 1 - cosine distance
cos

In [None]:
index_value1 = cos.argmax() # returns the index number of the highest similarity value
index_value1

In [None]:
# answer of the row with the highest tf-idf similarity; use the index computed
# in the previous cell rather than a hard-coded one (.iloc: it is a position)
questionAndAnswerDf["answer"].iloc[index_value1]

In [None]:
# create a function that returns response to query
def generate_response(text):
    """Return the stored answer whose question is most similar to ``text``.

    The query is normalized with ``text_normalization``, vectorized with the
    already-fitted ``tfidf`` vectorizer, and compared with every row of
    ``questionAndAnswerDf_tfidf`` by cosine similarity (1 - cosine distance).

    Parameters
    ----------
    text : the user's query; it is passed through ``text_normalization``.

    Returns
    -------
    The "answer" value of the row with the highest similarity. When several
    rows tie for the maximum, ``argmax`` picks the first of them.
    """
    lemma = text_normalization(text) # text normalization
    query_vector = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1 - pairwise_distances(questionAndAnswerDf_tfidf, query_vector, metric = "cosine") # apply cosine similarity
    index_value = cos.argmax() # row position of the best match
    # argmax() yields a position, so look it up positionally with .iloc;
    # label-based .loc is only equivalent while the index is 0..n-1
    return questionAndAnswerDf["answer"].iloc[index_value]

In [None]:
# example query against the tf-idf based responder
generate_response("The sun getting hotter")