In [1]:
import nltk
import numpy as np
import random
import string # to process standard python strings
import os, json
import pandas as pd
import re
from nltk.stem import wordnet # to perform lemmatization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech 
from sklearn.metrics import pairwise_distances # tor perform cosine similarity 
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words 
nltk.download('punkt') 
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nazmul Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\Nazmul
[nltk_data]     Hossain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Get question and nested answer 
data = []
with open('data/climatology-1.json') as json_file:
    data = json.load(json_file)

def traverseAnswerObjects(answersObjects):
    print("answer: " + answersObjects["text"])
    print("\n")
    if(len(answersObjects["answers"])):
        for answerObject in answersObjects["answers"]:
            traverseAnswerObjects(answerObject)

for questionObject in data:
    print("Question: "+ questionObject["question"])
    print("\n")
    traverseAnswerObjects(questionObject)

In [2]:
# Get question and first answer 
# Get all json file from a directory
path_to_json = 'data/'
columns = ['question', 'answer']
questionAndAnswerDf = pd.DataFrame(columns=columns)
indexCount = 0
for pos_json in os.listdir(path_to_json):
    if pos_json.endswith('.json'):
        with open(path_to_json + pos_json, encoding="utf8") as json_file:
            data = json.load(json_file)
            for index, questionObject in enumerate(data):
                answer = []
                if(len(questionObject["answers"])):
                    answer = questionObject["answers"][0]["text"]
                else :
                    answer = questionObject["text"]
                questionAndAnswerDf.loc[indexCount] = [questionObject["question"], answer]
                indexCount += 1

In [3]:
# fills the null value with previous value
questionAndAnswerDf.ffill(axis = 0, inplace = True)
questionAndAnswerDf.head()

Unnamed: 0,question,answer
0,Are Ice Ages Affected by the Freezing Out of C...,The freezing point of carbon dioxide is -78.5C...
1,Increase Evaporation from oceans,"Yes, you can increase evaporation by spraying ..."
2,How to calculate maximum and minimum mean mont...,"From my experience, first of all, you should c..."
3,Why is wind shear consistently high in the Sou...,I have read that perhaps the largest reason th...
4,Can we see the circulation cells in wind maps?,Earth famously possesses three distinct circul...


In [4]:
questionAndAnswerDf.shape[0]

255

In [5]:
# function that performs text normalization steps 
def text_normalization(text):
    text = str(text).lower() # text to lower case
    spl_char_text = re.sub(r'[^a-z0-9]', ' ', text) # removing special characters
    tokens = nltk.word_tokenize(spl_char_text) # word tokenizing
    lema = wordnet.WordNetLemmatizer() # initializing lemmatization
    tags_list = pos_tag(tokens, tagset = None) # parts of speech
    lema_words = [] # empty list
    for token,pos_token in tags_list:
        if pos_token.startswith('V'): # verb
            pos_val = 'v'
        elif pos_token.startswith('J'): # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'): # adverb
            pos_val = 'r'
        else: 
            pos_val = 'n' # noun
        lema_token = lema.lemmatize(token, pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list 
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence
text_normalization("going to play football")

'go to play football'

In [6]:
# applying the function to the dataset to get clean text
questionAndAnswerDf["lemmatized_text"] = questionAndAnswerDf["question"].apply(text_normalization) 
questionAndAnswerDf.head()

Unnamed: 0,question,answer,lemmatized_text
0,Are Ice Ages Affected by the Freezing Out of C...,The freezing point of carbon dioxide is -78.5C...,be ice age affect by the freeze out of co2 in ...
1,Increase Evaporation from oceans,"Yes, you can increase evaporation by spraying ...",increase evaporation from ocean
2,How to calculate maximum and minimum mean mont...,"From my experience, first of all, you should c...",how to calculate maximum and minimum mean mont...
3,Why is wind shear consistently high in the Sou...,I have read that perhaps the largest reason th...,why be wind shear consistently high in the sou...
4,Can we see the circulation cells in wind maps?,Earth famously possesses three distinct circul...,can we see the circulation cell in wind map


In [7]:
# bow of words
cv = CountVectorizer() # initializing the count vectorizer
X = cv.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray()

# returns all the unique word from data
features = cv.get_feature_names()
questionAndAnswerDf_bou = pd.DataFrame(X, columns = features)
questionAndAnswerDf_bou.head()

Unnamed: 0,000,10,1500,16,1860,1880,1910,1934,1940,1960,...,win,wind,winter,wishful,with,work,world,would,wrong,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [8]:
# using tf-idf
tfidf = TfidfVectorizer() # intializing tf-id
x_tfidf = tfidf.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray() # transforming the data into array

# returns all the unique word from data with a scroe of that word 
questionAndAnswerDf_tfidf = pd.DataFrame(x_tfidf, columns = tfidf.get_feature_names())
questionAndAnswerDf_tfidf.head()

Unnamed: 0,000,10,1500,16,1860,1880,1910,1934,1940,1960,...,win,wind,winter,wishful,with,work,world,would,wrong,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.343498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.351979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
greets = ("hi", "hello", "good evening", "good afternoon", "hi there",
          "good morning", "morning", "evening", "hey", "hey there")

identity_qs = ("what are you", "who are you")

thanks = ("thanks", "thank you", "thank you very much", "thank you so much")

farewells = ("bye", "goodbye", "see ya", "see you", "cheers")

In [10]:
# create a function that returns response to query
def generate_response(text):
    lemma = text_normalization(text) # text normalization
    tf = tfidf.transform([lemma]).toarray() # apply tf-idf
    cos = 1 - pairwise_distances(questionAndAnswerDf_tfidf, tf, metric = "cosine") # apply cosine similarity
    index_value = cos.argmax() # getting index value
    return questionAndAnswerDf["answer"].loc[index_value][:300]

In [11]:
generate_response("hi")

'The freezing point of carbon dioxide is -78.5C. The temperature at which carbon dioxide sublimates is not a fixed value. It instead is a function of the partial pressure of carbon dioxide. That value of -78.5deg C is the temperature at which CO2 sublimates given a partial CO2 pressure of one atmosph'

In [None]:
# chat bot
keep_dialogue_alive = True
print("Hello, I am Captain Planet. Please ask me any question regarding climate change. 💪🌍")
while(keep_dialogue_alive):
    human_txt = input().lower()
    if human_txt not in farewells and human_txt not in thanks:
        if human_txt in greets:
            print("Captain Planet: " + random.choice(greets))
        elif human_txt in identity_qs:
            print("Captain Planet: "
                  + "I am a chatbot developed in a data science project "
                  + "at the University of Bremen. I am here to answer your questions about climate change.")
        else:
            print("Captain Planet: " + generate_response(human_txt))
    else:
        keep_dialogue_alive = False
        if human_txt in thanks:
            print("Captain Planet: You're welcome!")
        else:
            print("Captain Planet: Goodbye and thanks for your interest in climate change!")

In [30]:
# main function for server
def final_response(text):
    human_txt = text.decode('utf-8')
    if human_txt not in farewells and human_txt not in thanks:
        if human_txt in greets:
            return "Captain Planet: " + random.choice(greets)
        elif human_txt in identity_qs:
            return "Captain Planet: I am a chatbot developed in a data science project at the University of Bremen. I am here to answer your questions about climate change."
        else:
            return "Captain Planet: " + generate_response(human_txt)
    else:
        if human_txt in thanks:
            return "Captain Planet: You're welcome!"
        else:
            return "Captain Planet: Goodbye and thanks for your interest in climate change!"

In [31]:
# Starting the server block

In [None]:
import http.server
import socketserver

PORT = 8080
DIRECTORY = 'public'

class Handler(http.server.SimpleHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, directory=DIRECTORY, **kwargs)

    def do_POST(self):
        self.send_response(200)
        content_length = int(self.headers['Content-Length'])
        post_body = self.rfile.read(content_length)
        self.end_headers()
        chatbot_reply = final_response(post_body)
        self.wfile.write(str.encode(chatbot_reply))

with socketserver.TCPServer(('', PORT), Handler) as httpd:
    print('serving at port', PORT)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    httpd.server_close()

serving at port 8080


127.0.0.1 - - [14/Jan/2020 20:36:43] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:36:43] "GET /app.js HTTP/1.1" 304 -
127.0.0.1 - - [14/Jan/2020 20:36:47] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:14] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:14] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:15] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 20:38:15] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 20:38:18] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:57] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:57] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:38:58] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 20:38:58] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 20:39:01] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/J

127.0.0.1 - - [14/Jan/2020 20:58:48] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:58:48] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:58:48] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:58:49] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 20:58:49] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 20:58:53] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:59:07] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:59:07] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:59:07] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 20:59:08] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 20:59:08] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 20:59:12] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:00:47] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:00:47] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:00:47] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 -

127.0.0.1 - - [14/Jan/2020 21:21:26] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:21:26] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 21:21:26] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 21:21:30] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:21:35] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:21:38] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:16] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:16] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:16] "GET /app.js HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:17] code 404, message File not found
127.0.0.1 - - [14/Jan/2020 21:22:17] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jan/2020 21:22:23] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:31] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:58] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2020 21:22:58] "GET /style.css HTTP/1.1" 200 -
127.0.0.1 - - [14/Jan/2