In [None]:
import nltk
import numpy as np
import random
import string # to process standard python strings
import os, json
import pandas as pd
import re
from nltk.stem import wordnet # to perform lemmatization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech 
from sklearn.metrics import pairwise_distances # tor perform cosine similarity 
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words 
# Fetch the NLTK data packages by name before any text processing runs.
# NOTE(review): the mapping of package -> consumer below is inferred from the
# imports above, not enforced by the code — confirm if a resource is dropped.
nltk.download('punkt')  # presumably needed by word_tokenize
nltk.download('wordnet')  # presumably needed by the WordNet lemmatizer
nltk.download('averaged_perceptron_tagger')  # presumably needed by pos_tag
nltk.download('stopwords')  # presumably backs nltk.corpus.stopwords

In [None]:
# Get question and nested answer
# Load a single sample file. The cells below read the keys "question", "text"
# and "answers" from each entry.
# The encoding is given explicitly (matching the directory loader further down)
# so the result does not depend on the platform's default codec.
with open('data/climatology-1.json', encoding="utf8") as json_file:
    data = json.load(json_file)

def traverseAnswerObjects(answersObjects):
    """Print a node's text, then recurse depth-first into its nested answers.

    answersObjects is a mapping holding a "text" string and an "answers"
    sequence whose items have the same layout. Nothing is returned; each node
    is written to stdout as ``answer: <text>`` followed by two blank lines.
    """
    print("answer: " + answersObjects["text"], end="\n\n\n")
    children = answersObjects["answers"]
    if not len(children):
        return  # leaf node: nothing further to print
    for child in children:
        traverseAnswerObjects(child)

# Dump every loaded question, each followed by its complete answer tree.
for entry in data:
    question_line = "Question: " + entry["question"]
    print(question_line, end="\n\n\n")
    traverseAnswerObjects(entry)

In [None]:
# Get question and first answer
# Collect one (question, answer) row per entry of every JSON file in a directory.
path_to_json = 'data/'
columns = ['question', 'answer']

records = []
# os.listdir returns names in arbitrary order; sorting keeps the row order (and
# with it the tie-breaking of later positional look-ups) reproducible.
for pos_json in sorted(os.listdir(path_to_json)):
    if not pos_json.endswith('.json'):
        continue
    with open(os.path.join(path_to_json, pos_json), encoding="utf8") as json_file:
        questionObjects = json.load(json_file)
    for questionObject in questionObjects:
        replies = questionObject["answers"]
        # Prefer the first reply; fall back to the entry's own text when it has none.
        answer = replies[0]["text"] if len(replies) else questionObject["text"]
        records.append([questionObject["question"], answer])

# Build the frame in one go instead of enlarging it row by row through .loc,
# which gets slow as the number of rows grows.
questionAndAnswerDf = pd.DataFrame(records, columns=columns)
indexCount = len(questionAndAnswerDf)  # row count, kept under its original name

In [None]:
# Forward-fill along the rows: a null cell takes the value from the row above it.
questionAndAnswerDf = questionAndAnswerDf.ffill(axis=0)
questionAndAnswerDf.head()

In [None]:
# Number of question/answer rows collected.
len(questionAndAnswerDf.index)

In [None]:
# function that performs text normalization steps
def text_normalization(text):
    """Normalize ``text`` into a space-separated string of lemmatized tokens.

    The input is converted to a lower-case string, every character outside
    ``a-z0-9`` is replaced by a space, and the result is tokenized, POS-tagged
    and lemmatized token by token.
    """
    lowered = str(text).lower()
    cleaned = re.sub(r'[^a-z0-9]', ' ', lowered)
    tagged = pos_tag(nltk.word_tokenize(cleaned), tagset = None)
    lemmatizer = wordnet.WordNetLemmatizer()
    # First letter of the POS tag -> lemmatizer POS code (verb, adjective,
    # adverb); every other tag is lemmatized as a noun.
    pos_by_initial = {'V': 'v', 'J': 'a', 'R': 'r'}
    return " ".join(
        lemmatizer.lemmatize(word, pos_by_initial.get(tag[:1], 'n'))
        for word, tag in tagged
    )
# Quick sanity check: run the normalizer on a short sample phrase and display the result.
text_normalization("going to play football")

In [None]:
# Store a normalized copy of every question next to the original;
# the vectorizers further down are fitted on this column.
normalized_questions = questionAndAnswerDf["question"].apply(text_normalization)
questionAndAnswerDf["lemmatized_text"] = normalized_questions
questionAndAnswerDf.head()

In [None]:
# bag of words
cv = CountVectorizer() # initializing the count vectorizer
X = cv.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray()

# returns all the unique words from the data
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement and fall back only on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    features = cv.get_feature_names_out()
else:
    features = cv.get_feature_names()
questionAndAnswerDf_bou = pd.DataFrame(X, columns = features)
questionAndAnswerDf_bou.head()

In [None]:
# using tf-idf
tfidf = TfidfVectorizer() # initializing tf-idf
x_tfidf = tfidf.fit_transform(questionAndAnswerDf["lemmatized_text"]).toarray() # transforming the data into array

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement and fall back only on releases that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_features = tfidf.get_feature_names_out()
else:
    tfidf_features = tfidf.get_feature_names()

# one row per question, one column per unique word, holding that word's tf-idf score
questionAndAnswerDf_tfidf = pd.DataFrame(x_tfidf, columns = tfidf_features)
questionAndAnswerDf_tfidf.head()

In [None]:
# Fixed phrases matched verbatim against the (lower-cased) user message.

# Greetings; the bot also picks its own greeting reply from this tuple.
greets = (
    "hi",
    "hello",
    "good evening",
    "good afternoon",
    "hi there",
    "good morning",
    "morning",
    "evening",
    "hey",
    "hey there",
)

# Questions about the bot itself.
identity_qs = (
    "what are you",
    "who are you",
)

# Expressions of thanks.
thanks = (
    "thanks",
    "thank you",
    "thank you very much",
    "thank you so much",
)

# Ways of saying goodbye.
farewells = (
    "bye",
    "goodbye",
    "see ya",
    "see you",
    "cheers",
)

In [None]:
# create a function that returns response to query
def generate_response(text, max_chars=300,
                      fallback="Sorry, I could not find anything about that. "
                               "Could you rephrase your question?"):
    """Return the stored answer whose question is most similar to ``text``.

    Parameters
    ----------
    text : the user's query; it is normalized with ``text_normalization``.
    max_chars : maximum number of characters of the answer to return
        (defaults to the previously hard-coded 300).
    fallback : reply used when the query has no similarity to any stored
        question.

    Returns
    -------
    str : the (truncated) best-matching answer, or ``fallback``.
    """
    lemma = text_normalization(text) # text normalization
    tf = tfidf.transform([lemma]).toarray() # apply tf-idf
    # cosine similarity of the query against every stored question, flattened to 1-D
    cos = 1 - pairwise_distances(questionAndAnswerDf_tfidf, tf, metric = "cosine")
    scores = cos.ravel()
    index_value = int(scores.argmax()) # position of the best match
    # A best score that is not positive means the query shares no known word
    # with any question; argmax would then just point at row 0, so answer with
    # the fallback instead of an unrelated text.
    if not scores[index_value] > 0:
        return fallback
    # argmax is positional, hence .iloc; str() guards against non-string cells.
    answer = questionAndAnswerDf["answer"].iloc[index_value]
    return str(answer)[:max_chars]

In [None]:
# Try the retrieval function directly with a short query and display its reply.
generate_response("hi")

In [None]:
# chat bot
# Interactive console dialogue: keeps reading lines until the user thanks the
# bot or says goodbye.
keep_dialogue_alive = True
print("Hello, I am Captain Planet. Please ask me any question regarding climate change. 💪🌍")
while keep_dialogue_alive:
    try:
        # Strip surrounding whitespace so that e.g. "hi " still matches the phrase tuples.
        human_txt = input().strip().lower()
    except (EOFError, KeyboardInterrupt):
        # Input closed or cell interrupted: end the dialogue like a farewell
        # instead of leaving a traceback.
        human_txt = farewells[0]
    if not human_txt:
        continue  # nothing typed; wait for the next line
    if human_txt in thanks:
        keep_dialogue_alive = False
        print("Captain Planet: You're welcome!")
    elif human_txt in farewells:
        keep_dialogue_alive = False
        print("Captain Planet: Goodbye and thanks for your interest in climate change!")
    elif human_txt in greets:
        print("Captain Planet: " + random.choice(greets))
    elif human_txt in identity_qs:
        print("Captain Planet: "
              + "I am a chatbot developed in a data science project "
              + "at the University of Bremen. I am here to answer your questions about climate change.")
    else:
        print("Captain Planet: " + generate_response(human_txt))

In [None]:
# main function for server
def final_response(text):
    """Build the bot's reply to one message received by the server.

    Parameters
    ----------
    text : the raw message, normally UTF-8 encoded ``bytes`` (the POST body);
        a ``str`` is accepted as well.

    Returns
    -------
    str : the reply, prefixed with ``"Captain Planet: "``.
    """
    if isinstance(text, (bytes, bytearray)):
        # Undecodable bytes become replacement characters instead of raising.
        human_txt = text.decode('utf-8', errors='replace')
    else:
        human_txt = str(text)
    # Normalize the same way the console dialogue does, so "Hi" or " bye "
    # are recognized as fixed phrases.
    human_txt = human_txt.strip().lower()
    if not human_txt:
        # An empty message has nothing to match against; ask for a question.
        return "Captain Planet: Please ask me a question about climate change."
    if human_txt in thanks:
        return "Captain Planet: You're welcome!"
    if human_txt in farewells:
        return "Captain Planet: Goodbye and thanks for your interest in climate change!"
    if human_txt in greets:
        return "Captain Planet: " + random.choice(greets)
    if human_txt in identity_qs:
        return "Captain Planet: I am a chatbot developed in a data science project at the University of Bremen. I am here to answer your questions about climate change."
    return "Captain Planet: " + generate_response(human_txt)

In [None]:
# Starting the server block

In [None]:
import http.server
import socketserver

PORT = 8080  # TCP port the server below binds to
DIRECTORY = 'public'  # handed to the request handler as its ``directory`` argument

class Handler(http.server.SimpleHTTPRequestHandler):
    """Request handler: inherited file serving rooted at DIRECTORY, plus a POST chat endpoint."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, directory=DIRECTORY, **kwargs)

    def do_POST(self):
        """Treat the request body as the user's message and reply with the bot's answer.

        Responds 400 when Content-Length is not a non-negative integer and 500
        when building the reply fails; otherwise 200 with a UTF-8 text body.
        """
        try:
            # A missing header is treated as an empty body.
            content_length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            content_length = -1
        if content_length < 0:
            self.send_error(400, "Invalid Content-Length header")
            return
        post_body = self.rfile.read(content_length) if content_length else b''
        # Compute the reply before committing to a status line, so a failure
        # is reported as an error rather than as an empty 200 response.
        try:
            chatbot_reply = final_response(post_body)
        except Exception as exc:
            self.log_error("chatbot reply failed: %r", exc)
            self.send_error(500, "Could not generate a reply")
            return
        payload = str.encode(chatbot_reply)
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

class _ReusableTCPServer(socketserver.TCPServer):
    """TCPServer that may rebind its port right after a previous run released it."""
    # Without this, re-running the cell shortly after stopping the server can
    # fail with "Address already in use".
    allow_reuse_address = True


# Leaving the ``with`` block closes the server socket, so no explicit
# server_close() call is needed.
with _ReusableTCPServer(('', PORT), Handler) as httpd:
    print('serving at port', PORT)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass  # an interrupt ends serving without a traceback