<a href="https://colab.research.google.com/github/praj9719/ref_repo/blob/main/BaseBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import nltk
# nltk.download('stopwords')  
# nltk.download('punkt')
# !pip install num2words
# !pip install transformers

In [None]:
# Mount Google Drive into the Colab runtime; the dataset and vector paths
# used further down are located under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import numpy as np
import math
import json

import spacy
from gensim.summarization.bm25 import BM25
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import concurrent.futures
import itertools
import operator
import re

In [None]:
class PreProcessor:
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    The text is held in ``self.data`` and rewritten in place by each step.
    Depending on the last step applied, ``self.data`` is either a Python
    ``str`` or a 0-d NumPy string array; every step copes with both.

    Args:
        data: the raw text to normalise.
    """

    # Characters replaced by a single space in remove_punctuation().
    # Commas and apostrophes are handled separately (they are deleted, not
    # replaced by a space).
    _SYMBOLS = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

    def __init__(self, data):
        self.data = data

    def execute(self):
        """Run the full normalisation pipeline and return the processed text.

        Several steps are deliberately repeated because later steps
        re-introduce material that earlier steps removed.
        """
        self.convert_lower_case()
        self.remove_punctuation()  # commas are removed separately inside
        self.remove_apostrophe()
        self.remove_stop_words()
        self.convert_numbers()
        self.stemming()
        self.remove_punctuation()
        self.convert_numbers()
        self.stemming()  # needed again: the spelled-out numbers must be stemmed too
        self.remove_punctuation()  # needed again: num2words yields hyphens and commas (e.g. forty-one)
        self.remove_stop_words()  # needed again: num2words yields stop words (101 -> one hundred and one)
        return self.data

    def convert_lower_case(self):
        """Lower-case the text."""
        self.data = np.char.lower(self.data)

    def remove_stop_words(self):
        """Drop English stop words and single-character tokens."""
        # A set makes the membership test O(1) per token.
        stop_words = set(stopwords.words('english'))
        kept = [
            w for w in word_tokenize(str(self.data))
            if w not in stop_words and len(w) > 1
        ]
        # Each token is prefixed with a space, so the result starts with a space.
        self.data = "".join(" " + w for w in kept)

    def remove_punctuation(self):
        """Replace every symbol in ``_SYMBOLS`` with a space and delete commas.

        The replacement is accumulated across symbols. Previously each
        iteration restarted from ``self.data``, so only the last symbol
        (the newline) was actually replaced.
        NOTE(review): if stored vectors were built with the old behaviour,
        confirm that query tokens still match the stored vocabulary.
        """
        data = self.data
        for symbol in self._SYMBOLS:
            data = np.char.replace(data, symbol, ' ')
            # Collapse the double space a replacement may have created.
            data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ',', '')
        self.data = data

    def remove_apostrophe(self):
        """Delete apostrophes (``don't`` -> ``dont``)."""
        self.data = np.char.replace(self.data, "'", "")

    def stemming(self):
        """Replace every token with its Porter stem."""
        stemmer = PorterStemmer()
        tokens = word_tokenize(str(self.data))
        self.data = "".join(" " + stemmer.stem(w) for w in tokens)

    def convert_numbers(self):
        """Spell out integer tokens with num2words; other tokens are kept as-is."""
        converted = []
        for token in word_tokenize(str(self.data)):
            try:
                token = num2words(int(token))
            except Exception:
                # Not an integer, or a number num2words cannot spell:
                # keep the token verbatim (best-effort conversion).
                pass
            converted.append(token)
        new_text = "".join(" " + t for t in converted)
        # num2words joins compound numbers with hyphens (e.g. forty-one).
        self.data = np.char.replace(new_text, "-", " ")

In [None]:
class DocumentRetrieval:
    """TF-IDF / cosine-similarity document retriever for one collection ("head").

    The constructor loads ``{model_path}{head}.json`` (keys ``N``, ``topics``,
    ``DF``, ``total_vocab``) and ``{model_path}{head}.npy`` (stored in
    ``self.D``, iterated row by row as document vectors).

    Args:
        dataset_path: directory containing ``<head>/<topic>.txt`` documents.
        model_path: path prefix of the stored ``.json`` / ``.npy`` files.
        head: collection name; used for both the model files and the
            document sub-directory.
    """

    def __init__(self, dataset_path, model_path, head):
        self.dataset_path = dataset_path
        self.head = head
        json_path = f'{model_path}{head}.json'
        npy_path = f'{model_path}{head}.npy'
        with open(json_path, 'r') as json_file:
            data = json.load(json_file)
        self.N = data['N']
        self.topics = data['topics']
        self.DF = data['DF']
        self.total_vocab = data['total_vocab']
        self.D = np.load(npy_path)
        print(f"[Info] model loaded {head}")

    def _vocab_lookup(self):
        """Return a cached term -> first position mapping over ``total_vocab``."""
        lookup = getattr(self, '_vocab_index', None)
        if lookup is None:
            lookup = {}
            for position, term in enumerate(self.total_vocab):
                # setdefault keeps the first occurrence, like list.index().
                lookup.setdefault(term, position)
            self._vocab_index = lookup
        return lookup

    def doc_freq(self, word):
        """Return the stored document frequency of ``word`` (0 if unknown)."""
        try:
            return self.DF[word]
        except KeyError:
            return 0

    def gen_vector(self, tokens):
        """Build the TF-IDF vector of a tokenised query.

        Args:
            tokens: list of preprocessed query tokens.

        Returns:
            A 1-D array of length ``len(self.total_vocab)``; tokens that are
            not in the vocabulary contribute nothing.
        """
        Q = np.zeros((len(self.total_vocab)))
        counter = Counter(tokens)
        words_count = len(tokens)
        vocab_index = self._vocab_lookup()

        for token, occurrences in counter.items():
            ind = vocab_index.get(token)
            if ind is None:
                continue  # out-of-vocabulary token
            tf = occurrences / words_count
            df = self.doc_freq(token)
            idf = math.log((self.N + 1) / (df + 1))
            Q[ind] = tf * idf
        return Q

    def cosine_sim(self, a, b):
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm. Dividing by zero would
        give NaN, and NaN sorts last in ``argsort``, which would push such
        documents to the top of the ranking in classify().
        """
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return np.dot(a, b) / denom

    def classify(self, k, query):
        """Return the ``k`` documents most similar to ``query``.

        Args:
            k: number of documents to return.
            query: raw query text; it is run through PreProcessor first.

        Returns:
            A list of ``{"topic", "similarity"}`` dicts, best match first.
        """
        if k <= 0:
            # A slice of [-0:] would select every document instead of none.
            return []

        preprocessed_query = PreProcessor(query).execute()
        tokens = word_tokenize(str(preprocessed_query))
        query_vector = self.gen_vector(tokens)

        d_cosines = [self.cosine_sim(query_vector, d) for d in self.D]

        out = np.array(d_cosines).argsort()[-k:][::-1]
        result = []
        for o in out:
            result.append({"topic": self.topics[o], "similarity": round(d_cosines[o] + 0.0001, 4)})
        return result

    def search(self, query, max_docs=7, threshold=0.02):
        """Return the text of the documents relevant to ``query``.

        Args:
            query: raw query text.
            max_docs: upper bound on the number of candidates considered.
            threshold: candidates whose similarity is not strictly above
                this value are discarded.

        Returns:
            A list with the contents of each retained document, best match
            first; empty when nothing passes the threshold.
        """
        count = min(self.N, max_docs)
        relevent_docs = self.classify(count, query)
        docs = []
        for doc in relevent_docs:
            if doc['similarity'] > threshold:
                with open(f'{self.dataset_path}/{self.head}/{doc["topic"]}.txt', 'r') as file:
                    docs.append(file.read())
        return docs



In [None]:
class PassageRetrieval:
  """Rank the passages of a set of documents against a question with BM25.

  Args:
    nlp: callable that turns a string into an iterable of tokens exposing a
      ``lemma_`` attribute (a spaCy pipeline is passed in this notebook).
  """

  def __init__(self, nlp):
    self.tokenize = lambda text: [token.lemma_ for token in nlp(text)]
    self.bm25 = None
    self.passages = None

  def preprocess(self, doc):
    """Split ``doc`` into non-empty lines, skipping lines that start with '='."""
    passages = [p for p in doc.split('\n') if p and not p.startswith('=')]
    return passages

  def fit(self, docs):
    """Index every passage of ``docs`` (an iterable of document strings)."""
    passages = list(itertools.chain(*map(self.preprocess, docs)))
    corpus = [self.tokenize(p) for p in passages]
    self.bm25 = BM25(corpus)
    self.passages = passages

  def most_similar(self, question, topn=10):
    """Return the ``topn`` passages with the highest BM25 score.

    Args:
      question: the query text.
      topn: maximum number of passages to return.

    Returns:
      A list of passage strings, best match first.

    Raises:
      RuntimeError: if fit() has not been called yet.
    """
    if self.bm25 is None or self.passages is None:
      raise RuntimeError('PassageRetrieval.fit() must be called before most_similar()')
    tokens = self.tokenize(question)
    # The value handed to get_scores() is meant to be the MEAN idf; the
    # previous code passed the sum over the whole vocabulary.
    idf_values = [float(value) for value in self.bm25.idf.values()]
    average_idf = sum(idf_values) / len(idf_values) if idf_values else 0.0
    scores = self.bm25.get_scores(tokens, average_idf)
    # Sort (score, index) pairs descending; ties fall to the higher index.
    pairs = sorted(((s, i) for i, s in enumerate(scores)), reverse=True)
    passages = [self.passages[i] for _, i in pairs[:topn]]
    return passages

In [None]:
class AnswerExtractor:
  """Run an extractive question-answering pipeline over candidate passages.

  Args:
    tokenizer: identifier passed to ``AutoTokenizer.from_pretrained``.
    model: identifier passed to
      ``AutoModelForQuestionAnswering.from_pretrained``.
  """

  def __init__(self, tokenizer, model):
    qa_tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model)
    self.nlp = QuestionAnsweringPipeline(model=qa_model, tokenizer=qa_tokenizer)

  def extract(self, question, passages):
    """Answer ``question`` against each passage and rank the results.

    Each result is the pipeline's output with the source passage added under
    the ``'text'`` key. Passages for which a ``KeyError`` is raised are
    skipped.

    Returns:
      The results sorted by their ``'score'`` entry, highest first.
    """
    scored = []
    for context in passages:
      try:
        prediction = self.nlp(question=question, context=context)
        prediction['text'] = context
      except KeyError:
        continue
      scored.append(prediction)
    return sorted(scored, key=lambda item: item['score'], reverse=True)

In [None]:
# Model identifiers; each can be overridden through the environment variable
# of the same name.
SPACY_MODEL = os.getenv('SPACY_MODEL', 'en_core_web_sm')
QA_MODEL = os.getenv('QA_MODEL', 'distilbert-base-cased-distilled-squad')

In [None]:
# Load the spaCy pipeline with the ner, parser and textcat components
# disabled; PassageRetrieval only reads token.lemma_ from it.
nlp = spacy.load(SPACY_MODEL, disable=['ner', 'parser', 'textcat'])
passage_retriever = PassageRetrieval(nlp)
# The same identifier is used for both the tokenizer and the QA model.
answer_extractor = AnswerExtractor(QA_MODEL, QA_MODEL)

In [None]:
# One retriever per collection; both read from the mounted Drive folder.
# Each constructor loads <model_path><head>.json and <model_path><head>.npy.
science = DocumentRetrieval('/content/gdrive/MyDrive/ircb/datasets/base', '/content/gdrive/MyDrive/ircb/vectors/Base/', 'Science')
history = DocumentRetrieval('/content/gdrive/MyDrive/ircb/datasets/base', '/content/gdrive/MyDrive/ircb/vectors/Base/', 'History')

[Info] model loaded Science
[Info] model loaded History


In [None]:
def answer(document_retriever, question):
    """Print the question and up to three extracted answers for it.

    Uses the module-level ``passage_retriever`` and ``answer_extractor``.
    Prints a notice and returns early when the retriever finds no document.

    Args:
        document_retriever: object whose ``search(question)`` returns a list
            of document strings.
        question: the question text.

    Returns:
        None; all results are printed.
    """
    docs = document_retriever.search(question)
    if len(docs) == 0:
        print('Question is out of context!')
        return

    passage_retriever.fit(docs)
    ranked_passages = passage_retriever.most_similar(question)
    candidates = answer_extractor.extract(question, ranked_passages)

    print(f'Que: {question}\n')
    # Only the three best-scoring answers are shown.
    for rank, candidate in enumerate(candidates[:3], start=1):
        print(f'Ans {rank}: {candidate["answer"]} \n\t{candidate["text"]}\n')

In [None]:
# Temporary code: scratch cell that runs the document-retrieval step by hand
# for a single question.
question = "duration of indian independance movement"
# NOTE: this rebinds the module-level passage_retriever that answer() uses.
passage_retriever = PassageRetrieval(nlp)
docs = history.search(question)
# docs

In [None]:
def preprocess(doc):
    """Split ``doc`` into its non-empty lines, dropping lines that start with '='."""
    kept = []
    for line in doc.split('\n'):
        if not line or line.startswith('='):
            continue
        kept.append(line)
    return kept

# Flatten the per-document passage lists into a single list for inspection.
psgs = list(itertools.chain(*map(preprocess, docs)))
# psgs

In [None]:
# Index the retrieved documents' passages, then rank them for the question.
passage_retriever.fit(docs)
passages = passage_retriever.most_similar(question)

[(1284.150524650103, 11), (1244.1819395441587, 14), (1238.8297483393753, 16), (1233.110835198444, 2), (1196.925898357967, 9), (1192.8878053909289, 21), (1183.678084392532, 19), (1174.451515403772, 4), (1130.1086466452473, 0), (1115.434427580766, 12), (1097.532724319158, 3), (1094.0211164819013, 18), (1086.283789879479, 5), (1028.6449364694759, 8), (1013.3811728960579, 22), (968.4721806482559, 20), (958.9820988290039, 10), (957.1486875051315, 17), (953.7444094927822, 6), (937.8887091066047, 1), (892.5297637553176, 7), (838.2034181977199, 15), (795.4628438254807, 13)]


In [None]:
# ---------------------------------------------

In [None]:
answer(history, 'duration of indian independance movement')

Que: duration of indian independance movement

Ans 1: The movement spanned from 1857 to 1947 
	The Indian independence movement was a series of historic events with the ultimate aim of ending the British rule in India. The movement spanned from 1857 to 1947. 

Ans 2: The Cold War 
	The Cold War and its events have left a significant legacy. It is often referred to in popular culture, especially with themes of espionage and the threat of nuclear warfare.

Ans 3: Violent clashes between Hindus, Sikhs, and Muslims followed. 
	Violent clashes between Hindus, Sikhs, and Muslims followed. Prime Minister Nehru and deputy prime minister Sardar Vallabhbhai Patel invited Mountbatten to continue as Governor General of India during the period of transition. He was replaced in June 1948 by Chakravarti Rajagopalachari. Patel took on the responsibility for bringing 565 princely states into the Union of India, steering efforts by his "iron fist in a velvet glove" policies, exemplified by the use of mi

In [None]:
answer(science, 'what is atom')

Que: what is atom

Ans 1: copper 
	The number of protons in the nucleus is the atomic number and it defines to which chemical element the atom belongs. For example, any atom that contains 29 protons is copper. The number of neutrons defines the isotope of the element. Atoms can attach to one or more other atoms by chemical bonds to form chemical compounds such as molecules or crystals. The ability of atoms to associate and dissociate is responsible for most of the physical changes observed in nature. Chemistry is the discipline that studies these changes.

Ans 2: the smallest unit of ordinary matter that forms a chemical element 
	An atom is the smallest unit of ordinary matter that forms a chemical element. Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms. Atoms are extremely small, typically around 100 picometers across. They are so small that accurately predicting their behavior using classical physics—as if they were tennis balls, for example—is not poss

In [None]:
answer(history, 'when world war one started')

Que: when world war one started

Ans 1: Following the Cuban Missile Crisis 
	Following the Cuban Missile Crisis, a new phase began that saw the Sino-Soviet split between China and the Soviet Union complicate relations within the Communist sphere, while US ally France began to demand greater autonomy of action. The USSR invaded Czechoslovakia to suppress the 1968 Prague Spring, while the US experienced internal turmoil from the civil rights movement and opposition to the Vietnam War. In the 1960s–70s, an international peace movement took root among citizens around the world. Movements against nuclear arms testing and for nuclear disarmament took place, with large anti-war protests. By the 1970s, both sides had started making allowances for peace and security, ushering in a period of détente that saw the Strategic Arms Limitation Talks and the US opening relations with the People's Republic of China as a strategic counterweight to the USSR.

Ans 2: between 1914 and 1918 
	It was known as

In [None]:
answer(history, 'why world war two started')

Que: why world war two started

Ans 1: the assassination of Austrian Archduke Franz Ferdinand 
	Although there were a number of causes for the war, the assassination of Austrian Archduke Franz Ferdinand was the main catalyst for starting the war. After the assassination, Austria declared war on Serbia. Then Russia prepared to defend its ally Serbia. Next, Germany declared war on Russia to protect Austria. This caused France to declare war on Germany to protect its ally Russia. Germany invaded Belgium to get to France which caused 

Ans 2: the assassination of Austrian Archduke Franz Ferdinand 
	Although there were a number of causes for the war, the assassination of Austrian Archduke Franz Ferdinand was the main catalyst for starting the war. After the assassination, Austria declared war on Serbia. Then Russia prepared to defend its ally Serbia. Next, Germany declared war on Russia to protect Austria. This caused France to declare war on Germany to protect its ally Russia. Germany inva

In [None]:
answer(science, 'what is periodic table')

Que: what is periodic table

Ans 1: International Year of the Periodic Table 
		UNESCO named 2019 the International Year of the Periodic Table to mark the 150th anniversary of Mendeleev’s publication. Researchers and teachers worldwide took this opportunity to reflect on the importance of the periodic table and spread awareness about it in classrooms and beyond. Workshops and conferences encouraged people to use the knowledge of the periodic table to solve problems in health, technology, agriculture, environment and education. Publication houses organized monthly activities such as quiz contests, podcasts, personal story sections and industry site tours. These initiatives demonstrated how the elements are integral to our daily lives in medicines, pesticides and lithium batteries. 

Ans 2: the framework 
		In 1869, Russian chemist Dmitri Mendeleev created the framework that became the modern periodic table, leaving gaps for elements that were yet to be discovered. While arranging the el

UNESCO named 2019 the International Year of the Periodic Table to mark the 150th anniversary of Mendeleev’s publication

In [None]:
answer(science, 'year of periodic table')

Que: year of periodic table

Ans 1: 1870 
		German chemist Lothar Meyer produced a version of the periodic table similar to Mendeleev’s in 1870. He left gaps for undiscovered elements but never predicted their properties. The Royal Society of London awarded the Davy Medal in 1882 to both Mendeleev and Meyer. The later discovery of elements predicted by Mendeleev, including gallium (1875), scandium (1879) and germanium (1886), verified his predictions and his periodic table won universal recognition. In 1955 the 101st element was named mendelevium in his honor

Ans 2: 1913 
		The concept of sub-atomic particles did not exist in the 19th century. In 1913, English physicist Henry Moseley used X-rays to measure the wavelengths of elements and correlated these measurements to their atomic numbers. He then rearranged the elements in the periodic table on the basis of atomic numbers. This helped explain disparities in earlier versions that had used atomic masses. 

Ans 3: 1869 
	The organizat

In [None]:
answer(science, 'International year of periodic table')

Que: International year of periodic table

Ans 1: 1870 
		German chemist Lothar Meyer produced a version of the periodic table similar to Mendeleev’s in 1870. He left gaps for undiscovered elements but never predicted their properties. The Royal Society of London awarded the Davy Medal in 1882 to both Mendeleev and Meyer. The later discovery of elements predicted by Mendeleev, including gallium (1875), scandium (1879) and germanium (1886), verified his predictions and his periodic table won universal recognition. In 1955 the 101st element was named mendelevium in his honor

Ans 2: 1913 
		The concept of sub-atomic particles did not exist in the 19th century. In 1913, English physicist Henry Moseley used X-rays to measure the wavelengths of elements and correlated these measurements to their atomic numbers. He then rearranged the elements in the periodic table on the basis of atomic numbers. This helped explain disparities in earlier versions that had used atomic masses. 

Ans 3: 2019 


In [None]:
answer(science, 'why UNESO name 2019 the International year of periodic table')

Que: why UNESO name 2019 the International year of periodic table

Ans 1: to mark the 150th anniversary of Mendeleev’s publication 
		UNESCO named 2019 the International Year of the Periodic Table to mark the 150th anniversary of Mendeleev’s publication. Researchers and teachers worldwide took this opportunity to reflect on the importance of the periodic table and spread awareness about it in classrooms and beyond. Workshops and conferences encouraged people to use the knowledge of the periodic table to solve problems in health, technology, agriculture, environment and education. Publication houses organized monthly activities such as quiz contests, podcasts, personal story sections and industry site tours. These initiatives demonstrated how the elements are integral to our daily lives in medicines, pesticides and lithium batteries. 

Ans 2: The modern periodic table 
		The modern periodic table arranges the elements by their atomic numbers and periodic properties. Several scientists w

In [None]:
answer(science, 'who named 2019 the International year of periodic table')

Que: who named 2019 the International year of periodic table

Ans 1: UNESCO 
		UNESCO named 2019 the International Year of the Periodic Table to mark the 150th anniversary of Mendeleev’s publication. Researchers and teachers worldwide took this opportunity to reflect on the importance of the periodic table and spread awareness about it in classrooms and beyond. Workshops and conferences encouraged people to use the knowledge of the periodic table to solve problems in health, technology, agriculture, environment and education. Publication houses organized monthly activities such as quiz contests, podcasts, personal story sections and industry site tours. These initiatives demonstrated how the elements are integral to our daily lives in medicines, pesticides and lithium batteries. 

Ans 2: John Newlands 
		British chemist John Newlands was the first to arrange the elements into a periodic table with increasing order of atomic masses. He found that every eight elements had similar propert

In [None]:
!pip list -v 

Package                       Version        Location                               Installer
----------------------------- -------------- -------------------------------------- ---------
absl-py                       0.12.0         /usr/local/lib/python3.7/dist-packages pip      
alabaster                     0.7.12         /usr/local/lib/python3.7/dist-packages pip      
albumentations                0.1.12         /usr/local/lib/python3.7/dist-packages pip      
altair                        4.1.0          /usr/local/lib/python3.7/dist-packages pip      
appdirs                       1.4.4          /usr/local/lib/python3.7/dist-packages pip      
argon2-cffi                   20.1.0         /usr/local/lib/python3.7/dist-packages pip      
astor                         0.8.1          /usr/local/lib/python3.7/dist-packages pip      
astropy                       4.2.1          /usr/local/lib/python3.7/dist-packages pip      
astunparse                    1.6.3          /usr/local/lib/