In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import wikipedia
import sys
import speech_recognition as sr
# Download NLTK resources (uncomment and run once if they are not installed yet)
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('punkt')

def speech2text(device_index=1, language='en-IN'):
    """Capture one utterance from the microphone and return it as text.

    The audio is transcribed with ``Recognizer.recognize_google``. If
    anything in the audio pipeline fails (microphone cannot be opened,
    speech not understood, recognition request fails), the user is asked
    to type the text instead, so the function always returns a string.

    Args:
        device_index: Index of the input device passed to ``sr.Microphone``.
            Defaults to 1, the value this script has always used.
        language: Language tag passed to ``recognize_google``.

    Returns:
        The recognised text, or the text typed by the user as a fallback.
    """
    recognizer = sr.Recognizer()
    print("Running")

    try:
        # Opening the device is inside the try block on purpose: a missing or
        # invalid microphone should lead to the typed fallback, not a crash.
        with sr.Microphone(device_index) as source:
            recognizer.adjust_for_ambient_noise(source, 1)  # Adjust for ambient
            print("Say something!")
            audio = recognizer.listen(source)
        print("Recording")
        heard = recognizer.recognize_google(audio, language=language)
    except Exception:
        # Deliberate best-effort boundary: any speech failure falls back to
        # keyboard input rather than aborting the interactive session.
        print("Something went wrong")
        return input("I couldn't understand you, so please enter your question: \n")

    print(heard)
    return heard


# Ask the user which Wikipedia article the questions will be about.
# The answer comes from speech2text(), i.e. spoken input with a typed fallback.
print('Enter the wikipedia article name you want to ask questions on: ')
cnt=speech2text()  # article title as recognised (or typed) text
def get_wikipedia_content(topic):
    """Return the plain-text content of the Wikipedia page for *topic*.

    If the title is ambiguous, the disambiguation options are tried in
    order and the first one that resolves to a page is used.

    Args:
        topic: Title (or search phrase) of the article to fetch.

    Returns:
        The page content, or the sentinel string
        ``"Wikipedia page not found."`` when no page could be resolved.
    """
    not_found = "Wikipedia page not found."
    lookup_errors = (
        wikipedia.exceptions.DisambiguationError,
        wikipedia.exceptions.PageError,
    )
    try:
        return wikipedia.page(topic).content
    except wikipedia.exceptions.DisambiguationError as e:
        # An option can itself be ambiguous or missing, and the option list
        # may be empty, so walk the candidates instead of trusting the first.
        for option in e.options:
            try:
                return wikipedia.page(option).content
            except lookup_errors:
                continue
        return not_found
    except wikipedia.exceptions.PageError:
        return not_found

# Fetch the article text for the requested topic.
val = get_wikipedia_content(cnt)

# get_wikipedia_content() reports failure with this exact sentinel string.
if val == "Wikipedia page not found.":
    print(val)
    # Exit with a non-zero status so the failure is visible to the caller.
    sys.exit(1)

# From here on `text` holds the article content that questions are matched against.
text = val

# Initialize the lemmatizer
# One module-level WordNetLemmatizer instance, reused by lemma_me() for every token.
lemmatizer = WordNetLemmatizer()

# Function to lemmatize tokens based on their POS tags
def lemma_me(tokens):
    """Lemmatize *tokens* according to their part-of-speech tags.

    Each token is tagged with ``nltk.pos_tag`` and, when the tag maps to a
    WordNet part of speech, lemmatized with that part of speech. Tokens
    with any other tag are returned unchanged.

    Args:
        tokens: List of word tokens (strings).

    Returns:
        A list of the same length with lemmatized tokens.
    """
    # First letter of the tagger's tag -> WordNet POS code. The default NLTK
    # tagger emits Penn Treebank tags, where adjectives are JJ/JJR/JJS, so
    # 'j' must map to WordNet's 'a'; checking for 'a' alone never matches.
    # 'a' is kept so a tagset that does use A-tags behaves as before.
    wordnet_pos = {'n': 'n', 'v': 'v', 'j': 'a', 'a': 'a', 'r': 'r'}

    lemmas = []
    # pos_tag already returns (token, tag) pairs, so no zip with tokens is needed.
    for token, tag in nltk.pos_tag(tokens):
        pos = wordnet_pos.get(tag[:1].lower())
        lemmas.append(lemmatizer.lemmatize(token, pos) if pos else token)
    return lemmas

# Function to process the text and find the best response to a question
def process(text, question, threshold=0.1):
    """Return the sentence of *text* that best matches *question*.

    The text is split into sentences, the sentences and the question are
    vectorized together with TF-IDF (lower-cased, word-tokenized and
    lemmatized via ``lemma_me``), and the sentence with the highest cosine
    similarity to the question is selected.

    Args:
        text: Document to search for an answer.
        question: The user's question.
        threshold: Minimum cosine similarity the best sentence must exceed
            to count as relevant. Defaults to 0.1.

    Returns:
        The best-matching sentence, or ``None`` when the text has no
        sentences or no sentence scores above *threshold*.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Nothing to compare against; cosine_similarity would raise on an
        # empty candidate matrix.
        return None

    def tokenize(document):
        return lemma_me(nltk.word_tokenize(document.lower()))

    # token_pattern=None: the default pattern is unused once a custom
    # tokenizer is given, and leaving it set makes scikit-learn warn on
    # every call.
    vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)

    # The question goes last so it shares the vocabulary and IDF weights
    # with the sentences; row -1 is the question, rows [:-1] the sentences.
    tfidf = vectorizer.fit_transform(sentences + [question])
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best_index = scores.argmax()  # index of the top-scoring sentence

    if scores[best_index] > threshold:
        return sentences[best_index]
    return None

# Main loop to interact with the user
while True:
    print("Hi, what do you want to know?\n")
    question = speech2text()

    # Normalise before comparing so " Stop", "stop." or "STOP!" also end the
    # session; an exact match on the raw text is too brittle for spoken or
    # typed input.
    if question.strip().lower().rstrip('.!?') == 'stop':
        break

    output = process(text, question)
    if output:
        print('\n')
        print(output)
    else:
        # process() returns None when no sentence is similar enough.
        print("I don't know....")


Enter the wikipedia article name you want to ask questions on: 
Running
Say something!
Recording
Prime Minister Narendra Modi
Hi, what do you want to know?

Running
Say something!
Recording
where he was born






Modi was born and raised in Vadnagar in northeastern Gujarat, where he completed his secondary education.
Hi, what do you want to know?

Running
Say something!
Recording
where is studied


A study by UNICEF and the Indian government found Gujarat under Modi had a poor record in immunisation of children.
Hi, what do you want to know?

Running
Say something!
Recording
no this is wrong stop


The government also tried to reconstitute the National Board for Wildlife so it would no longer have representatives from NGOs but the Supreme Court of India blocked this move.
Hi, what do you want to know?

Running
Say something!
Recording
to pick the best


Modi has written eight other books, mostly containing short stories for children.Abundance in Millets, a song that featured a speech given by Modi for the promotion of millet, received a nomination in the Best Global Music Performance category for the 2024 Grammy Awards.
Hi, what do you want to know?

Running
Say something!
Recording
Something