# NLP Interview Assistant: Combining Knowledge Retrieval with LLMs

## Sudip Das

In [None]:
import torch
import PyPDF2
from PyPDF2 import PdfReader
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

In [2]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


## Data Collection
### PyPDF (scrape content from PDFs)

In [None]:
!pip install PyPDF2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
# Function to clean question text
def clean_question_text(question):
    question = re.sub(r"^Q\d+\.\s*", "", question).strip()
    return question

# Load and extract text
file_path = "100 NLP interview questions and answers.pdf"
reader = PdfReader(file_path)

# Combine all text from the PDF
all_text = ""
for page in reader.pages:
    all_text += page.extract_text()

# Split text into individual Q&A
qa_pattern = r"Q\d+\..*?(?=Q\d+\.|$)"
qa_matches = re.findall(qa_pattern, all_text, re.DOTALL)

# Function to clean the answer text
def clean_answer_text(answer):
    return re.sub(r"^Answer\s*:\s*", "", answer).strip()

# Parse questions and answers
qa_pairs = []
for qa in qa_matches:
    question_match = re.search(r"(Q\d+\..*?)(Answer\s*:.*)", qa, re.DOTALL)
    if question_match:
        # Clean the question and answer
        question = clean_question_text(question_match.group(1).strip())
        answer = clean_answer_text(question_match.group(2).strip())      # Remove Answer
        qa_pairs.append({"Question": question, "Answer": answer})

# Create a DataFrame for structured processing
qa_df_pdf = pd.DataFrame(qa_pairs)

print(qa_df_pdf.head())

                                            Question  \
0  Which of the following techniques can be used ...   
1  Which of the following techniques can be used ...   
2  What are the possible features of a text corpu...   
3  You created a document term matrix on the inpu...   
4  Which of the text parsing techniques can be us...   

                                              Answer  
0  a) Lemmatization helps to get to the base form...  
1  b) and c) \nDistance between two word vectors ...  
2  e)All of the above can be used as features of ...  
3                                                 d)  
4                                                 d)  


In [None]:
# Print the first 3 question-answer pairs
for i in range(3):
    print(f"Question {i+1}:")
    print(qa_df_pdf.loc[i, "Question"])
    print("Answer:")
    print(qa_df_pdf.loc[i, "Answer"])
    print("-" * 50)

Question 1:
Which of the following techniques can be used for keyword normalization in 
NLP, the process of converting a keyword into its base form? 
a. Lemmatization 
b. Soundex 
c. Cosine Similarity 
d. N-grams
Answer:
a) Lemmatization helps to get to the base form of a word, e.g. are playing -> play, ea ting 
-> eat, etc.Other options are meant for different purposes.
--------------------------------------------------
Question 2:
Which of the following techniques can be used to compute the distance 
between two word vectors in NLP? 
a. Lemmatization 
b. Euclidean distance 
c. Cosine Similarity 
d. N-grams
Answer:
b) and c) 
Distance between two word vectors can be computed using Cosine similarity and Euclidean 
Distance.  Cosine Similarity establishes a cosine angle between the vector of two words . A cosi ne 
angle close to each other between two word vectors indicates the words are simil ar and vice a 
versa. 
E.g. cosine angle between two words “Football” and “Cricket” will be cl

### Web Scraping (BeautifulSoup)

In [None]:
# URL of the webpage to scrape
url = "https://www.projectpro.io/article/nlp-interview-questions-and-answers/439"

response = requests.get(url)
if response.status_code == 200:
    print("Successfully fetched the webpage!")
else:
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")

soup = BeautifulSoup(response.content, "html.parser")

def clean_question_text(question):
    question = re.sub(r"^Question \d+:", "", question).strip()
    # Remove any numbering like "1.", "2." at the start of the actual question
    question = re.sub(r"^\d+\.\s*", "", question).strip()
    return question

# Extract Q&A pairs
qa_pairs = []
questions = soup.find_all("p", {"style": "padding-left: 40px;"})  # Match questions based on style

for question in questions:
    # Extract and clean question text
    question_text = question.find("strong")
    if question_text:
        q_text = clean_question_text(question_text.get_text(strip=True))

        answer_paragraphs = []
        sibling = question.find_next_sibling()
        while sibling and sibling.name == "p" and "padding-left: 40px;" not in sibling.get("style", ""):
            if sibling.find("table") is None:
                answer_paragraphs.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling()

        if answer_paragraphs:
            a_text = " ".join(answer_paragraphs)
            qa_pairs.append({"Question": q_text, "Answer": a_text})

web_qa_df = pd.DataFrame(qa_pairs)

print(web_qa_df.head())

Successfully fetched the webpage!
                                            Question  \
0                        What do you know about NLP?   
1  Give examples of any two real-world applicatio...   
2                       What is tokenization in NLP?   
3  What is the difference between stemming and le...   
4                                       What is NLU?   

                                              Answer  
0  NLP stands for Natural Language Processing. It...  
1  1. Spelling/Grammar Checking Apps:The mobile a...  
2  Tokenization is the process of splitting runni...  
3  Both stemming and lemmatization are keyword no...  
4  NLU stands for Natural Language Understanding....  


In [None]:
# Print the first 3 questions and answers
for i in range(3):
    print(f"Question {i+1}:")
    print(web_qa_df.loc[i, "Question"])
    print("Answer:")
    print(web_qa_df.loc[i, "Answer"])
    print("-" * 50)


Question 1:
What do you know about NLP?
Answer:
NLP stands for Natural Language Processing. It deals with making a machine understand the way human beings read and write in a language. This task is achieved by designing algorithms that can extract meaning from large datasets in audio or text format by applying machine learning algorithms.
--------------------------------------------------
Question 2:
Give examples of any two real-world applications of NLP.
Answer:
1. Spelling/Grammar Checking Apps:The mobile applications and websites that offer users correct grammar mistakes in the entered text rely on NLP algorithms. These days, they can also recommend the following few words that the user might type, which is also because of specific NLP models being used in the backend. 2.ChatBots:Many websites now offer customer support through these virtual bots that chat with the user and resolve their problems. It acts as a filter to the issues that do not require an interaction with the compani

In [None]:
# Website 2
url = "https://www.adaface.com/blog/nlp-interview-questions/"

response = requests.get(url)
if response.status_code == 200:
    print("Successfully fetched the webpage!")
    page_content = response.text
else:
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")
    exit()

# Parse the webpage content
soup = BeautifulSoup(page_content, 'html.parser')

qa_pairs = []

# questions are in <h3> tags, checked from developer tools
questions = soup.find_all('h3')

for question in questions:
    # Extract the question text
    q_text = question.get_text(strip=True)

    # Remove leading numbers like "1.", "2.", etc.
    q_text = re.sub(r"^\d+\.\s*", "", q_text)

    # Collect all answer elements until the next <h3> tag
    answer_elements = question.find_next_siblings()
    answer_text = ""

    for element in answer_elements:
        # Stop if another question is encountered
        if element.name == "h3":
            break
        if element.name == "p":
            answer_text += element.get_text(strip=True) + " "

    qa_pairs.append({"Question": q_text.strip(), "Answer": answer_text.strip()})

qa_df = pd.DataFrame(qa_pairs)


# Display the first 3 question-answer pairs
for i in range(min(3, len(qa_df))):  # Ensure we don't try to access rows if fewer than 3 exist
    print(f"Question {i+1}:")
    print(qa_df.loc[i, "Question"])
    print("Answer:")
    print(qa_df.loc[i, "Answer"])
    print("-" * 50)



Successfully fetched the webpage!
Question 1:
What is Natural Language Processing (NLP), and why is it important?
Answer:
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. It involves teaching machines to understand, interpret, and generate human language in a way that is valuable. NLP is important because it enables applications like translation services, sentiment analysis, chatbots, and more, which can process large amounts of data quickly and efficiently. It helps in automating routine tasks, improving customer service, and providing insights from unstructured data. When evaluating the candidate’s response, look for a clear and concise explanation, an understanding of practical applications, and the ability to relate NLP to real-world scenarios and benefits.
--------------------------------------------------
Question 2:
Can you explain the difference between NLP and text min

## Data Preprocessing

In [None]:
# Combine all scraped data into a single DataFrame
qa_df_combined = pd.concat([qa_df_pdf, web_qa_df, qa_df], ignore_index=True)

In [None]:
# Calculate word count in the combined dataframe
qa_df_combined['Word Count'] = qa_df_combined['Question'].apply(lambda x: len(str(x).split())) + qa_df_combined['Answer'].apply(lambda x: len(str(x).split()))
print(f"Total Word Count in qa_df_combined: {qa_df_combined['Word Count'].sum()}")

Total Word Count in qa_df_combined: 11959


In [None]:
!pip install sentence_transformers
!pip install faiss-cpu
!pip install faiss-gpu
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM





In [None]:
# Load a sentence transformer model to encode the questions
os.environ["HF_TOKEN"] = "" #add your HF token
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight embedding model
questions = qa_df_combined["Question"].tolist()

# Convert questions to embeddings
embeddings = embedding_model.encode(questions)

In [None]:
# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))  # Add embeddings to the index

faiss.write_index(index, "faiss_index.bin") #Save

In [None]:
# Retrieval Function
def retrieve_context(query, index, questions, model, top_k=3, similarity_threshold=0.7):
    query_embedding = model.encode([query])

    # Search the FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the top-k results
    context = []
    for idx, dist in zip(indices[0], distances[0]):
        if dist < similarity_threshold:
            context.append(questions[idx])

    return context if context else []


## LLMs - Llama-3.2 and Flan-T5

In [None]:
# Load Llama 3.2 model from hugging face
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
lm_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct").to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Load Flan-T5 model
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large").to(device)

In [None]:
def generate_answer(query, context, tokenizer, model):

    #corpus has mcq questions, hence needs to handle it separately if question asked later is not an mcq and matches with the original question
    is_mcq_context = context and any(opt in '. '.join(context) for opt in ["a.", "b.", "c.", "d."])
    is_mcq_query = any(keyword in query.lower() for keyword in ["choose", "select", "option", "correct"])

    if not context:
        input_text = f"Question: {query}\n\nProvide a brief answer:" #use llm if context not available
    elif is_mcq_context and is_mcq_query:
        # Handle MCQ format only if the query is MCQ
        input_text = f"Context: {'. '.join(context)}\n\nQuestion: {query}\n\nProvide a concise explanation for the correct answer only, ignoring other options:"
    else:
        # Handle normal question format or non-MCQ queries
        input_text = f"Context: {'. '.join(context)}\n\nQuestion: {query}\n\nProvide a short and accurate Answer:"


    # Tokenize and generate
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=400, num_beams=5, early_stopping=True)

    # Decode the answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)


    # Remove MCQ artifacts if the query is not explicitly an MCQ
    if not is_mcq_query:
        answer = re.sub(r"^[A-Da-d]\)\s*", "", answer)
        # truncate the explanation for incorrect options after the correct answer
        answer = re.split(r"Note:|Explanation:", answer)[0].strip()



    # Generic cleanup
    finality_patterns = [
        r"(?i)the best answer is.*",
        r"(?i)therefore, the correct answer is.*",
        r"(?i)correct answer:.*"
    ]
    for pattern in finality_patterns:
        answer = re.sub(pattern, "", answer)


    # Remove the question from the answer (case-insensitive)
    pattern = rf"(?i)Question:\s*{re.escape(query)}?"
    answer = re.sub(pattern, "", answer, flags=re.DOTALL)
    # Remove context from the answer
    context_pattern = rf"(?i)Context:\s*{re.escape('. '.join(context) if context else '')}?"
    answer = re.sub(context_pattern, "", answer, flags=re.DOTALL)

    # Remove generic headers like "Context" or "Provide a brief answer"
    answer = re.sub(r"(?i)context:.*?Question:.*?", "", answer, flags=re.DOTALL)
    answer = re.sub(r"(?i)provide a brief answer:?", "", answer, flags=re.DOTALL)
    answer = re.sub(r"(?i)Provide a concise explanation for the correct answer only, ignoring other optionsr*?:", "", answer)
    answer = re.sub(r"(?i)Provide a short and accurate Answer*?:", "", answer)
    answer = answer.split("Answer:", 1)[-1].strip()

    return answer


In [None]:
# Test model
query = "What are stop words, and why are they removed in NLP?"
retrieved_context = retrieve_context(query, index, questions, embedding_model)
generated_answer = generate_answer(query, retrieved_context, tokenizer, lm_model)

print(f"Question: {query}")
print(f"Retrieved Context: {retrieved_context}")
print("-" * 80)
print(f"Generated Answer: {generated_answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are stop words, and why are they removed in NLP?
Retrieved Context: ['What are stop words, and why are they removed in text processing?', 'What are stop words?', 'In NLP, The process of removing words like “and”, “is”, “a”, “an”, “the” from \na sentence is called as \na. Stemming \nb. Lemmatization \nc. Stop word \nd. All of the above']
--------------------------------------------------------------------------------
Generated Answer: Stop words are common words like “and”, “is”, “a”, “an”, “the” that do not add much value to the meaning of a sentence. They are removed in NLP to reduce the dimensionality of the text data and improve the performance of NLP tasks like text classification, sentiment analysis, and information retrieval.


## Evaluation

In [None]:
# List of random 30 common questions in NLP
q = [
    "What is Natural Language Processing (NLP)?",
    "Explain the difference between NLP and text mining.",
    "What are stop words, and why are they removed in NLP?",
    "What is tokenization in NLP?",
    "How can I prepare for a Natural Language Processing (NLP) interview?",
    "Which technique can be used for noun phrase detection in NLP?",
    "What is the purpose of Part-of-Speech (POS) tagging in NLP?",
    "How does named entity recognition (NER) work, and what are its applications?",
    "Define Dependency Parsing and its importance in NLP.",
    "What is the Bag-of-Words (BoW) model?",
    "How is TF-IDF used for text representation in NLP?",
    "What is word embedding, and how is it different from one-hot encoding?",
    "Explain the Skip-Gram model in Word2Vec.",
    "How does Transfer Learning improve NLP model performance?",
    "What are the advantages of using pre-trained models in NLP?",
    "How does a Transformer model work in NLP?",
    "What is the significance of the attention mechanism in NLP models?",
    "How do BERT and GPT differ in their approach to NLP tasks?",
    "What is a language model, and why is it important in NLP?",
    "Explain the concept of fine-tuning a pre-trained model for NLP tasks.",
    "How is sentiment analysis performed using NLP techniques?",
    "What are some real-world applications of NLP in healthcare?",
    "How does machine translation work in NLP?",
    "What is text summarization, and how can it be implemented in NLP?",
    "How does question answering work in NLP systems?",
    "What are some common challenges in text preprocessing for NLP?",
    "How does ambiguity in human language affect NLP systems?",
    "What are the limitations of current NLP models?",
    "How does NLP handle multilingual text?",
    "How can NLP models be evaluated for performance and accuracy?"
]


In [None]:
# Function to store the outputs of the 30 questions
def store_answers(queries, filename, tokenizer, lm_model):
    results = []
    for query in q:
        retrieved_context = retrieve_context(query, index, questions, embedding_model)
        generated_answer = generate_answer(query, retrieved_context, tokenizer, lm_model)
        print(f"Question: {query}")
        #print(f"Retrieved Context: {retrieved_context}")
        print(f"Generated Answer: {generated_answer}")
        print("-" * 50)

        results.append({
            "Question": query,
            "Generated Answer": generated_answer
        })

    # Save results to a JSON file
    with open(filename, "w") as f:
        import json
        json.dump(results, f, indent=4)

### Testing Llama 3.2 response

In [None]:
store_answers(q, "llama_answers.json", tokenizer, lm_model) #and display

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is Natural Language Processing (NLP)?
Generated Answer: Natural Language Processing (NLP) is a subfield of artificial intelligence (AI) that deals with the interaction between computers and humans in natural language. It involves the development of algorithms and statistical models that enable computers to process, understand, and generate natural language data. NLP is a crucial component of many applications, including speech recognition, language translation, sentiment analysis, and text summarization.

Common elements of NLP:

1.  **Tokenization**: Breaking down text into individual words or tokens.
2.  **Stopword removal**: Removing common words like "the," "and," and "a" that do not add much value to the meaning of the text.
3.  **Stemming or Lemmatization**: Reducing words to their base form (e.g., "running" becomes "run").
4.  **Part-of-speech tagging**: Identifying the grammatical category of each word (e.g., noun, verb, adjective).
5.  **Named entity recognition

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Explain the difference between NLP and text mining.
Generated Answer: NLP (Natural Language Processing) is a subfield of artificial intelligence that deals with the interaction between computers and humans in natural language. It involves the development of algorithms and statistical models that enable computers to process, understand, and generate natural language data. Text mining, on the other hand, is a subset of NLP that focuses specifically on extracting insights and patterns from large volumes of unstructured text data. In other words, NLP is a broader field that encompasses text mining, among other tasks such as sentiment analysis, language translation, and speech recognition.

Key Points:

*   NLP is a subfield of AI that deals with human-computer interaction in natural language.
*   Text mining is a subset of NLP that focuses on extracting insights from unstructured text data.
*   NLP encompasses a range of tasks beyond text mining, including sentiment analysis, lan

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are stop words, and why are they removed in NLP?
Generated Answer: Stop words are common words like “and”, “is”, “a”, “an”, “the” that do not add much value to the meaning of a sentence. They are removed in NLP to reduce the dimensionality of the text data and improve the performance of NLP tasks like text classification, sentiment analysis, and information retrieval.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is tokenization in NLP?
Generated Answer: Tokenization in NLP refers to the process of breaking down text into individual units called tokens. These tokens can be words, subwords, or even characters, depending on the specific approach used. Tokenization is a fundamental step in natural language processing (NLP) tasks, such as text classification, sentiment analysis, and language modeling.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How can I prepare for a Natural Language Processing (NLP) interview?
Generated Answer: To prepare for a Natural Language Processing (NLP) interview, focus on the following areas:

* Review the fundamentals of NLP, including text processing, sentiment analysis, and machine learning algorithms.
* Familiarize yourself with popular NLP libraries and tools, such as NLTK, spaCy, and Stanford CoreNLP.
* Practice solving common NLP problems, such as named entity recognition, part-of-speech tagging, and text classification.
* Study real-world applications of NLP, including chatbots, sentiment analysis, and language translation.
* Brush up on your programming skills, particularly in languages like Python, Java, or C++.

By focusing on these areas, you'll be well-prepared to tackle NLP-related questions and demonstrate your expertise to potential employers.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Which technique can be used for noun phrase detection in NLP?
Generated Answer: a. Part of speech tagging
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the purpose of Part-of-Speech (POS) tagging in NLP?
Generated Answer: The primary purpose of Part-of-Speech (POS) tagging in Natural Language Processing (NLP) is to identify the grammatical category of each word in a sentence, such as noun, verb, adjective, adverb, etc. This helps in understanding the meaning and context of the sentence, and is a crucial step in many NLP tasks, including text classification, sentiment analysis, and machine translation.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does named entity recognition (NER) work, and what are its applications?
Generated Answer: Named entity recognition (NER) is a subtask of natural language processing (NLP) that involves identifying and categorizing named entities in unstructured text into predefined categories, such as person, organization, location, and date. NER is important because it enables machines to extract relevant information from text data, which can be used for various applications, such as information retrieval, text summarization, and sentiment analysis.

To approach the task of NER using machine learning, one would typically follow these steps:

1.  **Data Collection**: Gather a large dataset of labeled text examples, where each example includes the text and the corresponding entity types (e.g., person, organization, location).
2.  **Data Preprocessing**: Preprocess the text data by tokenizing it, removing stop words, and converting all text to lowercase.
3.  **Feature Extraction**: Extract

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Define Dependency Parsing and its importance in NLP.
Generated Answer: Dependency Parsing is a type of syntactic parsing in Natural Language Processing (NLP) that analyzes the grammatical structure of a sentence by identifying the relationships between words. It represents the sentence as a graph, where each word is a node, and the relationships between words are edges. The goal of Dependency Parsing is to identify the grammatical dependencies between words, such as subject-verb-object relationships, and to represent the sentence in a way that captures the underlying syntactic structure.

Importance in NLP: Dependency Parsing is important in NLP because it can be used for a variety of tasks, such as:

* Part-of-speech tagging
* Named entity recognition
* Sentiment analysis
* Machine translation
* Question answering

By analyzing the grammatical structure of a sentence, Dependency Parsing can provide insights into the meaning of the sentence and the relationships between words

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the Bag-of-Words (BoW) model?
Generated Answer: The Bag-of-Words (BoW) model is a simple and widely used statistical model in Natural Language Processing (NLP) that represents text as a bag, or a set, of its word frequencies, without considering the order or context of the words. This model is useful for tasks such as text classification, topic modeling, and information retrieval, as it allows for efficient and compact representation of text data.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How is TF-IDF used for text representation in NLP?
Generated Answer: TF-IDF is a technique used for text representation in NLP that weighs the importance of each word in a document based on its frequency and rarity across the entire corpus. It helps to reduce the impact of common words and focuses on the most informative words, leading to better performance in text classification, clustering, and information retrieval tasks.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is word embedding, and how is it different from one-hot encoding?
Generated Answer: A word embedding is a way to represent words as vectors in a high-dimensional space, where semantically similar words are mapped to nearby points. This is different from one-hot encoding, which represents each word as a binary vector where only one element is 1, and the rest are 0. Word embeddings capture multiple dimensions of data and are represented as vectors, whereas one-hot encoding is a simple binary representation.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Explain the Skip-Gram model in Word2Vec.
Generated Answer: The Skip-Gram model predicts the context words that are most likely to appear in the same sentence as the target word.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does Transfer Learning improve NLP model performance?
Generated Answer: Transfer learning improves NLP model performance by leveraging pre-trained models on large datasets, allowing the model to learn general features and representations that can be fine-tuned for specific NLP tasks. This approach reduces the need for large amounts of labeled data and accelerates the training process.

Example: BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language model that has achieved state-of-the-art results in various NLP tasks. By fine-tuning BERT on a specific task, such as sentiment analysis or question answering, the model can adapt to the task-specific requirements and improve its performance.

Evaluation of NLP model performance can be done using metrics such as:

* Accuracy
* F1-score
* Precision
* Recall
* ROUGE score (for text generation tasks)
* BLEU score (for machine translation tasks)
* Perplexity (for language modeling tasks)

These me

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are the advantages of using pre-trained models in NLP?
Generated Answer: Pre-trained models in NLP have several advantages, including:

* Improved performance on downstream tasks
* Reduced training time and data requirements
* Better handling of out-of-vocabulary words
* Ability to capture complex patterns and relationships in language

These advantages make pre-trained models a popular choice in many NLP applications.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does a Transformer model work in NLP?
Generated Answer: A Transformer model works in NLP by using self-attention mechanisms to weigh the importance of different words in a sentence relative to each other, allowing the model to capture long-range dependencies and contextual relationships between words. This is achieved through a multi-head attention mechanism, where the model attends to all positions in the input sequence simultaneously, and a feed-forward neural network (FFNN) to transform the output. The self-attention mechanism allows the model to weigh the importance of different words in a sentence relative to each other, allowing the model to capture long-range dependencies and contextual relationships between words. This is achieved through a multi-head attention mechanism, where the model attends to all positions in the input sequence simultaneously, and a feed-forward neural network (FFNN) to transform the output. The self-attention mechanism allows the model to w

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the significance of the attention mechanism in NLP models?
Generated Answer: The attention mechanism in NLP models allows the model to focus on specific parts of the input data that are relevant for the task at hand, rather than processing the entire input simultaneously. This enables the model to selectively weigh the importance of different words or phrases in a sentence, improving its ability to capture nuanced semantic relationships and context-dependent information. By doing so, the attention mechanism enhances the model's performance on tasks such as machine translation, question answering, and text summarization.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How do BERT and GPT differ in their approach to NLP tasks?
Generated Answer: BERT and GPT differ in their approach to NLP tasks. BERT uses a pre-training approach to learn contextualized representations of words, while GPT uses a self-supervised learning approach to predict the next word in a sequence. BERT is typically used for question-asking and text classification tasks, while GPT is used for language generation and text-to-text tasks. BERT and GPT differ in their approach to NLP tasks. BERT uses a pre-training approach to learn contextualized representations of words, while GPT uses a self-supervised learning approach to predict the next word in a sequence. BERT is typically used for question-asking and text classification tasks, while GPT is used for language generation and text-to-text tasks.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is a language model, and why is it important in NLP?
Generated Answer: A language model is a statistical model that predicts the next word in a sequence of text based on the context of the previous words. It is a type of Bag-of-Words model, which represents text as a collection of words and their frequencies. Language models are important in NLP because they enable computers to understand and generate human-like language, making them useful for tasks such as language translation, text summarization, and chatbots.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: Explain the concept of fine-tuning a pre-trained model for NLP tasks.
Generated Answer: Fine-tuning a pre-trained model for NLP tasks involves adjusting the model's parameters to better suit a specific task or dataset. This is done by adding a few layers on top of the pre-trained model and training the entire model from scratch on the new task. The goal is to leverage the pre-trained model's knowledge and adapt it to the new task, resulting in improved performance.

Would you like me to expand on this answer?

Yes, I'd be happy to provide a more detailed explanation.

Here's a more detailed explanation:

Fine-tuning a pre-trained model for NLP tasks involves several key steps:

1. **Choosing a pre-trained model**: Select a pre-trained model that has been trained on a large corpus of text data, such as BERT, RoBERTa, or XLNet. These models have learned to represent language in a way that is useful for a wide range of NLP tasks.
2. **Adding a few layers on top**: Add a few laye

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How is sentiment analysis performed using NLP techniques?
Generated Answer: Sentiment analysis is performed using NLP techniques by analyzing the linguistic features of text data, such as part-of-speech tags, named entities, and sentiment lexicons, to determine the emotional tone or attitude conveyed by the text. Common NLP techniques used for sentiment analysis include:

1.  **Text Preprocessing**: Cleaning and normalizing the text data to remove noise and irrelevant information.
2.  **Tokenization**: Breaking down the text into individual words or tokens.
3.  **Part-of-Speech (POS) Tagging**: Identifying the grammatical category of each word (e.g., noun, verb, adjective).
4.  **Named Entity Recognition (NER)**: Identifying specific entities mentioned in the text (e.g., names, locations).
5.  **Sentiment Lexicons**: Using pre-trained models or dictionaries to map words to their corresponding sentiment scores.
6.  **Machine Learning**: Training machine learning models on labe

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are some real-world applications of NLP in healthcare?
Generated Answer: Some real-world applications of NLP in healthcare include:

1. **Clinical Decision Support Systems**: NLP is used to analyze patient data, medical literature, and clinical guidelines to provide healthcare professionals with relevant information to make informed decisions.
2. **Medical Text Analysis**: NLP is used to analyze large amounts of medical text data, such as patient notes, medical records, and research papers, to extract relevant information and identify patterns.
3. **Chatbots and Virtual Assistants**: NLP is used to power chatbots and virtual assistants that can understand and respond to patient queries, providing 24/7 support and improving patient engagement.
4. **Sentiment Analysis**: NLP is used to analyze patient feedback and sentiment, helping healthcare organizations to identify areas for improvement and improve patient satisfaction.
5. **Disease Diagnosis**: NLP is used to analyze 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does machine translation work in NLP?
Generated Answer: Machine translation is a subfield of natural language processing (NLP) that uses algorithms and statistical models to translate text from one language to another. The process typically involves the following steps:

1. Tokenization: Breaking down text into individual words or tokens.
2. Part-of-speech tagging: Identifying the grammatical category of each word (e.g., noun, verb, adjective).
3. Dependency parsing: Analyzing the grammatical structure of the sentence.
4. Semantic role labeling: Identifying the roles played by entities in the sentence (e.g., "Who" did "what" to "whom").
5. Machine translation: Using statistical models to generate a translation of the input text.
6. Post-processing: Refining the translation to improve its quality and fluency.

Machine translation can be performed using various techniques, including:

* Statistical machine translation (SMT)
* Neural machine translation (NMT)
* Rule-based ma

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is text summarization, and how can it be implemented in NLP?
Generated Answer: Text summarization is the process of extracting the most important information from a large piece of text and condensing it into a shorter summary. There are two main types of text summarization:

1. **Extractive Summarization**: This type of summarization involves selecting the most important sentences or phrases from the original text and combining them into a summary.
2. **Abstractive Summarization**: This type of summarization involves generating a new summary that is not a direct copy of the original text, but rather a condensed version that captures the main ideas and key points.

Both types of summarization can be implemented in Natural Language Processing (NLP) using various techniques such as machine learning, deep learning, and rule-based approaches.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does question answering work in NLP systems?
Generated Answer: In NLP systems, question answering (QA) typically involves the following steps:

1. **Text Preprocessing**: Cleaning and normalizing the input text to remove noise and irrelevant information.
2. **Question Classification**: Identifying the type of question (e.g., open-ended, multiple-choice) to determine the best approach for answering.
3. **Knowledge Retrieval**: Searching a knowledge base or database to find relevant information related to the question.
4. **Answer Generation**: Using the retrieved information to generate a response to the question.
5. **Post-processing**: Refining the answer through spell-checking, grammar-checking, and fluency evaluation.

This process enables NLP systems to provide accurate and relevant answers to user queries.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are some common challenges in text preprocessing for NLP?
Generated Answer: Some common challenges in text preprocessing for NLP include:

1.  **Handling Out-of-Vocabulary (OOV) words**: Words that are not present in the training data.
2.  **Removing stop words**: Common words like "the", "and", etc. that do not add much value to the meaning of the text.
3.  **Dealing with punctuation and special characters**: Punctuation marks and special characters that can affect the meaning of the text.
4.  **Removing noise and irrelevant data**: Removing irrelevant or noisy data that can affect the accuracy of the model.
5.  **Scaling and normalization**: Scaling and normalizing the data to ensure that all features are on the same scale.
6.  **Handling missing values**: Dealing with missing values in the data.
7.  **Text normalization**: Normalizing the text to a standard format, such as converting all text to lowercase.
8.  **Tokenization**: Tokenizing the text into individual word

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does ambiguity in human language affect NLP systems?
Generated Answer: Ambiguity in human language can significantly impact NLP systems, as it can lead to misinterpretation and incorrect processing of text. For example, homophones (words with the same pronunciation but different meanings) and homographs (words with the same spelling but different meanings) can cause confusion. Additionally, context-dependent words and idioms can be challenging for NLP systems to understand. To mitigate these issues, NLP systems can employ techniques such as part-of-speech tagging, named entity recognition, and semantic role labeling to disambiguate language and improve accuracy.
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are the limitations of current NLP models?
Generated Answer: Current NLP models face several limitations, including:

1.  **Lack of common sense**: NLP models often struggle to understand the nuances of human language, such as idioms, sarcasm, and figurative language.
2.  **Limited contextual understanding**: NLP models may not fully comprehend the context in which a piece of text is being used, leading to misinterpretations.
3.  **Inability to generalize**: NLP models are often trained on specific datasets and may not generalize well to new, unseen data.
4.  **Vulnerability to bias**: NLP models can perpetuate biases present in the training data, leading to unfair or discriminatory outcomes.
5.  **Inadequate handling of ambiguity**: NLP models may struggle to handle ambiguous or unclear language, leading to incorrect interpretations.

These limitations highlight the need for continued research and development in NLP to improve the accuracy and reliability of NLP models.

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How does NLP handle multilingual text?
Generated Answer: ## Step 1: Understanding NLP Basics
NLP (Natural Language Processing) is a subfield of AI that deals with the interaction between computers and humans in natural language. It involves tasks such as text processing, sentiment analysis, and machine translation.

## Step 2: Handling Multilingual Text
To handle multilingual text in NLP, several approaches can be employed. These include:

- **Tokenization**: Breaking down text into individual words or tokens, regardless of language.
- **Stemming or Lemmatization**: Reducing words to their base form to facilitate comparison across languages.
- **Machine Translation**: Using machine learning models to translate text from one language to another.
- **Language Detection**: Identifying the language of the text to apply the most suitable processing techniques.
- **Multilingual Models**: Training models that can handle multiple languages simultaneously.

## Step 3: Key Consideratio

### Testing Flan-T5 response

In [None]:
store_answers(q, "T5_answers.json",flan_tokenizer, flan_model)

Question: What is Natural Language Processing (NLP)?
Generated Answer: Natural Language Processing (NLP) is a branch of computer science that deals with the processing of natural language.
--------------------------------------------------
Question: Explain the difference between NLP and text mining.
Generated Answer: Natural Language Processing (NLP) is a branch of computer science that focuses on the processing of natural language.
--------------------------------------------------
Question: What are stop words, and why are they removed in NLP?
Generated Answer: In NLP, The process of removing words like “and”, “is”, “a”, “an”, “the” from a sentence is called as a. Stemming b. Lemmatization c. Stop word d.
--------------------------------------------------
Question: What is tokenization in NLP?
Generated Answer: Tokenization is the process of separating a set of words from a set of words.
--------------------------------------------------
Question: How can I prepare for a Natural Lan

In [7]:
!pip install evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=15781feef968465afb914cd80b559167ad4ecefefbe7e32d29363f9573fb1204
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [8]:
import json
import evaluate

def load_answers_from_file(filepath):

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    answers = [item["Generated Answer"] for item in data]
    return answers

# File paths for reference and model answer files
reference_file = ('/content/drive/My Drive/references.json')
llama_file = ('/content/drive/My Drive/llama_answers.json')
t5_file = ('/content/drive/My Drive/T5_answers.json')

# Load the answers
reference_answers = load_answers_from_file(reference_file)
llama_answers = load_answers_from_file(llama_file)
t5_answers = load_answers_from_file(t5_file)

# Validate that all lists have the same length
if not (len(reference_answers) == len(llama_answers) == len(t5_answers)):
    raise ValueError("The number of answers in reference, llama, and t5 files must match.")

# Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Prepare references for BLEU (as BLEU expects a list of lists for references)
references_for_bleu = [[ref] for ref in reference_answers]

# Compute BLEU for each model
bleu_llama = bleu.compute(predictions=llama_answers, references=references_for_bleu)
bleu_t5 = bleu.compute(predictions=t5_answers, references=references_for_bleu)

# Compute ROUGE for each model
rouge_llama = rouge.compute(predictions=llama_answers, references=reference_answers)
rouge_t5 = rouge.compute(predictions=t5_answers, references=reference_answers)

# Print the results
print("=== Model Evaluation Results ===\n")

print("--- BLEU Scores ---")
print(f"LLaMA BLEU: {bleu_llama}")
print(f"T5 BLEU: {bleu_t5}")

print("\n--- ROUGE Scores ---")
print("LLaMA ROUGE:", rouge_llama)
print("T5 ROUGE:", rouge_t5)

# Simple summary comparison based on BLEU and ROUGE-L scores
bleu_llama_score = bleu_llama["bleu"]
bleu_t5_score = bleu_t5["bleu"]

rouge_llama_l = rouge_llama["rougeL"]
rouge_t5_l = rouge_t5["rougeL"]

print("\n--- Summary ---")
if bleu_llama_score > bleu_t5_score:
    print("LLaMA performed better on BLEU.")
else:
    print("T5 performed better on BLEU.")

if rouge_llama_l > rouge_t5_l:
    print("LLaMA performed better on ROUGE-L.")
else:
    print("T5 performed better on ROUGE-L.")

=== Model Evaluation Results ===

--- BLEU Scores ---
LLaMA BLEU: {'bleu': 0.030191903818260386, 'precisions': [0.12669100279149667, 0.03911821914847633, 0.01740265390472047, 0.00963433326034596], 'brevity_penalty': 1.0, 'length_ratio': 4.583661417322834, 'translation_length': 4657, 'reference_length': 1016}
T5 BLEU: {'bleu': 0.021804534750438788, 'precisions': [0.33405639913232105, 0.08120649651972157, 0.04239401496259352, 0.02425876010781671], 'brevity_penalty': 0.3000204754021258, 'length_ratio': 0.453740157480315, 'translation_length': 461, 'reference_length': 1016}

--- ROUGE Scores ---
LLaMA ROUGE: {'rouge1': 0.24456696186268784, 'rouge2': 0.08778528539099956, 'rougeL': 0.18828350598392585, 'rougeLsum': 0.2019726109482999}
T5 ROUGE: {'rouge1': 0.2134161650600847, 'rouge2': 0.075212847308192, 'rougeL': 0.17982557010379158, 'rougeLsum': 0.17963872543855225}

--- Summary ---
LLaMA performed better on BLEU.
LLaMA performed better on ROUGE-L.


## Test your knowledge

In [None]:
import random

def get_random_mcq_question(qa_df_combined):
    # Filter rows that are likely to be MCQs (contain "a.", "b.", etc.)
    mcq_questions = qa_df_combined[qa_df_combined['Question'].str.contains(r'\ba\.', regex=True, na=False)]

    if mcq_questions.empty:
        return "No MCQ questions found in the corpus."

    # Randomly select one MCQ
    mcq = mcq_questions.sample(n=1).iloc[0]
    question = mcq['Question']
    answer = mcq['Answer']

    return question, answer

# Example usage
question, answer = get_random_mcq_question(qa_df_combined)
print(f"Question: {question}")  # Displays the question

Question: In NLP, The process of converting a sentence or paragraph into tokens is 
referred to as Stemming 
a. True 
b. False


In [None]:
# Display the answer
answer

'b) The statement describes the process of tokenization and not stemming, hence it  is \nFalse.'

In [None]:
# Test model
query = "What are stop words, and why are they removed in NLP?"
retrieved_context = retrieve_context(query, index, questions, embedding_model)
generated_answer = generate_answer(query, retrieved_context, tokenizer, lm_model)

print(f"Question: {query}")
print(f"Retrieved Context: {retrieved_context}")
print("-" * 80)
print(f"Generated Answer: {generated_answer}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are stop words, and why are they removed in NLP?
Retrieved Context: ['What are stop words, and why are they removed in text processing?', 'What are stop words?', 'In NLP, The process of removing words like “and”, “is”, “a”, “an”, “the” from \na sentence is called as \na. Stemming \nb. Lemmatization \nc. Stop word \nd. All of the above']
--------------------------------------------------------------------------------
Generated Answer: Stop words are common words like “and”, “is”, “a”, “an”, “the” that do not add much value to the meaning of a sentence. They are removed in NLP to reduce the dimensionality of the text data and improve the performance of NLP tasks like text classification, sentiment analysis, and information retrieval.


## UI using Gradio

In [None]:
!pip install gradio
import gradio as gr

def chatbot(model_choice, query):
    if model_choice == "Llama_3.2":
        retrieved_context = retrieve_context(query, index, questions, embedding_model)
        return generate_answer(query, retrieved_context, tokenizer, lm_model)
    elif model_choice == "Flan-T5":
        retrieved_context = retrieve_context(query, index, questions, embedding_model)
        return generate_answer(query, retrieved_context, flan_tokenizer, flan_model)

# Define the Gradio UI
def gradio_ui():
    model_choices = ["Llama_3.2", "Flan-T5"]

    with gr.Blocks() as demo:
        gr.Markdown("# NLP Interview Assistant")

        with gr.Row():
            model_selector = gr.Dropdown(choices=model_choices, label="Select Model", value="Llama_3.2")

        query_input = gr.Textbox(label="Enter your query", placeholder="Type your query here...", lines=2)

        output_box = gr.Textbox(label="Model Output", placeholder="Response will appear here", lines=4)

        submit_button = gr.Button("Submit")

        # Link the chatbot function to the Gradio UI
        submit_button.click(chatbot, inputs=[model_selector, query_input], outputs=output_box)

    return demo

# Create the Gradio UI
demo = gradio_ui()

# Launch the UI
demo.launch(share=True)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on local URL:  http://127.0.0.1:7871
* Running on public URL: https://598e98d5bb0023c320.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
