In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
OPENAI_API_KEY = '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'

In [24]:
import pandas as pd
import numpy as np

### PART - 1 : DATA COLLECTION


In [None]:
!pip install gdelt

Collecting gdelt
  Downloading gdelt-0.1.14-py2.py3-none-any.whl (787 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.4/787.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gdelt
Successfully installed gdelt-0.1.14


In [None]:
from gdelt import gdelt
import pandas as pd
from datetime import datetime, timedelta

# Initialize the gdelt class
g = gdelt()

# Define the date range
start_date = datetime(2024, 3, 1)
end_date = datetime(2024, 3, 10)
date_generated = [start_date + timedelta(days=x) for x in range(0, (end_date-start_date).days + 1)]

# Fetch the event data and concatenate into a single DataFrame
list_df = []
for date in date_generated:
    print(f"Downloading data for {date.strftime('%Y-%m-%d')}")
    try:
        events = g.Search(date.strftime('%Y %m %d'), table='events', coverage=True)
        list_df.append(events)
    except Exception as e:
        print(f"An error occurred: {e}")

combined_df = pd.concat(list_df, ignore_index=True)

# Save to CSV
combined_df.to_csv('/content/drive/MyDrive/NLP/gdelt_events_20240301_20240310.csv', index=False)

print("All data has been successfully downloaded and saved.")


ModuleNotFoundError: No module named 'gdelt'

In [None]:
combined_df.drop_duplicates(subset=['GLOBALEVENTID'], inplace=True)
combined_df.drop_duplicates(subset=['SOURCEURL'], inplace=True)

In [None]:
columns_of_interest = [
    'GLOBALEVENTID', 'SQLDATE', 'AvgTone', 'GoldsteinScale',
    'NumMentions', 'NumSources', 'NumArticles', 'SOURCEURL'
]

filtered_df = combined_df[columns_of_interest].copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Select only the relevant columns for scaling
columns_to_scale = ['NumMentions', 'NumSources', 'NumArticles']
scaler = MinMaxScaler()

# Apply MinMaxScaler to the relevant columns
scaled_values = scaler.fit_transform(filtered_df[columns_to_scale])

# Create a DataFrame with the scaled values
scaled_df = pd.DataFrame(scaled_values, columns=columns_to_scale)

# Calculate the absolute values of the GoldsteinScale
filtered_df['AbsGoldsteinScale'] = filtered_df['GoldsteinScale'].abs()

# Multiply scaled values by the absolute GoldsteinScale values
for column in columns_to_scale:
    filtered_df[f'Weighted{column}'] = filtered_df['AbsGoldsteinScale'] * scaled_df[column]

# Optionally, create a single representation by summing the weighted values
filtered_df['Importance'] = filtered_df[[f'Weighted{column}' for column in columns_to_scale]].sum(axis=1)


In [None]:
important_events = filtered_df[filtered_df['Importance'] > 0].copy()
important_events = important_events.sort_values(by='Importance', ascending=False)
important_events.drop(columns=['AbsGoldsteinScale', 'WeightedNumMentions', 'WeightedNumSources', 'WeightedNumArticles', 'ImportanceScore'], inplace=True)

In [None]:
# Select the top 300 most important events
top_300_important_events = important_events.head(300)

# Specify the file path where you want to save the CSV
file_path = '/content/drive/MyDrive/NLP/top_300_important_events.csv'

# Save the DataFrame to a CSV file
top_300_important_events.to_csv(file_path, index=False)

In [None]:
import os

# List of the text file paths
file_paths = [
    '/content/drive/MyDrive/NLP/Output151_200.txt',
    '/content/drive/MyDrive/NLP/Output201_250.txt',
    '/content/drive/MyDrive/NLP/Output251_300.txt'
]

# Path for the new concatenated file
concatenated_file_path = '/content/drive/MyDrive/NLP/FinalData_150_300.txt'

# Concatenating the content of all files into one text file
with open(concatenated_file_path, 'w') as concatenated_file:
    for file_path in file_paths:
        if os.path.exists(file_path):
            with open(file_path, 'r') as read_file:
                concatenated_file.write(read_file.read() + '\n\n')

# Return the path of the new concatenated file
concatenated_file_path

--------------------------------------------


## PART - 2 : PREPROCESS TEXT

#### ***`Part 2(a) - Parse txt file and convert into a csv file`***

In [None]:
import pandas as pd
import numpy as np

In [None]:
import csv

def parse_and_convert_to_csv(input_txt_file, output_csv_file):
    # Open the input text file and the output CSV file
    with open('/content/drive/MyDrive/NLP/scraped_data.txt', 'r', encoding='utf-8') as infile, \
            open('/content/drive/MyDrive/NLP/scraped_data.csv', 'w', newline='', encoding='utf-8') as outfile:

        # Create a CSV writer object
        csv_writer = csv.writer(outfile)
        # Write the header row to the CSV file
        csv_writer.writerow(['news-topic', 'news'])

        # Initialize variables to hold the current news topic and news text
        current_topic = None
        current_news = []

        # Read the input file line by line
        for line in infile:
            # Check for the news-topic delimiter
            if line.strip() == '###':  # End of a news block
                # Write the current news topic and news text to the CSV, if any
                if current_topic is not None and current_news:
                    csv_writer.writerow([current_topic, ' '.join(current_news)])
                    # Reset the variables for the next news block
                    current_topic = None
                    current_news = []
            elif current_topic is None:  # This line is a news-topic
                current_topic = line.strip()
            else:  # This line is part of the news
                current_news.append(line.strip())

        # Check if the last news item is processed
        if current_topic is not None and current_news:
            csv_writer.writerow([current_topic, ' '.join(current_news)])

# Example usage
input_txt_file = 'path_to_your_input_text_file.txt'
output_csv_file = 'output_news.csv'
parse_and_convert_to_csv(input_txt_file, output_csv_file)

#### ***`Part 2(b) - NER and POST tagging on Text - Using Spacy`***

In [None]:
#Import suitable packages for Spacy
!pip install spacy
!python -m spacy download en_core_web_lg

In [None]:
# Example Text :
text_content = """
meghalaya: suspected smuggler killed in bsf firing near indo-bangla border
a suspected sugar smuggler was shot dead by security forces in meghalaya near the indo-bangladesh border, with his accomplice injured in the firing.
shillong: the security forces in meghalaya allegedly shot dead a suspected sugar smuggler while his accomplice was injured in firing near the indo-bangaldesh international border, officials said on saturday.
officials said that one asen m marak, in his mid-forties, was killed in the firing from bsf meghalaya frontier on march 1.
the identity and whereabouts of his accomplice, who is believed to have been hit in the leg, are being ascertained by both police and bsf currently, said officials.
confirming the incident, sohra sub-divisional officer (civil) salon verma said the matter is under inquiry and all formalities are being followed. “i will personally conduct the inquest after the investigating team returns to the base with all relevant information gathered. the postmortem will be conducted here in sohra, and thereafter, the law will take its course towards a logical conclusion,” she told ht over the phone from sohra.
officials said that the incident took place at dalia village near shella in meghalaya’s east khasi hills district at around 8pm, when a bsf posse on duty came upon a group of people allegedly trying to smuggle sugar, onion, and other materials that fetch a hefty price in bangladesh.
sharing preliminary details of the incident, a senior bsf officer said though such acts occur frequently, it was different this time as about 300 people from both sides of the international border converged at the spot and confronted the bsf team, which after resorting to all legal measures to contain the crowd, had to resort to two rounds of firing resulting in the death of marak.
“almost 300 people from both sides of the border converged at the site to confront the bsf personnel. while a group engaged with the bsf personnel, another group quickly took the head load of contraband to remove any evidence of smuggling along with the person who was alive then, to the village where he succumbed to his injuries,” the officer, who did not wish to be named, told ht over the phone.
the officer elaborated, “the boys initially used all non-lethal measures such as chilli grenade, stun grenade, and even pag (pump action gun) to quell the growing aggressive crowd. but they (the crowd) attacked the team with machetes, knives and other weapons.”
however, it did not work as the crowd got out of control and charged the bsf team with weapons, said the officer. the team initially fired once in the air and subsequently fired again, resulting in the injury of two persons, he added.
“as per convention, unlike the western border, there is a treaty wherein, non-lethal measures will be first adopted in cases of border confrontation or skirmishes and the boys had to take to the last resort to save themselves,” the officer added.
"""
# Simple concatenation into a single string
news_article_str = " ".join(text_content.splitlines())
print(news_article_str)

In [None]:
import spacy

# Load English tokenizer, tagger, parser, NER, and word vectors
spacy_module = spacy.load("en_core_web_lg")

# Load news article into spacy module
doc = spacy_module(news_article_str)

# Iterate over the predicted entities
for ent in doc.ents:
    print(ent.text, ent.label_)

# Visualize the entities
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

 meghalaya: suspected smuggler killed in bsf firing near indo-bangla border a suspected sugar smuggler was shot dead by security forces in meghalaya near the indo-bangladesh border, with his accomplice injured in the firing. shillong: the security forces in meghalaya allegedly shot dead a suspected sugar smuggler while his accomplice was injured in firing near the indo-bangaldesh international border, officials said on saturday. officials said that one asen m marak, in his mid-forties, was killed in the firing from bsf meghalaya frontier on march 1. the identity and whereabouts of his accomplice, who is believed to have been hit in the leg, are being ascertained by both police and bsf currently, said officials. confirming the incident, sohra sub-divisional officer (civil) salon verma said the matter is under inquiry and all formalities are being followed. “i will personally conduct the inquest after the investigating team returns to the base with all relevant information gathered. the 

In [None]:
#Load the NER Transformer based module.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Initialize the NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Perform NER
ner_results = ner_pipeline(news_article_str)

# Print the results
for entity in ner_results:
    print(entity)
    print(f"Entity: {entity['entity']}, Word: {entity['word']}, Score: {entity['score']}")

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (if not already done)
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

# Tokenize the text
tokens = word_tokenize(news_article_str)

# Perform POS tagging
tagged_tokens = nltk.pos_tag(tokens)

print(tagged_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('meghalaya', 'NN'), (':', ':'), ('suspected', 'VBN'), ('smuggler', 'NN'), ('killed', 'VBN'), ('in', 'IN'), ('bsf', 'NN'), ('firing', 'NN'), ('near', 'IN'), ('indo-bangla', 'JJ'), ('border', 'NN'), ('a', 'DT'), ('suspected', 'JJ'), ('sugar', 'NN'), ('smuggler', 'NN'), ('was', 'VBD'), ('shot', 'VBN'), ('dead', 'JJ'), ('by', 'IN'), ('security', 'NN'), ('forces', 'NNS'), ('in', 'IN'), ('meghalaya', 'NN'), ('near', 'IN'), ('the', 'DT'), ('indo-bangladesh', 'JJ'), ('border', 'NN'), (',', ','), ('with', 'IN'), ('his', 'PRP$'), ('accomplice', 'NN'), ('injured', 'VBN'), ('in', 'IN'), ('the', 'DT'), ('firing', 'NN'), ('.', '.'), ('shillong', 'NN'), (':', ':'), ('the', 'DT'), ('security', 'NN'), ('forces', 'NNS'), ('in', 'IN'), ('meghalaya', 'NN'), ('allegedly', 'RB'), ('shot', 'JJ'), ('dead', 'VBP'), ('a', 'DT'), ('suspected', 'JJ'), ('sugar', 'NN'), ('smuggler', 'NN'), ('while', 'IN'), ('his', 'PRP$'), ('accomplice', 'NN'), ('was', 'VBD'), ('injured', 'VBN'), ('in', 'IN'), ('firing', 'VBG'), 

In [None]:
# Process the text
doc = spacy_module(news_article_str)

# POS tagging
for token in doc:
    print(f"{token.text}: {token.pos_}")


In [None]:
def extract_entities_and_relationships(text):
    doc = spacy_module(text)

    knowledge_graph = {
        'entities': [],
        'relationships': []
    }

    for ent in doc.ents:
        knowledge_graph['entities'].append({
            'text': ent.text,
            'label': ent.label_
        })

    # Relationship extraction (example)
    for token in doc:
        if token.dep_ == 'pobj' and token.head.pos_ == 'VERB':
            relation = {
                'subject': token.head.text,
                'predicate': 'ACTED_ON',  # Placeholder, you'll refine this
                'object': token.text
            }
            knowledge_graph['relationships'].append(relation)

    return knowledge_graph

{'entities': [{'text': 'shillong', 'label': 'PERSON'}, {'text': 'saturday', 'label': 'DATE'}, {'text': 'one', 'label': 'CARDINAL'}, {'text': 'march 1', 'label': 'DATE'}, {'text': 'dalia', 'label': 'GPE'}, {'text': 'shella', 'label': 'GPE'}, {'text': 'around 8pm', 'label': 'TIME'}, {'text': 'bangladesh', 'label': 'GPE'}, {'text': 'about 300', 'label': 'CARDINAL'}, {'text': 'two', 'label': 'CARDINAL'}, {'text': 'marak', 'label': 'PERSON'}, {'text': 'almost 300', 'label': 'CARDINAL'}, {'text': 'pag', 'label': 'GPE'}, {'text': 'two', 'label': 'CARDINAL'}, {'text': 'first', 'label': 'ORDINAL'}], 'relationships': []}


***Trying spacy on the real world data.***

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/NLP/scraped_data.csv')

***Generating column : ner-tags***

In [None]:
def get_ner_tags(text):
    """Function to return NER tags from a text block."""
    text_concat = " ".join(text.splitlines())
    doc = spacy_module(text_concat)
    # Extract entities and labels
    return [(ent.text, ent.label_) for ent in doc.ents]

# Apply the NER tagging function to the news column
dataset['ner_tags'] = dataset['news'].apply(lambda x: get_ner_tags(x))

NameError: name 'spacy_module' is not defined

***Generating column : pos-tags***

In [None]:
def get_pos_tags(text):
    """Function to return POS tags from a text block."""
    text_concat = " ".join(text.splitlines())
    doc = spacy_module(text_concat)
    # Extract entities and labels
    return [(token.text, token.pos_) for token in doc]

# Apply the NER tagging function to the news column
dataset['pos_tags'] = dataset['news'].apply(lambda x: get_pos_tags(x))

NameError: name 'spacy_module' is not defined

## PART - 3 : BUILD LLM DATA LOADER

In [2]:
records_only_csv_path = '/content/drive/MyDrive/NLP/records_only.csv'
ner_tags_csv_path = '/content/drive/MyDrive/NLP/ner_tags.csv'
pos_tags_csv_path = '/content/drive/MyDrive/NLP/pos_tags.csv'
all_tags_json_path = '/content/drive/MyDrive/NLP/all_tags.csv'

def get_records_only_csv():
    selected_columns = ['news-topic', 'news']
    df_selected = dataset[selected_columns]
    df_selected.to_csv(records_only_csv_path, index=False)

def get_ner_csv():
    selected_columns = ['news-topic', 'news', 'ner_tags']
    df_selected = dataset[selected_columns]
    df_selected.to_csv(ner_tags_csv_path, index=False)

def get_pos_csv():
    selected_columns = ['news-topic', 'news', 'pos_tags']
    df_selected = dataset[selected_columns]
    df_selected.to_csv(pos_tags_csv_path, index=False)

def all_tags_csv():
    dataset.to_csv(all_tags_json_path, index=False)

In [3]:
get_records_only_csv()
get_ner_csv()
get_pos_csv()
all_tags_csv()

NameError: name 'dataset' is not defined

### PART - 4: RAG PIPELINE BASIC


In [4]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.13-py3-none-any.whl (810 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m810.5/810.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.29 (from langchain)
  Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.33 (from langchain)
  Downloading langchain_core-0.1.33-py3-none-any.whl (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downl

***Testing on 1 query for 100 chunk size***

In [5]:
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Function to process chunking
def chunk_and_preserve_metadata_records_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    # Extract news content, news topic
    news_topic, news_content = document.page_content.split("\n")
    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

def chunk_and_preserve_metadata_ner_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, ner_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    ner_tags = ner_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "ner-tags": ner_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

def chunk_and_preserve_metadata_pos_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, pos_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    pos_tags = pos_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "pos-tags": pos_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks


def chunk_and_preserve_metadata_all_tags(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, ner_tags, pos_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    ner_tags = ner_tags.split(": ")[1]
    pos_tags = pos_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "ner-tags": ner_tags,
            "pos-tags": pos_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

In [6]:
from langchain.document_loaders import CSVLoader

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/records_only.csv')
records_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/ner_tags.csv')
ner_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/pos_tags.csv')
pos_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/all_tags.csv')
all_tags_data  = loader.load()

In [29]:
chunk_records_data = [chunk_and_preserve_metadata_records_only(doc, 200, 50) for doc in records_only_data]
chunk_ner_data = [chunk_and_preserve_metadata_ner_only(doc, 200, 50) for doc in ner_only_data]
chunk_pos_data = [chunk_and_preserve_metadata_pos_only(doc, 200, 50) for doc in pos_only_data]
chunk_all_tags_data = [chunk_and_preserve_metadata_all_tags(doc, 200, 50) for doc in all_tags_data]

In [7]:
!pip install openai

Collecting openai
  Downloading openai-1.14.2-py3-none-any.whl (262 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.4/262.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.4 

In [8]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [9]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2

In [30]:
def create_and_store_embeddings(chunk_data):
    texts = []
    metadatas = []

    # Extract text and metadata
    for chunk_group in chunk_data:
        for chunk in chunk_group:
            texts.append(chunk.page_content)
            metadatas.append(chunk.metadata)

    # Create embeddings
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = Chroma.from_texts(
        texts=texts,
        metadatas=metadatas,
        collection_name="chunk_embeddings",
        embedding=embedding_model
    )
    return vectorstore

vectorstore_records_200_OpenAI = create_and_store_embeddings(chunk_records_data)
retriever_records_200_OpenAI = vectorstore_records_200_OpenAI.as_retriever()

In [36]:
question = "Where is the ongoing PUNCH's 50th anniversary photo exhibition taking place?"

# Retrieve similar documents to a given question using the vector database
docs = vectorstore_records_200_OpenAI.similarity_search(question, k=5)

In [37]:
import numpy as np

def cosine_similarity(vec1, vec2):
    # Compute the dot product of vec1 and vec2
    dot_product = np.dot(vec1, vec2)

    # Compute the L2 norms (or magnitudes) of vec1 and vec2
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # Compute the cosine similarity
    cos_sim = dot_product / (norm_vec1 * norm_vec2)

    return cos_sim

In [38]:
embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
q_emb = embedding_model.embed_query(question)
q_vec = np.array(q_emb)

for d in docs:
    emb = embedding_model.embed_query(d.page_content)
    vec = np.array(emb)
    cosine = cosine_similarity(q_vec, vec)
    print(cosine)

0.9070354791346922
0.9070354791346922
0.9070354791346922
0.8715683651499764
0.8715683651499764


In [39]:
# Import the necessary classes from the langchain library.
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

# Define a prompt template. This is a format for the text input we'll give to our model.
# It tells the model how to structure its response and what to do in different situations.
template = """I will provide you pieces of [Context] to answer the [Question]. \
Answer the question based on the context provided. \
[Context]: {context} \
[Question]: {question}"""

# If your answer includes any sort of list, return it in bullets. \
# Format your answer to Markdown. \

# Create a PromptTemplate object from our string template.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

# Create a RetrievalQA object. This uses our language model (llm) and a retriever,
# which is our vector database (vectordb). This object will handle asking our model questions
# and retrieving relevant documents to help answer them.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever_records_200_OpenAI,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [40]:
result = qa_chain({"query": question})

# Print out the result
print(result["result"])

  warn_deprecated(


The ongoing PUNCH's 50th anniversary photo exhibition is taking place at the Alliance Francaise de Lagos/Mike Adenuga Centre in Ikoyi, Lagos.


In [41]:
#Import required libraries:
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

### PART 5.A : DATA LOADING

In [42]:
from langchain.document_loaders import CSVLoader

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/records_only.csv')
records_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/ner_tags.csv')
ner_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/pos_tags.csv')
pos_only_data  = loader.load()

loader = CSVLoader(file_path='/content/drive/MyDrive/NLP/all_tags.csv')
all_tags_data  = loader.load()

### PART 5.B : CHUNKING BOILERPLATE

In [43]:
# Function to process chunking
def chunk_and_preserve_metadata_records_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    # Extract news content, news topic
    news_topic, news_content = document.page_content.split("\n")
    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

def chunk_and_preserve_metadata_ner_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, ner_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    ner_tags = ner_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "ner-tags": ner_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

def chunk_and_preserve_metadata_pos_only(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, pos_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    pos_tags = pos_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "pos-tags": pos_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks


def chunk_and_preserve_metadata_all_tags(document, size, overlap):
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)

    # Extract news content, news topic, and NER tags
    news_topic, news_content, ner_tags, pos_tags = document.page_content.split("\n")

    # Remove the labels for cleaner data
    news_topic = news_topic.split(": ")[1]
    news_content = news_content.split(": ")[1]
    ner_tags = ner_tags.split(": ")[1]
    pos_tags = pos_tags.split(": ")[1]

    # Create chunks from the news content
    chunks = splitter.create_documents([news_content])

    # Attach metadata to chunks
    for i, chunk in enumerate(chunks):
        chunk.metadata = {
            "news-topic": news_topic,
            "ner-tags": ner_tags,
            "pos-tags": pos_tags,
            "chunk_index": i,
            "row": document.metadata["row"]
        }
    return chunks

### PART 5.C : OPENAI EMBEDDINGS WITH CHROMA DB

In [44]:
def create_and_store_embeddings(chunk_data):
    texts = []
    metadatas = []

    # Extract text and metadata
    for chunk_group in chunk_data:
        for chunk in chunk_group:
            texts.append(chunk.page_content)
            metadatas.append(chunk.metadata)

    # Create embeddings
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = Chroma.from_texts(
        texts=texts,
        metadatas=metadatas,
        collection_name="chunk_embeddings",
        embedding=embedding_model
    )
    return vectorstore

### PART 6 : PREPARE QUESTION - ANSWER CSV

In [None]:
qa_df = pd.read_csv("/content/drive/MyDrive/NLP/QA_Dataset.csv")
qa_df.head()

Unnamed: 0,question,answer
0,Who predicts that AI will play a growing role ...,"Gerrit Kazmaier, Google Cloud’s lead executive..."
1,How does AI help in data analysis?,"It bridges structured and unstructured data, e..."
2,What is Google Cloud's approach to integrating...,"It integrates BigQuery with Vertex AI, facilit..."
3,What potential does generative AI offer in bus...,It allows natural language interaction with da...
4,How does AI address the challenge of dealing w...,"AI systems, like generative AI, enable flexibl..."


In [None]:
import nltk
nltk.download('punkt')

def preprocess_question(vstore, question, k):
  # Tokenization
  tokens = nltk.word_tokenize(question)
  # Lowercase
  tokens = [token.lower() for token in tokens]

  search_results = vstore.similarity_search(question, k)

  embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  q_emb = embedding_model.embed_query(question)
  q_vec = np.array(q_emb)

  lst = []
  for d in search_results:
      emb = embedding_model.embed_query(d.page_content)
      vec = np.array(emb)
      cosine = cosine_similarity(q_vec, vec)
      lst.append([d, cosine])

  return lst

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import time
def generate_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_ner_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  The context will also contain relevant metadata, including NER (Named Entity Recognition) tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_pos_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question.  \
  The context will also contain relevant metadata, including POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_all_tags_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question. \
  The context will also contain relevant metadata, including NER [Named Entity Recognition] and POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
question = "How does AI help in data analysis?"
print(generate_answer(llm, question, retriever_records_200_OpenAI))

[1.3293132781982422, 'AI, especially generative AI, helps in data analysis by providing a way to open up access to data in a way that conventional business intelligence (BI) cannot. AI can analyze large amounts of data quickly and efficiently, uncovering insights and patterns that may not be immediately apparent to human analysts. This can help businesses make more informed decisions and drive better outcomes.']


` PIPELINE : 200 CHUNK SIZE ; RECORDS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_records_data = [chunk_and_preserve_metadata_records_only(doc, 200, 50) for doc in records_only_data]

vectorstore_records_200_OpenAI = create_and_store_embeddings(chunk_records_data)
retriever_records_200_OpenAI = vectorstore_records_200_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df['retrieved-records-200-OpenAPI'] = qa_df['question'].apply(lambda x: preprocess_question(vectorstore_records_200_OpenAI, x, k=5))

In [None]:
# Step 1: Apply the function and store results in a list
results = qa_df['question'].apply(lambda x: generate_answer(llm, x, retriever_records_200_OpenAI))

# Step 2: Convert results to a DataFrame
results_df = pd.DataFrame(results.tolist(), columns=['time-records-200-OpenAPI', 'answer-records-200-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df = pd.concat([qa_df, results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; NER ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_ner_data = [chunk_and_preserve_metadata_ner_only(doc, 200, 50) for doc in ner_only_data]

vectorstore_ner_200_OpenAI = create_and_store_embeddings(chunk_ner_data)
retriever_ner_200_OpenAI = vectorstore_ner_200_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df['retrieved-ner-200-OpenAPI'] = qa_df['question'].apply(lambda x: preprocess_question(vectorstore_ner_200_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
ner_results = qa_df['question'].apply(lambda x: generate_ner_answer(llm, x, retriever_ner_200_OpenAI))

# Step 2: Convert results to a DataFrame
ner_results_df = pd.DataFrame(ner_results.tolist(), columns=['time-ner-200-OpenAPI', 'answer-ner-200-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df = pd.concat([qa_df, ner_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; RECORDS + POS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_pos_data = [chunk_and_preserve_metadata_pos_only(doc, 200, 50) for doc in pos_only_data]

vectorstore_pos_200_OpenAI = create_and_store_embeddings(chunk_pos_data)
retriever_pos_200_OpenAI = vectorstore_pos_200_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df['retrieved-pos-200-OpenAPI'] = qa_df['question'].apply(lambda x: preprocess_question(vectorstore_pos_200_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
pos_results = qa_df['question'].apply(lambda x: generate_pos_answer(llm, x, retriever_pos_200_OpenAI))

# Step 2: Convert results to a DataFrame
pos_results_df = pd.DataFrame(pos_results.tolist(), columns=['time-pos-200-OpenAPI', 'answer-pos-200-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df = pd.concat([qa_df, pos_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; ALL TAGS; OPENAI EMBEDDDINGS `

In [None]:
chunk_all_tags_data = [chunk_and_preserve_metadata_all_tags(doc, 200, 50) for doc in all_tags_data]

vectorstore_allTags_200_OpenAI = create_and_store_embeddings(chunk_all_tags_data)
retriever_allTags_200_OpenAI = vectorstore_allTags_200_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df['retrieved-allTags-200-OpenAPI'] = qa_df['question'].apply(lambda x: preprocess_question(vectorstore_allTags_200_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
all_results = qa_df['question'].apply(lambda x: generate_all_tags_answer(llm, x, retriever_allTags_200_OpenAI))

# Step 2: Convert results to a DataFrame
all_results_df = pd.DataFrame(all_results.tolist(), columns=['time-allTags-200-OpenAPI', 'answer-allTags-200-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df = pd.concat([qa_df, all_results_df], axis=1)

In [None]:
qa_df[20:30]

Unnamed: 0,question,answer,retrieved-records-200-OpenAPI,time-records-200-OpenAPI,retrieved-ner-200-OpenAPI,time-ner-200-OpenAPI,answer-ner-200-OpenAPI,time-records-200-OpenAPI.1,answer-records-200-OpenAPI,retrieved-pos-200-OpenAPI,...,time-allTags-200-OpenAPI,answer-allTags-200-OpenAPI,time-pos-200-OpenAPI,answer-pos-200-OpenAPI,time-allTags-200-OpenAPI.1,answer-allTags-200-OpenAPI.1,time-pos-200-OpenAPI.1,answer-pos-200-OpenAPI.1,time-allTags-200-OpenAPI.2,answer-allTags-200-OpenAPI.2
20,Why was Susanna Reid criticized on Good Mornin...,Susanna Reid faced backlash for defending Prin...,"[[page_content=""gmb's susanna reid hits back a...",0.705691,"[[page_content=""gmb's susanna reid hits back a...",1.092074,Susanna Reid was criticized on Good Morning Br...,0.837425,Susanna Reid was criticized on Good Morning Br...,"[[page_content=""gmb's susanna reid hits back a...",...,0.678861,Susanna Reid was criticized on Good Morning Br...,0.636073,Susanna Reid was criticized on Good Morning Br...,0.742605,Susanna Reid was criticized on Good Morning Br...,0.699568,Susanna Reid was criticized on Good Morning Br...,1.885097,Susanna Reid was criticized on Good Morning Br...
21,How did Susanna Reid respond to the viewer's c...,Susanna Reid responded by reposting the commen...,"[[page_content=""gmb's susanna reid hits back a...",0.675496,"[[page_content=""gmb's susanna reid hits back a...",1.166351,Susanna Reid hit back at the viewer's criticis...,1.906639,Susanna Reid hit back at the criticism and def...,[[page_content='that saw george galloway win s...,...,0.998972,"I'm sorry, but the context provided does not m...",3.007667,"I'm sorry, but the context provided does not m...",1.640968,"Unfortunately, the context provided does not m...",0.975604,"I'm sorry, but the provided context does not m...",0.893357,"I'm sorry, but the provided context does not m..."
22,What topic sparked backlash towards Susanna Reid?,Her discussion on Prince Harry's security and ...,"[[page_content=""gmb's susanna reid hits back a...",0.680869,"[[page_content=""gmb's susanna reid hits back a...",0.791427,[Answer]: The topic that sparked backlash towa...,0.871762,The topic that sparked backlash towards Susann...,"[[page_content=""gmb's susanna reid hits back a...",...,0.868437,The topic that sparked backlash towards Susann...,0.66832,There is no mention of Susanna Reid in the con...,0.762882,The topic that sparked backlash towards Susann...,0.749583,The topic that sparked backlash towards Susann...,0.564574,The topic that sparked backlash towards Susann...
23,What is the name of the remote learning progra...,"Ahlan Simsim, meaning Welcome Sesame in Arabic.",[[page_content='entered the state’s custody af...,0.982083,[[page_content='entered the state’s custody af...,0.690818,The name of the remote learning program aimed ...,0.720145,The name of the remote learning program aimed ...,"[[page_content='on elementary schools,” she sa...",...,0.853671,"I'm sorry, but the context provided does not m...",0.714848,The name of the remote learning program aimed ...,0.84181,There is no mention of a remote learning progr...,1.905048,The name of the remote learning program aimed ...,0.723324,The name of the remote learning program aimed ...
24,What is unique about the content of the Ahlan ...,It addresses trauma delicately and excludes to...,[[page_content='this online learning program i...,2.20713,[[page_content='this online learning program i...,1.207133,The unique aspect of the Ahlan Simsim program ...,0.850351,The unique aspect of the Ahlan Simsim program ...,[[page_content='no time in beginning his human...,...,1.161017,"Based on the context provided, the unique aspe...",0.924824,The unique aspect of the Ahlan Simsim program ...,0.997245,"Based on the context provided, the unique aspe...",0.957847,"Based on the context provided, the unique aspe...",1.025844,"Based on the context provided, the unique aspe..."
25,How did families in crisis areas overcome inte...,"Families borrowed devices, sought rooftop inte...",[[page_content='this online learning program i...,0.815352,[[page_content='to replace it. so we saw peopl...,2.117325,"Families in crisis areas, such as the one desc...",2.338882,Families in crisis areas overcame internet and...,"[[page_content='your kids in their community, ...",...,1.335567,"Based on the context provided, families in cri...",1.720349,The context provided does not mention anything...,1.039432,"Based on the context provided, families in cri...",3.945516,Families in crisis areas overcame internet and...,1.477794,Families in crisis areas overcame internet and...
26,"According to NYU researchers, how much progres...",An 11-week program showed nearly a year's wort...,[[page_content='for “continuous improvement” f...,0.655632,[[page_content='for “continuous improvement” f...,0.80368,"Based on the context provided, there is no spe...",0.860151,"According to NYU researchers, children in the ...","[[page_content=""migrant communities with high ...",...,0.889692,The context provided does not mention NYU rese...,0.983924,The context provided does not mention NYU rese...,1.404192,The context provided does not mention NYU rese...,0.909399,The context provided does not mention NYU rese...,1.071378,The context provided does not mention NYU rese...
27,What lesson can be learned from the success of...,Full-family engagement and active participatio...,"[[page_content=""migrant communities with high ...",1.722124,[[page_content='this online learning program i...,1.545708,One lesson that can be learned from the succes...,1.424185,The lesson that can be learned from the succes...,"[[page_content=""migrant communities with high ...",...,1.196622,The lesson that can be learned from the succes...,2.516996,The lesson that can be learned from the succes...,1.34387,The lesson that can be learned from the succes...,1.192424,The lesson that can be learned from the succes...,0.948483,The lesson that can be learned from the succes...
28,Which tech giant leads in remote work opportun...,"Microsoft, offering a plethora of remote roles...","[[page_content=""from home. it's ranked as the ...",1.306068,"[[page_content=""from home. it's ranked as the ...",2.626475,The tech giant leading in remote work opportun...,1.029734,The tech giant that leads in remote work oppor...,"[[page_content=""from home. it's ranked as the ...",...,0.914783,The tech giant leading in remote work opportun...,1.31104,The tech giant leading in remote work opportun...,0.756339,The tech giant leading in remote work opportun...,1.155727,The tech giant leading in remote work opportun...,1.237472,The tech giant leading in remote work opportun...
29,How many remote positions does Salesforce offe...,Salesforce boasts over 150 remote jobs this Ma...,"[[page_content='for nearly 1,300 positions glo...",0.741991,"[[page_content='for nearly 1,300 positions glo...",0.891217,Salesforce offers nearly 800 fully remote jobs...,1.003335,Salesforce offers nearly 800 fully remote jobs...,"[[page_content='for nearly 1,300 positions glo...",...,0.891231,Salesforce offers nearly 800 fully remote jobs...,0.749835,Salesforce offers nearly 800 fully remote posi...,0.695065,Salesforce offers nearly 800 fully remote posi...,0.685568,Salesforce offers nearly 800 fully remote posi...,0.673995,Salesforce offers nearly 800 fully remote posi...


In [None]:
qa_df.to_csv('qa_df_200_OpenAI.csv', index=False)

### PART 6 : PREPARE QUESTION - ANSWER CSV - 400 CHUNK SIZE

In [None]:
qa_df_1 = pd.read_csv("/content/drive/MyDrive/NLP/QA_Dataset.csv")
qa_df_1.head()

Unnamed: 0,question,answer
0,Who predicts that AI will play a growing role ...,"Gerrit Kazmaier, Google Cloud’s lead executive..."
1,How does AI help in data analysis?,"It bridges structured and unstructured data, e..."
2,What is Google Cloud's approach to integrating...,"It integrates BigQuery with Vertex AI, facilit..."
3,What potential does generative AI offer in bus...,It allows natural language interaction with da...
4,How does AI address the challenge of dealing w...,"AI systems, like generative AI, enable flexibl..."


In [None]:
import nltk
nltk.download('punkt')

def preprocess_question(vstore, question, k):
  # Tokenization
  tokens = nltk.word_tokenize(question)
  # Lowercase
  tokens = [token.lower() for token in tokens]

  search_results = vstore.similarity_search(question, k)

  embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  q_emb = embedding_model.embed_query(question)
  q_vec = np.array(q_emb)

  lst = []
  for d in search_results:
      emb = embedding_model.embed_query(d.page_content)
      vec = np.array(emb)
      cosine = cosine_similarity(q_vec, vec)
      lst.append([d, cosine])

  return lst

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import time
def generate_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_ner_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  The context will also contain relevant metadata, including NER (Named Entity Recognition) tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_pos_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question.  \
  The context will also contain relevant metadata, including POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
import time
def generate_all_tags_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question. \
  The context will also contain relevant metadata, including NER [Named Entity Recognition] and POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
question = "How does AI help in data analysis?"
print(generate_answer(llm, question, retriever_records_200_OpenAI))

[1.918428897857666, 'AI, especially generative AI, helps in data analysis by providing users with access to data in a way that conventional business intelligence (BI) cannot. AI can analyze large amounts of data quickly and efficiently, uncovering insights and patterns that may not be immediately apparent to human analysts. This can help businesses make more informed decisions and improve their overall performance.']


` PIPELINE : 200 CHUNK SIZE ; RECORDS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_records_data = [chunk_and_preserve_metadata_records_only(doc, 400, 50) for doc in records_only_data]

vectorstore_records_400_OpenAI = create_and_store_embeddings(chunk_records_data)
retriever_records_400_OpenAI = vectorstore_records_400_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_1['retrieved-records-400-OpenAPI'] = qa_df_1['question'].apply(lambda x: preprocess_question(vectorstore_records_400_OpenAI, x, k=5))

In [None]:
# Step 1: Apply the function and store results in a list
results = qa_df_1['question'].apply(lambda x: generate_answer(llm, x, retriever_records_400_OpenAI))

# Step 2: Convert results to a DataFrame
results_df = pd.DataFrame(results.tolist(), columns=['time-records-400-OpenAPI', 'answer-records-400-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_1 = pd.concat([qa_df_1, results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; NER ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_ner_data = [chunk_and_preserve_metadata_ner_only(doc, 400, 50) for doc in ner_only_data]

vectorstore_ner_400_OpenAI = create_and_store_embeddings(chunk_ner_data)
retriever_ner_400_OpenAI = vectorstore_ner_400_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_1['retrieved-ner-400-OpenAPI'] = qa_df_1['question'].apply(lambda x: preprocess_question(vectorstore_ner_400_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
ner_results = qa_df_1['question'].apply(lambda x: generate_ner_answer(llm, x, retriever_ner_400_OpenAI))

# Step 2: Convert results to a DataFrame
ner_results_df = pd.DataFrame(ner_results.tolist(), columns=['time-ner-400-OpenAPI', 'answer-ner-400-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_1 = pd.concat([qa_df_1, ner_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; RECORDS + POS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_pos_data = [chunk_and_preserve_metadata_pos_only(doc, 400, 50) for doc in pos_only_data]

vectorstore_pos_400_OpenAI = create_and_store_embeddings(chunk_pos_data)
retriever_pos_400_OpenAI = vectorstore_pos_400_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_1['retrieved-pos-400-OpenAPI'] = qa_df_1['question'].apply(lambda x: preprocess_question(vectorstore_pos_400_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
pos_results = qa_df_1['question'].apply(lambda x: generate_pos_answer(llm, x, retriever_pos_400_OpenAI))

# Step 2: Convert results to a DataFrame
pos_results_df = pd.DataFrame(pos_results.tolist(), columns=['time-pos-400-OpenAPI', 'answer-pos-400-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_1 = pd.concat([qa_df_1, pos_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; ALL TAGS; OPENAI EMBEDDDINGS `

In [None]:
chunk_all_tags_data = [chunk_and_preserve_metadata_all_tags(doc, 400, 50) for doc in all_tags_data]

vectorstore_allTags_400_OpenAI = create_and_store_embeddings(chunk_all_tags_data)
retriever_allTags_400_OpenAI = vectorstore_allTags_400_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_1['retrieved-allTags-400-OpenAPI'] = qa_df_1['question'].apply(lambda x: preprocess_question(vectorstore_allTags_400_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
all_results = qa_df_1['question'].apply(lambda x: generate_all_tags_answer(llm, x, retriever_allTags_400_OpenAI))

# Step 2: Convert results to a DataFrame
all_results_df = pd.DataFrame(all_results.tolist(), columns=['time-allTags-400-OpenAPI', 'answer-allTags-400-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_1 = pd.concat([qa_df_1, all_results_df], axis=1)

In [None]:
qa_df_1[30:40]

Unnamed: 0,question,answer,retrieved-records-400-OpenAPI,time-records-400-OpenAPI,answer-records-400-OpenAPI,retrieved-ner-400-OpenAPI,time-ner-400-OpenAPI,answer-ner-400-OpenAPI,retrieved-pos-400-OpenAPI,time-pos-400-OpenAPI,answer-pos-400-OpenAPI,retrieved-allTags-400-OpenAPI,time-allTags-400-OpenAPI,answer-allTags-400-OpenAPI
30,"Despite advocating for office returns, which t...","Google continues to offer remote positions, pa...","[[page_content=""then microsoft could be an ide...",0.489128,Microsoft,"[[page_content=""customer care team site, or br...",0.741905,Google,"[[page_content=""customer care team site, or br...",0.635962,Google,"[[page_content=""customer care team site, or br...",0.466513,Google
31,What sets Intuit apart as a remote-friendly em...,Intuit scores 88/100 in flexibility ratings an...,"[[page_content=""from home. it's ranked as the ...",1.626286,Intuit is ranked as the top remote work compan...,"[[page_content=""from home. it's ranked as the ...",1.043871,[Answer]: Intuit is ranked as the top remote w...,"[[page_content=""from home. it's ranked as the ...",1.194648,Intuit is ranked as the top remote work compan...,"[[page_content=""from home. it's ranked as the ...",1.002844,Intuit is set apart as a remote-friendly emplo...
32,How many remote vacancies does CVS Health have...,CVS Health lists nearly 400 remote jobs across...,"[[page_content=""of defense! here's a taste of ...",0.943683,"Based on the context provided, there are five ...","[[page_content=""of defense! here's a taste of ...",2.142622,"I'm sorry, but the context provided does not m...","[[page_content=""of defense! here's a taste of ...",1.996684,The context provided is about remote job vacan...,"[[page_content='for nearly 1,300 positions glo...",0.798558,"CVS Health has nearly 800 remote vacancies, wi..."
33,What percentage of UnitedHealth Group's workfo...,UnitedHealth Group boasts over 25% remote work...,"[[page_content='for nearly 1,300 positions glo...",2.082714,"Based on the context provided, UnitedHealth Gr...","[[page_content=""to working from home. it's ran...",1.659403,"Based on the context provided, UnitedHealth Gr...","[[page_content=""to working from home. it's ran...",0.982472,"Based on the context provided, UnitedHealth Gr...","[[page_content='for nearly 1,300 positions glo...",1.713457,"Based on the context provided, UnitedHealth Gr..."
34,"Despite its return-to-office stance, how many ...",Amazon provides over 70 remote positions in Ma...,"[[page_content='for nearly 1,300 positions glo...",0.712675,Amazon is offering nearly 800 fully remote job...,"[[page_content=""as the calendar turns to march...",2.719071,"Based on the context provided, the question is...","[[page_content=""as the calendar turns to march...",0.771736,"I'm sorry, but the context provided does not m...","[[page_content=""as the calendar turns to march...",0.814545,"I'm sorry, but the context provided does not m..."
35,Which unexpected entrant in remote work opport...,"Williams-Sonoma, known for luxury goods, now o...","[[page_content=""from home. it's ranked as the ...",1.027281,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",1.164248,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",0.828785,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",2.152485,The unexpected entrant in remote work opportun...
36,What benefits of remote work are highlighted i...,Remote work offers cost and time savings from ...,"[[page_content='the minute, so here are some o...",0.739363,The benefits of remote work highlighted in the...,"[[page_content=""which is why studies like our ...",1.011862,The benefits of remote work highlighted in the...,"[[page_content=""which is why studies like our ...",1.597955,The benefits of remote work highlighted in the...,"[[page_content='the minute, so here are some o...",0.796534,The benefits of remote work highlighted in the...
37,Which specific roles are prominently featured ...,Microsoft emphasizes roles like Principal Cont...,[[page_content='sample of some of the fully re...,2.472141,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,1.487232,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,1.471346,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,2.159955,The specific roles prominently featured in Mic...
38,Who was apprehended by Hungarian authorities i...,A Swiss national with convictions for blackmai...,"[[page_content=""who is mohammad ghouse nayazi,...",0.758609,There is no information provided in the contex...,"[[page_content=""who is mohammad ghouse nayazi,...",1.388259,"Based on the context provided, the individual ...","[[page_content=""who is mohammad ghouse nayazi,...",1.395296,"Based on the provided context, the answer to t...","[[page_content=""who is mohammad ghouse nayazi,...",2.826608,"Based on the provided context, there is no men..."
39,What sentence did the apprehended individual r...,He was sentenced to over five and a half years...,[[page_content='11 counts of attempted murder ...,1.665277,The apprehended individual likely received a l...,[[page_content='11 counts of attempted murder ...,1.240824,"Based on the context provided, the apprehended...",[[page_content='11 counts of attempted murder ...,0.96308,"Based on the context provided, it is likely th...",[[page_content='11 counts of attempted murder ...,2.623664,"Based on the context provided, it is likely th..."


In [None]:
qa_df_1.to_csv('qa_df_400_OpenAI.csv', index=False)

### PART 6 : PREPARE QUESTION - ANSWER CSV - 600 CHUNK SIZE

In [None]:
qa_df_2 = pd.read_csv("/content/drive/MyDrive/NLP/QA_Dataset.csv")
qa_df_2.head()

Unnamed: 0,question,answer
0,Who predicts that AI will play a growing role ...,"Gerrit Kazmaier, Google Cloud’s lead executive..."
1,How does AI help in data analysis?,"It bridges structured and unstructured data, e..."
2,What is Google Cloud's approach to integrating...,"It integrates BigQuery with Vertex AI, facilit..."
3,What potential does generative AI offer in bus...,It allows natural language interaction with da...
4,How does AI address the challenge of dealing w...,"AI systems, like generative AI, enable flexibl..."


In [12]:
import nltk
nltk.download('punkt')

def preprocess_question(vstore, question, k):
  # Tokenization
  tokens = nltk.word_tokenize(question)
  # Lowercase
  tokens = [token.lower() for token in tokens]

  search_results = vstore.similarity_search(question, k)

  embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
  q_emb = embedding_model.embed_query(question)
  q_vec = np.array(q_emb)

  lst = []
  for d in search_results:
      emb = embedding_model.embed_query(d.page_content)
      vec = np.array(emb)
      cosine = cosine_similarity(q_vec, vec)
      lst.append([d, cosine])

  return lst

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [13]:
import time
def generate_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [14]:
import time
def generate_ner_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of [Context] to answer the [Question]. \
  Answer the question based on the context provided. \
  The context will also contain relevant metadata, including NER (Named Entity Recognition) tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [15]:
import time
def generate_pos_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question.  \
  The context will also contain relevant metadata, including POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [16]:
import time
def generate_all_tags_answer(llm, question, retriever):
  # Define a prompt template. This is a format for the text input we'll give to our model.
  # It tells the model how to structure its response and what to do in different situations.
  template = """I will provide you pieces of context \
  Answer the below question. \
  The context will also contain relevant metadata, including NER [Named Entity Recognition] and POS [Part of Speech] tags. Consider this metadata when generating your answer.
  [Context]: {context} \
  [Question]: {question}"""
  # If your answer includes any sort of list, return it in bullets. \
  # Format your answer to Markdown. \
  QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

  start_time = time.time()
  # Create a RetrievalQA object. This uses our language model (llm) and a retriever,
  # which is our vector database (vectordb). This object will handle asking our model questions
  # and retrieving relevant documents to help answer them.
  qa_chain = RetrievalQA.from_chain_type(
      llm,
      retriever=retriever,
      chain_type="stuff",
      return_source_documents=True,
      chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
  )

  result = qa_chain({"query": question})
  end_time = time.time()

  time_taken = end_time - start_time
  return [time_taken, result["result"]]

In [None]:
question = "How does AI help in data analysis?"
print(generate_answer(llm, question, retriever_records_200_OpenAI))

[1.4437918663024902, 'AI helps in data analysis by bringing together structured and unstructured data, allowing for a more comprehensive and insightful analysis. AI systems are starting to perform data analytics by combining AI with more conventional business intelligence tools, as mentioned by Gerrit Kazmaier, VP and GM for database, data analytics, and Looker at Google Cloud.']


` PIPELINE : 200 CHUNK SIZE ; RECORDS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_records_data = [chunk_and_preserve_metadata_records_only(doc, 600, 50) for doc in records_only_data]

vectorstore_records_600_OpenAI = create_and_store_embeddings(chunk_records_data)
retriever_records_600_OpenAI = vectorstore_records_600_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_2['retrieved-records-600-OpenAPI'] = qa_df_2['question'].apply(lambda x: preprocess_question(vectorstore_records_600_OpenAI, x, k=5))

In [None]:
# Step 1: Apply the function and store results in a list
results = qa_df_2['question'].apply(lambda x: generate_answer(llm, x, retriever_records_600_OpenAI))

# Step 2: Convert results to a DataFrame
results_df = pd.DataFrame(results.tolist(), columns=['time-records-600-OpenAPI', 'answer-records-600-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_2 = pd.concat([qa_df_2, results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; NER ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_ner_data = [chunk_and_preserve_metadata_ner_only(doc, 600, 50) for doc in ner_only_data]

vectorstore_ner_600_OpenAI = create_and_store_embeddings(chunk_ner_data)
retriever_ner_600_OpenAI = vectorstore_ner_600_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_2['retrieved-ner-600-OpenAPI'] = qa_df_2['question'].apply(lambda x: preprocess_question(vectorstore_ner_600_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
ner_results = qa_df_2['question'].apply(lambda x: generate_ner_answer(llm, x, retriever_ner_600_OpenAI))

# Step 2: Convert results to a DataFrame
ner_results_df = pd.DataFrame(ner_results.tolist(), columns=['time-ner-600-OpenAPI', 'answer-ner-600-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_2 = pd.concat([qa_df_2, ner_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; RECORDS + POS ONLY; OPENAI EMBEDDDINGS `

In [None]:
chunk_pos_data = [chunk_and_preserve_metadata_pos_only(doc, 600, 50) for doc in pos_only_data]

vectorstore_pos_600_OpenAI = create_and_store_embeddings(chunk_pos_data)
retriever_pos_600_OpenAI = vectorstore_pos_600_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_2['retrieved-pos-600-OpenAPI'] = qa_df_2['question'].apply(lambda x: preprocess_question(vectorstore_pos_600_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
pos_results = qa_df_2['question'].apply(lambda x: generate_pos_answer(llm, x, retriever_pos_600_OpenAI))

# Step 2: Convert results to a DataFrame
pos_results_df = pd.DataFrame(pos_results.tolist(), columns=['time-pos-600-OpenAPI', 'answer-pos-600-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_2 = pd.concat([qa_df_2, pos_results_df], axis=1)

` PIPELINE : 200 CHUNK SIZE ; ALL TAGS; OPENAI EMBEDDDINGS `

In [None]:
chunk_all_tags_data = [chunk_and_preserve_metadata_all_tags(doc, 600, 50) for doc in all_tags_data]

vectorstore_allTags_600_OpenAI = create_and_store_embeddings(chunk_all_tags_data)
retriever_allTags_600_OpenAI = vectorstore_allTags_600_OpenAI.as_retriever()

# Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

qa_df_2['retrieved-allTags-600-OpenAPI'] = qa_df_2['question'].apply(lambda x: preprocess_question(vectorstore_allTags_600_OpenAI, x, k=5))

# Step 1: Apply the function and store results in a list
all_results = qa_df_2['question'].apply(lambda x: generate_all_tags_answer(llm, x, retriever_allTags_600_OpenAI))

# Step 2: Convert results to a DataFrame
all_results_df = pd.DataFrame(all_results.tolist(), columns=['time-allTags-600-OpenAPI', 'answer-allTags-600-OpenAPI'])

# Step 3: Join the new DataFrame with the original DataFrame
qa_df_2 = pd.concat([qa_df_2, all_results_df], axis=1)

In [None]:
qa_df_2[30:40]

Unnamed: 0,question,answer,retrieved-records-600-OpenAPI,time-records-600-OpenAPI,answer-records-600-OpenAPI,retrieved-ner-600-OpenAPI,time-ner-600-OpenAPI,answer-ner-600-OpenAPI,retrieved-pos-600-OpenAPI,time-pos-600-OpenAPI,answer-pos-600-OpenAPI,retrieved-allTags-600-OpenAPI,time-allTags-600-OpenAPI,answer-allTags-600-OpenAPI
30,"Despite advocating for office returns, which t...","Google continues to offer remote positions, pa...","[[page_content=""return to the office, yet cont...",0.465312,Google,"[[page_content=""return to the office, yet cont...",0.345418,Google,"[[page_content=""return to the office, yet cont...",0.346464,Google,"[[page_content=""return to the office, yet cont...",0.403232,Google
31,What sets Intuit apart as a remote-friendly em...,Intuit scores 88/100 in flexibility ratings an...,"[[page_content=""from home. it's ranked as the ...",1.405961,Intuit is ranked as the top remote work compan...,"[[page_content=""from home. it's ranked as the ...",1.004503,[Answer]: Intuit is ranked as the top remote w...,"[[page_content=""from home. it's ranked as the ...",1.418569,Intuit is ranked as the top remote work compan...,"[[page_content=""from home. it's ranked as the ...",1.059686,Intuit is ranked as the top remote work compan...
32,How many remote vacancies does CVS Health have...,CVS Health lists nearly 400 remote jobs across...,"[[page_content='for nearly 1,300 positions glo...",0.944003,CVS Health has nearly 800 fully remote jobs be...,"[[page_content='for nearly 1,300 positions glo...",0.932288,CVS Health has nearly 800 fully remote jobs op...,"[[page_content='for nearly 1,300 positions glo...",2.013814,CVS Health has nearly 800 fully remote jobs be...,"[[page_content='for nearly 1,300 positions glo...",0.970326,CVS Health has nearly 800 fully remote jobs op...
33,What percentage of UnitedHealth Group's workfo...,UnitedHealth Group boasts over 25% remote work...,"[[page_content='for nearly 1,300 positions glo...",1.580128,"Based on the context provided, UnitedHealth Gr...","[[page_content='for nearly 1,300 positions glo...",2.291056,"Based on the context provided, UnitedHealth Gr...","[[page_content='for nearly 1,300 positions glo...",1.758222,"Based on the context provided, UnitedHealth Gr...","[[page_content='for nearly 1,300 positions glo...",1.66556,"Based on the context provided, UnitedHealth Gr..."
34,"Despite its return-to-office stance, how many ...",Amazon provides over 70 remote positions in Ma...,"[[page_content=""as the calendar turns to march...",0.945188,"Based on the context provided, Amazon is likel...","[[page_content=""as the calendar turns to march...",1.067138,"Based on the context provided, it is not speci...","[[page_content=""as the calendar turns to march...",0.808848,The context does not provide specific informat...,"[[page_content=""as the calendar turns to march...",2.394699,"I'm sorry, but the context provided does not m..."
35,Which unexpected entrant in remote work opport...,"Williams-Sonoma, known for luxury goods, now o...","[[page_content=""from home. it's ranked as the ...",0.822682,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",2.007302,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",0.810246,The unexpected entrant in remote work opportun...,"[[page_content=""from home. it's ranked as the ...",0.875917,The unexpected entrant in remote work opportun...
36,What benefits of remote work are highlighted i...,Remote work offers cost and time savings from ...,"[[page_content='the minute, so here are some o...",0.746782,The benefits of remote work highlighted in the...,"[[page_content='the minute, so here are some o...",1.096787,[Answer]: The benefits of remote work highligh...,"[[page_content='the minute, so here are some o...",2.031463,The benefits of remote work highlighted in the...,"[[page_content='the minute, so here are some o...",0.714591,The benefits of remote work highlighted in the...
37,Which specific roles are prominently featured ...,Microsoft emphasizes roles like Principal Cont...,[[page_content='sample of some of the fully re...,2.26739,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,2.85121,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,1.29322,The specific roles prominently featured in Mic...,[[page_content='sample of some of the fully re...,1.199188,The specific roles prominently featured in Mic...
38,Who was apprehended by Hungarian authorities i...,A Swiss national with convictions for blackmai...,[[page_content='ceased. the border patrol rele...,1.865835,"Based on the context provided, it is not menti...",[[page_content='ceased. the border patrol rele...,0.56816,There is no mention of Hungarian authorities a...,[[page_content='ceased. the border patrol rele...,3.458901,There is no mention of Hungarian authorities o...,[[page_content='ceased. the border patrol rele...,1.244561,There is no mention of Hungarian authorities o...
39,What sentence did the apprehended individual r...,He was sentenced to over five and a half years...,[[page_content='11 counts of attempted murder ...,1.003718,The apprehended individual likely received a l...,[[page_content='11 counts of attempted murder ...,2.489559,"Based on the context provided, the apprehended...",[[page_content='11 counts of attempted murder ...,1.188882,"Based on the context provided, it is likely th...",[[page_content='a 2-year prison term by the zu...,0.972887,The apprehended individual received a 2-year p...


In [None]:
qa_df_2.to_csv('qa_df_600_OpenAI.csv', index=False)

## PART 4 : RAG PIPELINE FOR FULL DATASET

In [45]:
#Load Questions in new_data_df
import pandas as pd
new_data_df = pd.read_csv('/content/drive/MyDrive/NLP/NLP_Project_Dataset_v1_400.csv')

In [46]:
new_data_df.head()

Unnamed: 0,Question,Answer
0,What is the species of the frog found with a m...,The frog found with a mushroom growing on its ...
1,What potential impacts could the presence of t...,The presence of the mushroom on the frog could...
2,What charges does the teenage boy face followi...,The teenage boy faces charges of disruption of...
3,How quickly did the authorities apprehend the ...,The authorities arrested the suspect within 30...
4,How much funding would California's Propositio...,California's Proposition 1 aims to raise $6.4 ...


In [47]:
def build_rag_pipeline(new_data_df, chunk_size, data_type):

  # Initialize our language model. We're using OpenAI's GPT-3.5-turbo model here.
  llm = ChatOpenAI(api_key=OPENAI_API_KEY, model_name="gpt-3.5-turbo", temperature=0)

  ret_col_name = 'retrieved-' + data_type + '-' + str(chunk_size) + '-OpenAPI'
  ans_col_name = 'answer-' + data_type + '-' + str(chunk_size) + '-OpenAPI'
  time_col_name = 'time-' + data_type + '-' + str(chunk_size) + '-OpenAPI'

  if data_type == 'records':
    # Step 1: Apply the function and store results in a list
    chunk_data = [chunk_and_preserve_metadata_records_only(doc, chunk_size, 50) for doc in records_only_data]
    vstore = create_and_store_embeddings(chunk_data)
    retreiver_store = vstore.as_retriever()

    # Step 2 : Preproces questions and get retrieved source index data ; generate answers for preprocessed questions
    new_data_df[ret_col_name] = new_data_df['Question'].apply(lambda x: preprocess_question(vstore, x, k=5))
    results = new_data_df['Question'].apply(lambda x: generate_answer(llm, x, retreiver_store))
    results_df = pd.DataFrame(results.tolist(), columns=[time_col_name, ans_col_name])

    # Step 3: Join the new DataFrame with the original DataFrame
    new_data_df = pd.concat([new_data_df, results_df], axis=1)

  elif data_type == 'ner':
    # Step 1 : Build data store
    chunk_data = [chunk_and_preserve_metadata_ner_only(doc, chunk_size, 50) for doc in ner_only_data]
    vstore = create_and_store_embeddings(chunk_data)
    retreiver_store = vstore.as_retriever()

    # Step 2 : Preproces questions and get retrieved source index data ; generate answers for preprocessed questions
    new_data_df[ret_col_name] = new_data_df['Question'].apply(lambda x: preprocess_question(vstore, x, k=5))
    ner_results = new_data_df['Question'].apply(lambda x: generate_ner_answer(llm, x, retreiver_store))
    ner_results_df = pd.DataFrame(ner_results.tolist(), columns=[time_col_name, ans_col_name])

    # Step 3: Join the new DataFrame with the original DataFrame
    new_data_df = pd.concat([new_data_df, ner_results_df], axis=1)

  elif data_type == 'pos':
    # Step 1: Apply the function and store results in a list
    chunk_data = [chunk_and_preserve_metadata_pos_only(doc, chunk_size, 50) for doc in pos_only_data]
    vstore = create_and_store_embeddings(chunk_data)
    retreiver_store = vstore.as_retriever()

    # Step 2
    new_data_df[ret_col_name] = new_data_df['Question'].apply(lambda x: preprocess_question(vstore, x, k=5))
    pos_results = new_data_df['Question'].apply(lambda x: generate_pos_answer(llm, x, retreiver_store))
    pos_results_df = pd.DataFrame(pos_results.tolist(), columns=[time_col_name, ans_col_name])

    # Step 3: Join the new DataFrame with the original DataFrame
    new_data_df = pd.concat([new_data_df, pos_results_df], axis=1)

  elif data_type =='all_tags':
    # Step 1: Apply the function and store results in a list
    chunk_data = [chunk_and_preserve_metadata_all_tags(doc, chunk_size, 50) for doc in all_tags_data]
    vstore = create_and_store_embeddings(chunk_data)
    retreiver_store = vstore.as_retriever()

    # Step 2
    new_data_df[ret_col_name] = new_data_df['Question'].apply(lambda x: preprocess_question(vstore, x, k=5))
    all_results = new_data_df['Question'].apply(lambda x: generate_all_tags_answer(llm, x, retreiver_store))
    all_results_df = pd.DataFrame(all_results.tolist(), columns=[time_col_name, ans_col_name])

    # Step 3: Join the new DataFrame with the original DataFrame
    new_data_df = pd.concat([new_data_df, all_results_df], axis=1)

  return new_data_df

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 200, 'records')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 200, 'ner')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 200, 'pos')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 200, 'all_tags')

In [None]:
new_data_df

In [None]:
new_data_df.to_csv('UpdatedDataset_200.csv', index=False)

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 400, 'records')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 400, 'ner')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 400, 'pos')

In [None]:
new_data_df = build_rag_pipeline(new_data_df, 400, 'all_tags')

In [58]:
new_data_df.to_csv('UpdatedDataset_600.csv', index=False)

In [48]:
new_data_df = build_rag_pipeline(new_data_df, 600, 'records')

In [49]:
new_data_df = build_rag_pipeline(new_data_df, 600, 'ner')

In [50]:
new_data_df = build_rag_pipeline(new_data_df, 600, 'pos')

In [57]:
new_data_df = build_rag_pipeline(new_data_df, 600, 'all_tags')

In [59]:
new_data_df

Unnamed: 0,Question,Answer,retrieved-records-600-OpenAPI,time-records-600-OpenAPI,answer-records-600-OpenAPI,retrieved-ner-600-OpenAPI,time-ner-600-OpenAPI,answer-ner-600-OpenAPI,retrieved-pos-600-OpenAPI,time-pos-600-OpenAPI,answer-pos-600-OpenAPI,retrieved-all_tags-600-OpenAPI,time-all_tags-600-OpenAPI,answer-all_tags-600-OpenAPI
0,What is the species of the frog found with a m...,The frog found with a mushroom growing on its ...,[[page_content='phenomenon nor the fate of the...,0.947773,The species of the frog found with a mushroom ...,[[page_content='phenomenon nor the fate of the...,1.747789,"Based on the context provided, the species of ...",[[page_content='phenomenon nor the fate of the...,1.172315,The species of the frog found with a mushroom ...,[[page_content='phenomenon nor the fate of the...,0.934102,The species of the frog found with a mushroom ...
1,What potential impacts could the presence of t...,The presence of the mushroom on the frog could...,[[page_content='is possible that “an otherwise...,1.921282,The potential impacts of the presence of the m...,[[page_content='is possible that “an otherwise...,3.014013,The potential impacts of the presence of the m...,[[page_content='is possible that “an otherwise...,4.679805,The potential impacts of the presence of the m...,[[page_content='is possible that “an otherwise...,2.584282,The potential impacts of the presence of the m...
2,What charges does the teenage boy face followi...,The teenage boy faces charges of disruption of...,"[[page_content='gwinnett county, ga. — a stude...",1.074843,The teenage boy faces charges of disruption of...,"[[page_content='gwinnett county, ga. — a stude...",2.287920,The teenage boy faces charges of disruption of...,"[[page_content='gwinnett county, ga. — a stude...",0.694022,[Answer]: The teenage boy faces charges of dis...,"[[page_content='gwinnett county, ga. — a stude...",3.384300,The teenage boy faces charges of disruption of...
3,How quickly did the authorities apprehend the ...,The authorities arrested the suspect within 30...,"[[page_content='attempts to cover his face, th...",1.281069,The authorities apprehended the suspect quickl...,[[page_content='in 24 hours. police suspect ol...,0.822917,The authorities apprehended the suspect moment...,"[[page_content='attempts to cover his face, th...",2.157547,The authorities apprehended the suspect quickl...,"[[page_content='attempts to cover his face, th...",2.014625,The authorities apprehended the suspect quickl...
4,How much funding would California's Propositio...,California's Proposition 1 aims to raise $6.4 ...,[[page_content='california’s proposition 1 – t...,1.151361,California's Proposition 1 would raise $6.4 bi...,[[page_content='california’s proposition 1 – t...,2.770639,California's Proposition 1 would raise $6.4 bi...,[[page_content='california’s proposition 1 – t...,0.830480,California's Proposition 1 would raise $6.4 bi...,[[page_content='california’s proposition 1 – t...,3.619102,California's Proposition 1 would raise $6.4 bi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,True or False: Kimberly-Clark aims to increase...,False.,[[page_content='them to substantially increase...,1.465977,False. Kimberly-Clark aims to reduce its total...,[[page_content='them to substantially increase...,1.024981,False. Kimberly-Clark aims to reduce its total...,[[page_content='them to substantially increase...,0.802349,False. Kimberly-Clark aims to reduce its total...,[[page_content='them to substantially increase...,0.858129,False. Kimberly-Clark aims to reduce its total...
396,What was Rosalio Cervantes Galvan arrested for...,Rosalio Cervantes Galvan was arrested for Driv...,[[page_content='sedalia police reports for mar...,0.706371,Rosalio Cervantes Galvan was arrested for driv...,[[page_content='sedalia police reports for mar...,0.842624,Rosalio Cervantes Galvan was arrested for driv...,[[page_content='sedalia police reports for mar...,0.758817,Rosalio Cervantes Galvan was arrested for driv...,[[page_content='flamenco (nif) executive direc...,0.658909,The context does not mention anything about Ro...
397,Why was Kourteney Jo-Anna Parker arrested and ...,Kourteney Jo-Anna Parker was arrested and tran...,[[page_content='it was confirmed she had two f...,1.069051,Kourteney Jo-Anna Parker was arrested and tran...,[[page_content='it was confirmed she had two f...,2.761050,Kourteney Jo-Anna Parker was arrested and tran...,[[page_content='it was confirmed she had two f...,1.157800,Kourteney Jo-Anna Parker was arrested and tran...,[[page_content='it was confirmed she had two f...,1.756430,Kourteney Jo-Anna Parker was arrested and tran...
398,What type of report did Tammie Smith file? Ans...,Tammie Smith filed an identity theft/fraud rep...,[[page_content='litigation leads to many quest...,1.444440,Subpoenas,[[page_content='report. tammie smith stated so...,0.402148,Identity theft,[[page_content='litigation leads to many quest...,0.425211,Enrollment,[[page_content='litigation leads to many quest...,0.628153,Enrollment
