## Log of changes
- changes sent tokenizers with pysbd
    - think about change as change will affect processing in other languages

In [2]:
# import wikipediaapi

import os
import openai
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
openai.api_key = json.load(open("/Users/shahules/openai-key.json"))['ikka']

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def llm(prompt, **kwargs):
    response = openai.ChatCompletion.create(
        model=kwargs.get("model", "gpt-3.5-turbo"),
        messages=[{"role":"user","content":prompt}],
        temperature=kwargs.get("temperature", 0),
        top_p=kwargs.get("top_p", 1),
        frequency_penalty=kwargs.get("frequency_penalty", 0.0),
        presence_penalty=kwargs.get("presence_penalty", 0.0),
        max_tokens=kwargs.get("max_tokens", 500),
        n=kwargs.get("n", 1),
    )
    return response


In [3]:
message = [{"role": "system", "content": "You're a bot that answers any given question. If you dont know the exact answer make up one."},
{"role":"user", "content":"What were the temperatures and snowfall amounts during the cold snap in Afghanistan in January 2023, and how many people and livestock were affected?"}]

In [4]:
# llm(message)

In [5]:
wiki_wiki = wikipediaapi.Wikipedia(
    language="en", extract_format=wikipediaapi.ExtractFormat.WIKI
)

p_wiki = wiki_wiki.page("Black hole")


def get_page_section(page,chars=8000):
    all_text = ""
    p_wiki = wiki_wiki.page(page)    
    return p_wiki.text[:chars]


def get_cosine(page, backlinks):
    backlinks_vec = model.encode(backlinks)
    page_vec = model.encode([page]).reshape(1,-1)
    norm = np.linalg.norm(backlinks_vec,axis=1)*np.linalg.norm(page_vec,axis=1)
    cosine_sim = np.dot(backlinks_vec,page_vec.T).reshape(-1,)/norm
    return cosine_sim

def get_backlink_titles(page):
    p_wiki = wiki_wiki.page(page) 
    backlinks =  [i.title() for i in p_wiki.backlinks if ":" not in i.title()][:100]
    if len(backlinks)>1:
        c = get_cosine(page, backlinks)
        top_indices = c.argsort()[::-1][:10]
        return [backlinks[i] for i in top_indices if backlinks[i]!=page]
    return []
    
    

In [6]:
Question_gen = """
Given a wikipedia text generate a question that can be fully answered from the text of medium difficulty with long form answer seperated by \n\n from the given text.

Wikipedia Text:
The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was named after the engineer Gustave Eiffel, whose company designed and built the structure. Erected in 1889 as the entrance arch to the 1889 World's Fair, it has become a global cultural icon of France and one of the most recognizable structures in the world. The Eiffel Tower is the tallest structure in Paris and the most-visited paid monument in the world; millions of people ascend it every year. The tower stands at a height of 324 meters (1,063 feet) and was the tallest man-made structure in the world until the completion of the Chrysler Building in New York City in 1930.
The tower has three levels for visitors, with restaurants on the first and second levels. The third level observatory's upper platform is at 276 meters (906 feet) above the ground, the highest accessible to the public in the European Union. The tower has been featured in numerous films and TV shows, and its lighting is often modified to mark special events or holidays. Despite initial criticism from some of France's leading artists and intellectuals, the Eiffel Tower has become a global cultural icon of France and one of the most recognizable structures in the world.
Question: What is the height of the Eiffel Tower, and how does it compare to the height of the Chrysler Building in New York City?
\n\nAnswer:The height of the Eiffel Tower is 324 meters (1,063 feet). It was the tallest man-made structure in the world until the completion of the Chrysler Building in New York City in 1930. The Chrysler Building surpassed the Eiffel Tower's height and became the new tallest structure, reaching a height of 319 meters (1,046 feet) including its spire. However, it's important to note that the Eiffel Tower remains the tallest structure in Paris and still holds its title as the most-visited paid monument in the world. Despite losing its status as the tallest man-made structure globally, the Eiffel Tower's cultural significance and iconic status endure, attracting millions of visitors each year and leaving an indelible mark on the city of Paris and the world at large.
Wikipedia Text:\n{}
"""

In [7]:
from tqdm import tqdm
def generate_dataset(pages):
    data = []
    for page in tqdm(pages):
        try:
            text = get_page_section(page)
            message = [{"role": "user", "content":Question_gen.format(text)}]
            output = llm(message)
            question, answer_grounded = output['choices'][0]['message']['content'].split("\n\n")
            message = [{"role": "user", "content":question}]
            answer = llm(message)['choices'][0]['message']['content']
            data.append({
                "question":question,
                "grounded_answer":answer_grounded,
                "answer":answer,
                "context":text,
                "source":page,
            })
        except Exception as e:
            print(e)
    return data
        
        

https://en.wikipedia.org/wiki/2023#January

In [27]:
with open("/Users/shahules/belar/experimental/pages.txt", "r") as file:
    pages = file.read()


In [28]:
# output = llm(Question_gen.format(c.text))

In [29]:
# question, answer = output['choices'][0]['message']['content'].split("\n\n")

In [30]:
pages = pages.split('\n')[:-1]

In [31]:
pages = ["Volcanism on Venus","Pandemic prevention","Jupiter Icy Moons Explorer"]

In [32]:
data_sample = generate_dataset(pages)

100%|█████████████████████████████████████████████████████| 3/3 [00:54<00:00, 18.14s/it]


In [34]:
data = json.load(open("/Users/shahules/belar/experimental/ragas_wiki_eval.json"))
data.extend(data_sample)

In [35]:
len(data)

50

In [36]:
with open("ragas-eval-data.json",'w') as file:
    json.dump(data, file, indent=4)

In [50]:
"""
    get_recent_changes.py

    MediaWiki API Demos
    Demo of `RecentChanges` module: Get the three most recent changes with
    sizes and flags

    MIT License
"""

import requests

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "format": "json",
    "rcprop": "title|timestamp|tags",
    "list": "recentchanges",
    "action": "query",
    "rclimit": "1000",
    "rctype":"new"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

RECENTCHANGES = DATA['query']['recentchanges']
remove_tags = ["User:","Category:","Draft","User talk:","Talk:","Wikipedia:","Template talk:"]

for rc in RECENTCHANGES:
    title = str(rc['title'])
    timestamp = str(rc['timestamp'])
    if not any([x in title for x in remove_tags]) and "2021" not in timestamp:
        
        print(title)


Sorn-khiri Sriprachuap
Methane synthesis
1993–94 BCFC season
Hagawi
Populus mexicana
Gracie Lawrence
Gracie Bea Lawrence
Laura Cornelius
A. D. Macklin
Polyosma hirsuta
Maltzanella
Molina Seca
The Life of the Party (play)
Ramnagar Assembly constituency (disambiguation)
Blue Angel (Park Ji-yoon album
A.D. Macklin
Blanca Wiethüchter López
Frances Hughes
Gubad Ibadoghlu
2023 BWF World Championships – Men's doubles
Jesús Urzagasti Aguilera
1925 Reform Party (New Zealand) leadership election
Ana Pinho Rodrigues
Olympian 6
80th Venice Film Festival
2023 Venice Film Festival
2023 Venice International Film Festival
We Find the Bunyip
C-ya-laterrrr
Evan asano
13th Politburo Standing Committee of the Chinese Communist Party
Chase Blasi
Hateful speech
Template:WIR-280
Archaeology in Africa
Home (Secret Invasion)
Anil Thadani
Love Me a Little
Wayne Stevens (disambiguation)
Wayne Osborne (disambiguation)
Wayne Hughes (disambiguation)
Wayne Hammond (disambiguation)
Thai Empire (disambiguation)
Oxygen

In [286]:
data = json.load(open("ragas-data.json"))

In [94]:
evading = ["I'm sorry", "as an AI language model"]

In [99]:
topics_regen = [item for item in data if any(x.lower() in item['answer'].lower() for x in evading)]

In [101]:
message = [{"role": "system", "content": "You're a bot that answers any given question. If you dont know the exact answer make up one."},]

In [105]:
for item in topics_regen:
    message = [{"role": "system", "content": "You're a bot that answers any given question. If you dont know the exact answer make up one."},]
    message.append({"role":"user","content":item['question']})
    output = llm(message)
    item['answer'] = output['choices'][0]['message']['content']

In [42]:
for item in tqdm(data):
    titles = get_backlink_titles(item["source"])
    context = [get_page_section(item,chars=1000) for item in titles[:2]]
    context.insert(0,item['context'])
    item["context_retrieved"] = context

100%|███████████████████████████████████████████████████| 50/50 [01:23<00:00,  1.67s/it]


In [119]:
with open("ragas_wiki_evalv1.json",'w') as file:
    json.dump(data, file, indent=4)

In [56]:
Dataset.from_json("ragas-eval-data.json").push_to_hub("explodinggradients/wiki-eval")

Found cached dataset json (/Users/shahules/.cache/huggingface/datasets/json/default-d425d908fb97025a/0.0.0)
Pushing dataset shards to the dataset hub:   0%|                  | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████| 1/1 [00:00<00:00, 50.02ba/s][A

Upload 1 LFS files:   0%|                                         | 0/1 [00:00<?, ?it/s][A
Upload 1 LFS files: 100%|█████████████████████████████████| 1/1 [00:05<00:00,  5.34s/it][A
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:06<00:00,  6.27s/it]


## Generate low relevancy answers

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
irr_answer = """
Answer the given question partially.
Question:{}
"""

In [12]:
dataset = load_dataset("explodinggradients/wiki-eval")

Downloading and preparing dataset None/None to /Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--wiki-eval-f0df84e235efd078/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files: 100%|████████| 1/1 [00:00<00:00, 89.46it/s]
Extracting data files: 100%|████████| 1/1 [00:00<00:00, 389.52it/s]
                                                                   

Dataset parquet downloaded and prepared to /Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--wiki-eval-f0df84e235efd078/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|███████████████████████████████| 1/1 [00:00<00:00, 224.16it/s]


In [29]:
answers = []
for item in dataset['train']:
    question = item['question']
    message = []
    message.append({"role":"user","content":irr_answer.format(question)})
    while True:
        try:
            answer = llm(message)['choices'][0]['message']['content']
        except Exception as e:
            continue
        break
    answers.append(answer)


In [31]:
dataset = dataset["train"].add_column("partial_answer",answers)

In [32]:
dataset.push_to_hub("explodinggradients/wiki-eval")

Pushing dataset shards to the dataset hub:   0%| | 0/1 [00:00<?, ?i
Creating parquet from Arrow format: 100%|█| 1/1 [00:00<00:00, 79.01[A

Upload 1 LFS files:   0%|                    | 0/1 [00:00<?, ?it/s][A
Upload 1 LFS files: 100%|████████████| 1/1 [00:06<00:00,  6.60s/it][A
Pushing dataset shards to the dataset hub: 100%|█| 1/1 [00:07<00:00
Deleting unused files from dataset repository: 100%|█| 1/1 [00:00<0
Updating downloaded metadata with the new split.


In [231]:
with open("ragas_wiki_evalv1.json",'w') as file:
    json.dump(data, file, indent=4)

In [17]:
dataset = load_dataset("explodinggradients/wiki-eval")

Found cached dataset parquet (/Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--wiki-eval-80d4ef132547f2df/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 241.32it/s]


In [18]:
dataset = dataset.select_columns(["question","source","answer","grounded_answer","answer_relevancy_v1","context_v1","context_v2"])

In [19]:
dataset = dataset.rename_columns({"answer":"ungrounded_answer","answer_relevancy_v1":"poor_answer"})

In [20]:
dataset.push_to_hub("explodinggradients/WikiEval")

Pushing split train to the Hub.
Pushing dataset shards to the dataset hub:   0%|                  | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|█████████████████| 1/1 [00:00<00:00, 97.16ba/s][A

Upload 1 LFS files:   0%|                                         | 0/1 [00:00<?, ?it/s][A
Upload 1 LFS files: 100%|█████████████████████████████████| 1/1 [00:06<00:00,  6.32s/it][A
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:07<00:00,  7.51s/it]
Deleting unused files from dataset repository: 100%|██████| 1/1 [00:00<00:00,  2.49it/s]
Downloading metadata: 100%|████████████████████████| 1.06k/1.06k [00:00<00:00, 3.07MB/s]


In [4]:
import json
def record_score(scores,name):
    data = json.load(open("ragas_abblation.json"))
    data[name] = scores
    with open("ragas_abblation.json",'w') as file:
        json.dump(data,file,indent=4)
        
def read_score(score_name):
    data = json.load(open("ragas_abblation.json"))
    return data[score_name]

## Ablation studies

In [5]:
from datasets import load_dataset
import numpy as np

In [6]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
)
from ragas import evaluate
from ragas.metrics.context_relevance import ContextRelevancy

In [7]:
context_relevancy = ContextRelevancy(strictness=1)

In [8]:
wikieval = load_dataset("explodinggradients/WikiEval")

Found cached dataset parquet (/Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--WikiEval-3b60abf6f625ac40/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.36it/s]


In [7]:
wikieval = wikieval['train'].rename_columns({"grounded_answer":"answer","context_v1":"contexts"})

In [19]:
wikieval1 = wikieval.select_columns(['answer', 'question','contexts'])

In [20]:
results = evaluate(dataset=wikieval1,metrics=[context_relevancy,faithfulness,answer_relevancy])

evaluating with [context_relevancy]


100%|█████████████████████████████████████████████████████| 4/4 [04:54<00:00, 73.72s/it]


evaluating with [faithfulness]


100%|████████████████████████████████████████████████████| 4/4 [08:41<00:00, 130.45s/it]


evaluating with [answer_relevancy]


100%|█████████████████████████████████████████████████████| 4/4 [02:01<00:00, 30.46s/it]


In [22]:
wikieval1.select(range(0,20)).to_csv("/Users/shahules/Downloads/wikieval_sample20.csv")

Creating CSV from Arrow format: 100%|█████████████████████| 1/1 [00:00<00:00, 10.41ba/s]


87063

In [23]:
results

{'ragas_score': 0.6130, 'context_relevancy': 0.3521, 'faithfulness': 0.9865, 'answer_relevancy': 0.9613}

In [24]:
results.to_pandas()

Unnamed: 0,answer,question,contexts,context_relevancy,faithfulness,answer_relevancy
0,Answer: The PSLV-C56 mission is scheduled to b...,Question: When is the scheduled launch date an...,[The PSLV-C56 is the 58th mission of Indian Sp...,0.4,1.0,0.942336
1,Answer: The objective of the Uzbekistan-Afghan...,Question: What is the objective of the Uzbekis...,[The Uzbekistan–Afghanistan–Pakistan Railway P...,1.0,1.0,0.949762
2,Answer: PharmaCann was founded in 2014 by Theo...,Question: When was PharmaCann founded and what...,"[Found in 2014 by Theodore Scott, PharmaCann i...",0.333333,1.0,0.963253
3,Answer: Christopher Nolan directed the film Op...,Question: Who directed the film Oppenheimer an...,[Oppenheimer is a 2023 biographical thriller f...,0.038462,1.0,0.984808
4,"Answer: Theranostics, also known as theragnost...",Question: What is theranostics and how does it...,"[Theranostics, also known as theragnostics, is...",0.457143,1.0,0.98118
5,Answer: The human climate niche refers to the ...,Question: What is the human climate niche and ...,[The human climate niche is the ensemble of cl...,0.666667,1.0,0.981146
6,Answer: Dasypoda radchenkoi belongs to the gen...,Question: What is the taxonomy of Dasypoda rad...,"[Dasypoda radchenkoi, also known as Radchenko'...",0.2,1.0,0.969072
7,Answer: The main product of Fremantle Octopus ...,Question: What is the main product of Fremantl...,[Fremantle Octopus is an Australian octopus fi...,0.545455,1.0,0.971285
8,Answer: The Managing Director of FoodFutureCo ...,Question: Who is the Managing Director of Food...,[FoodFutureCo is a scale-up accelerator for pu...,0.545455,1.0,0.97758
9,Answer: The purpose of designing and building ...,Question: What was the purpose of designing an...,[The Fiat Ecobasic is a concept car designed b...,0.333333,1.0,0.981272


### Concat reannoated samples with all samples

In [114]:
wikieval_og = load_dataset("explodinggradients/WikiEval")
wikieval_local = Dataset.from_csv("/Users/shahules/Downloads/wikieval_sample20 - wikieval_sample20.csv.csv")

Found cached dataset parquet (/Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--WikiEval-3b60abf6f625ac40/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 305.04it/s]
Found cached dataset csv (/Users/shahules/.cache/huggingface/datasets/csv/default-7607020e99f8dfe0/0.0.0)


In [116]:
wikieval = wikieval_og['train'].select(range(0,20))

In [117]:
wikieval_local = wikieval_local.add_column('poor_answer', wikieval['poor_answer'])

In [118]:
wikieval_local = wikieval_local.add_column('source', wikieval['source'])

In [119]:
wikieval_local = wikieval_local.rename_columns({'contexts':'context_v1'})
wikieval_local = wikieval_local.map(lambda x : {"context_v1":eval(x['context_v1'])})
wikieval_local = wikieval_local.map(lambda x : {"context_v2":eval(x['context_v2'])})

                                                                                        

In [120]:
wikieval_og

DatasetDict({
    train: Dataset({
        features: ['ungrounded_answer', 'source', 'grounded_answer', 'question', 'poor_answer', 'context_v2', 'context_v1'],
        num_rows: 50
    })
})

In [121]:
from datasets import concatenate_datasets

In [123]:
wikieval_latest = concatenate_datasets([wikieval_local,wikieval_og['train'].select(range(20,50))])

## reannotates sample
Test assumptions
1. Ragas scores ranked correctly
2. GPT3.5 scores 
3. GPT3.5 rankings

In [9]:
from datasets import Dataset
# from ragas.metrics.context_relevance import sent_tokenize as ragas_tokenizer

In [124]:
results = evaluate(dataset=wikieval_local,metrics=[faithfulness],column_map={"context_v1":"contexts"})

KeyError: 'answer'

In [21]:
record_score(results.to_pandas()['faithfulness'].values.tolist(),"ungrounded_faithfulness_v1")

In [22]:
results.to_pandas()['faithfulness'].values.tolist()

[0.5,
 0.5,
 0.5,
 0.5,
 0.8,
 0.75,
 0.6,
 0.0,
 0.33333333333333337,
 0.0,
 0.5,
 0.0,
 0.5,
 0.6666666666666667,
 0.33333333333333337,
 0.5,
 0.33333333333333337,
 0.6,
 0.6666666666666667,
 0.0]

In [16]:
results.to_pandas()['faithfulness'].values.tolist()

[1.0,
 0.6666666666666667,
 1.0,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.875,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [21]:
comp_matrix = np.vstack([read_score("gt_context_v1"),read_score("context_v2_v1")])

In [26]:
comp_matrix[0,:] >= comp_matrix[1,:]

array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True])

In [28]:
np.where(comp_matrix[0,:] < comp_matrix[1,:])

(array([ 3, 11]),)

In [23]:
len(ragas_tokenizer(wikieval1[1]['contexts'][0]))

10

In [83]:
from nltk.tokenize import sent_tokenize

In [88]:
len(sent_tokenize(wikieval1[1]['contexts'][0]))

9

In [91]:
import pysbd
seg = pysbd.Segmenter(language="en", clean=False)
len(seg.segment(wikieval1[1]['contexts'][0]))

10

#### Annotate samples using gpt direct scoring

In [68]:
metric_name = "answer_relevancy"
definition = """answer relevancy: refers to the degree to which a response directly addresses and is appropriate for a given question or context. Penalize incomplete or reduntant information in the answer"""

In [72]:
prompt = """
{definition}
Given question and answer, assign a score for {metric_name} in the range 0-10.
question:\n{question}
answer:\n{answer}
{metric_name} score:
"""

In [80]:
item = wikieval[2]

In [83]:
prompt_input = prompt.format(definition=definition,metric_name=metric_name,
                             question=item['question'],
                             answer=item['grounded_answer'])

In [84]:
llm(prompt_input)

<OpenAIObject chat.completion id=chatcmpl-7yHqQgOC2q8W4x876LorxX9Im4bP3 at 0x7f78d1192570> JSON: {
  "id": "chatcmpl-7yHqQgOC2q8W4x876LorxX9Im4bP3",
  "object": "chat.completion",
  "created": 1694602630,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "8"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 115,
    "completion_tokens": 1,
    "total_tokens": 116
  }
}

### Annotate using gpt ranking

In [1]:
prompt = """
{definition}
Given question and two contexts , output rank of each context based on {metric_name}
question:\n{question}
context1:\n{context1}
context2:\n{context2}
output ranks:
"""