In [None]:
!pip install beir datasets

Collecting beir
  Downloading beir-2.0.0.tar.gz (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting sentence-transformers (from beir)
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting pytrec_eval (from beir)
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu (from beir)
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Dow

In [None]:
from beir.datasets.data_loader import GenericDataLoader
from beir import util as beir_util

  from tqdm.autonotebook import tqdm


In [None]:
# BEIR utility to download the dataset
dataset = "fiqa"  # BEIR dataset name; interpolated into the download URL below
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
out_dir = beir_util.download_and_unzip(url, "datasets")

# Load the test split of the dataset.
# NOTE(review): later cells rebind `corpus` and `queries` from the raw JSONL
# files and read qrels from train.tsv, so these test-split values look unused — confirm.
corpus, queries, qrels = GenericDataLoader(out_dir).load(split="test")


  0%|          | 0/57638 [00:00<?, ?it/s]

In [None]:
import json

import pandas as pd

# Parse corpus.jsonl line by line into a list of dicts.
# The encoding is explicit so decoding does not depend on the platform's
# default locale, and whitespace-only lines are skipped because json.loads
# raises on empty input.
with open('/content/datasets/fiqa/corpus.jsonl', 'r', encoding='utf-8') as f:
    corpus = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame for easier handling
corpus_df = pd.DataFrame(corpus)


In [None]:
# Preview the first rows of the corpus frame
corpus_df.head()

Unnamed: 0,_id,title,text,metadata
0,3,,I'm not saying I don't like the idea of on-the...,{}
1,31,,So nothing preventing false ratings besides ad...,{}
2,56,,You can never use a health FSA for individual ...,{}
3,59,,Samsung created the LCD and other flat screen ...,{}
4,63,,Here are the SEC requirements: The federal sec...,{}


In [None]:
# Parse queries.jsonl line by line into a list of dicts.
# The encoding is explicit so decoding does not depend on the platform's
# default locale, and whitespace-only lines are skipped because json.loads
# raises on empty input.
with open('/content/datasets/fiqa/queries.jsonl', 'r', encoding='utf-8') as f:
    queries = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame
queries_df = pd.DataFrame(queries)


In [None]:
# Preview the first rows of the queries frame
queries_df.head()

Unnamed: 0,_id,text,metadata
0,0,What is considered a business expense on a bus...,{}
1,4,Business Expense - Car Insurance Deductible Fo...,{}
2,5,Starting a new online business,{}
3,6,“Business day” and “due date” for bills,{}
4,7,New business owner - How do taxes work for the...,{}


In [None]:
# Label-based slice: .loc[:2] includes the end label, so rows 0, 1 and 2 are kept
data = corpus_df['text'].loc[:2]
data

Unnamed: 0,text
0,I'm not saying I don't like the idea of on-the...
1,So nothing preventing false ratings besides ad...
2,You can never use a health FSA for individual ...


In [None]:
# Read the training qrels TSV and give its columns the names the later merges use
qrels_df = (
    pd.read_csv('/content/datasets/fiqa/qrels/train.tsv', sep='\t')
    .rename(columns={'query-id': 'query_id', 'corpus-id': 'doc_id', 'score': 'relevance'})
)
qrels_df.head()  # Preview the first rows


Unnamed: 0,query_id,doc_id,relevance
0,0,18850,1
1,4,196463,1
2,5,69306,1
3,6,560251,1
4,6,188530,1


In [None]:
# Show the column labels of both raw frames (corpus first, then queries)
print(corpus_df.columns, queries_df.columns, sep="\n")


Index(['_id', 'title', 'text', 'metadata'], dtype='object')
Index(['_id', 'text', 'metadata'], dtype='object')


In [None]:
# Give each frame descriptive id/text column names and discard 'metadata'
corpus_df = (
    corpus_df
    .rename(columns={'_id': 'doc_id', 'text': 'document_text'})
    .drop(columns=['metadata'])
)
queries_df = (
    queries_df
    .rename(columns={'_id': 'query_id', 'text': 'query_text'})
    .drop(columns=['metadata'])
)


In [None]:
# Remove the 'title' column from the corpus frame
corpus_df = corpus_df.drop(columns=['title'])

In [None]:
# Print the first rows of both renamed frames as plain text:
# corpus_df ('doc_id', 'document_text') first, then queries_df ('query_id', 'query_text')
print(corpus_df.head(), queries_df.head(), sep="\n")

  doc_id                                      document_text
0      3  I'm not saying I don't like the idea of on-the...
1     31  So nothing preventing false ratings besides ad...
2     56  You can never use a health FSA for individual ...
3     59  Samsung created the LCD and other flat screen ...
4     63  Here are the SEC requirements: The federal sec...
  query_id                                         query_text
0        0  What is considered a business expense on a bus...
1        4  Business Expense - Car Insurance Deductible Fo...
2        5                     Starting a new online business
3        6            “Business day” and “due date” for bills
4        7  New business owner - How do taxes work for the...


In [None]:
# Inspect the column dtypes of the queries frame
queries_df.dtypes

Unnamed: 0,0
query_id,object
query_text,object


In [None]:
# Cast the id columns used as merge keys to int. The ids loaded from JSONL are
# object dtype (see the dtypes output for queries_df), so they are converted here.
# NOTE(review): qrels_df['doc_id'] is not cast; presumably read_csv already
# parsed it as an integer column — confirm.
qrels_df['query_id'] = qrels_df['query_id'].astype(int)
corpus_df['doc_id'] = corpus_df['doc_id'].astype(int)
queries_df['query_id'] = queries_df['query_id'].astype(int)

In [None]:
# Join the relevance judgements with the query text on 'query_id', then with
# the document text on 'doc_id'. merge defaults to an inner join, so only rows
# matched on both keys remain.
merged_df = qrels_df.merge(queries_df, on='query_id').merge(corpus_df, on='doc_id')

# Show the joined query, document and relevance columns
print(merged_df[['query_id', 'doc_id', 'query_text', 'document_text', 'relevance']].head())

   query_id  doc_id                                         query_text  \
0         0   18850  What is considered a business expense on a bus...   
1         4  196463  Business Expense - Car Insurance Deductible Fo...   
2         5   69306                     Starting a new online business   
3         6  560251            “Business day” and “due date” for bills   
4         6  188530            “Business day” and “due date” for bills   

                                       document_text  relevance  
0  The IRS Guidance pertaining to the subject.  I...          1  
1  As a general rule, you must choose between a m...          1  
2  Most US states have rules that go something li...          1  
3  I don't believe Saturday is a business day eit...          1  
4  You definitely have an argument for getting th...          1  


In [None]:
# Preview the first rows of the merged frame
merged_df.head()

Unnamed: 0,query_id,doc_id,relevance,query_text,document_text
0,0,18850,1,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,4,196463,1,Business Expense - Car Insurance Deductible Fo...,"As a general rule, you must choose between a m..."
2,5,69306,1,Starting a new online business,Most US states have rules that go something li...
3,6,560251,1,“Business day” and “due date” for bills,I don't believe Saturday is a business day eit...
4,6,188530,1,“Business day” and “due date” for bills,You definitely have an argument for getting th...


In [None]:
# Character counts of each query and document, stored as new columns
merged_df['query_length'] = merged_df['query_text'].map(len)
merged_df['doc_length'] = merged_df['document_text'].map(len)

# Summary statistics of both length columns. The recorded output shows a
# minimum doc_length of 0, i.e. at least one merged row has an empty document.
print(merged_df.loc[:, ['query_length', 'doc_length']].describe())


       query_length    doc_length
count  14166.000000  14166.000000
mean      62.115558   1028.606593
std       22.648055    911.420284
min       14.000000      0.000000
25%       46.000000    440.000000
50%       60.000000    780.000000
75%       76.000000   1318.000000
max      158.000000  13080.000000


In [None]:
# Count how many merged rows carry each relevance label
print(merged_df['relevance'].value_counts())  # Check the distribution of relevance labels

relevance
1    14166
Name: count, dtype: int64


In [None]:
# Print the query and document text of the row at position 10. The columns are
# selected by name rather than by the positional slice 3:5, so the result does
# not depend on the order in which columns were added to merged_df.
print(merged_df.iloc[10][['query_text', 'document_text']])

query_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [None]:
# Copy merged_df so that later changes to df do not modify merged_df
df = merged_df.copy()

In [None]:
# Remove the two length columns from df
df = df.drop(columns=['query_length','doc_length'])

In [None]:
# Preview the first rows of df
df.head()

Unnamed: 0,query_id,doc_id,relevance,query_text,document_text
0,0,18850,1,What is considered a business expense on a business trip?,"The IRS Guidance pertaining to the subject. In general the best I can say is your business expense may be deductible. But it depends on the circumstances and what it is you want to deduct. Travel Taxpayers who travel away from home on business may deduct related expenses, including the cost of reaching their destination, the cost of lodging and meals and other ordinary and necessary expenses. Taxpayers are considered “traveling away from home” if their duties require them to be away from home substantially longer than an ordinary day’s work and they need to sleep or rest to meet the demands of their work. The actual cost of meals and incidental expenses may be deducted or the taxpayer may use a standard meal allowance and reduced record keeping requirements. Regardless of the method used, meal deductions are generally limited to 50 percent as stated earlier. Only actual costs for lodging may be claimed as an expense and receipts must be kept for documentation. Expenses must be reasonable and appropriate; deductions for extravagant expenses are not allowable. More information is available in Publication 463, Travel, Entertainment, Gift, and Car Expenses. Entertainment Expenses for entertaining clients, customers or employees may be deducted if they are both ordinary and necessary and meet one of the following tests: Directly-related test: The main purpose of the entertainment activity is the conduct of business, business was actually conducted during the activity and the taxpayer had more than a general expectation of getting income or some other specific business benefit at some future time. Associated test: The entertainment was associated with the active conduct of the taxpayer’s trade or business and occurred directly before or after a substantial business discussion. 
Publication 463 provides more extensive explanation of these tests as well as other limitations and requirements for deducting entertainment expenses. Gifts Taxpayers may deduct some or all of the cost of gifts given in the course of their trade or business. In general, the deduction is limited to $25 for gifts given directly or indirectly to any one person during the tax year. More discussion of the rules and limitations can be found in Publication 463. If your LLC reimburses you for expenses outside of this guidance it should be treated as Income for tax purposes. Edit for Meal Expenses: Amount of standard meal allowance. The standard meal allowance is the federal M&IE rate. For travel in 2010, the rate for most small localities in the United States is $46 a day. Source IRS P463 Alternately you could reimburse at a per diem rate"
1,4,196463,1,Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip,"As a general rule, you must choose between a mileage deduction or an actual expenses deduction. The idea is that the mileage deduction is supposed to cover all costs of using the car. Exceptions include parking fees and tolls, which can be deducted separately under either method. You explicitly cannot deduct insurance costs if you claim a mileage deduction. Separately, you probably won't be able to deduct the deductible for your car as a casualty loss. You first subtract $100 from the deductible and then divide it by your Adjusted Gross Income (AGI) from your tax return. If your deductible is over 10% of your AGI, you can deduct it. Note that even with a $1500 deductible, you won't be able to deduct anything if you made more than $14,000 for the year. For most people, the insurance deductible just isn't large enough relative to income to be tax deductible. Source"
2,5,69306,1,Starting a new online business,"Most US states have rules that go something like this: You will almost certainly have to pay some registration fees, as noted above. Depending on how you organize, you may or may not need to file a separate tax return for the business. (If you're sole proprietor for tax purposes, then you file on Schedule C on your personal Form 1040.) Whether or not you pay taxes depends on whether you have net income. It's possible that some losses might also be deductible. (Note that you may have to file a return even if you don't have net income - Filing and needing to pay are not the same since your return may indicate no tax due.) In addition, at the state level, you may have to pay additional fees or taxes beyond income tax depending on what you sell and how you sell it. (Sales tax, for example, might come into play as might franchise taxes.) You'll need to check your own state law for that. As always, it could be wise to get professional tax and accounting advice that's tailored to your situation and your state. This is just an outline of some things that you'll need to consider."
3,6,560251,1,“Business day” and “due date” for bills,"I don't believe Saturday is a business day either. When I deposit a check at a bank's drive-in after 4pm Friday, the receipt tells me it will credit as if I deposited on Monday. If a business' computer doesn't adjust their billing to have a weekday due date, they are supposed to accept the payment on the next business day, else, as you discovered, a Sunday due date is really the prior Friday. In which case they may be running afoul of the rules that require X number of days from the time they mail a bill to the time it's due. The flip side to all of this, is to pick and choose your battles in life. Just pay the bill 2 days early. The interest on a few hundred dollars is a few cents per week. You save that by not using a stamp, just charge it on their site on the Friday. Keep in mind, you can be right, but their computer still dings you. So you call and spend your valuable time when ever the due date is over a weekend, getting an agent to reverse the late fee. The cost of 'right' is wasting ten minutes, which is worth far more than just avoiding the issue altogether. But - if you are in the US (you didn't give your country), we have regulations for everything. HR 627, aka The CARD act of 2009, offers - ‘‘(2) WEEKEND OR HOLIDAY DUE DATES.—If the payment due date for a credit card account under an open end consumer credit plan is a day on which the creditor does not receive or accept payments by mail (including weekends and holidays), the creditor may not treat a payment received on the next business day as late for any purpose.’’. So, if you really want to pursue this, you have the power of our illustrious congress on your side."
4,6,188530,1,“Business day” and “due date” for bills,"You definitely have an argument for getting them to reverse the late fee, especially if it hasn't happened very often. (If you are late every month they may be less likely to forgive.) As for why this happens, it's not actually about business days, but instead it's based on when they know that you paid. In general, there are 2 ways for a company to mark a bill as paid: Late Fees: Some systems automatically assign late fees at the start of the day after the due date if money has not been received. In your case, if your bill was due on the 24th, the late fee was probably assessed at midnight of the 25th, and the payment arrived after that during the day of the 25th. You may have been able to initiate the payment on the company's website at 11:59pm on the 24th and not have received a late fee (or whatever their cutoff time is). Suggestion: as a rule of thumb, for utility bills whose due date and amount can vary slightly from month to month, you're usually better off setting up your payments on the company website to pull from your bank account, instead of setting up your bank account to push the payment to the company. This will ensure that you always get the bill paid on time and for the correct amount. If you still would rather push the payment from your bank account, then consider setting up the payment to arrive about 5 days early, to account for holidays and weekends."


In [None]:
# Write df to 'fiqa.csv' (relative path) without the index column
df.to_csv('fiqa.csv', index=False)