In [1]:
import json
import pandas as pd
import os


# Dataset directory. Defaults to the original location; set the
# MULTIHOP_RAG_DATA_DIR environment variable to run this notebook against a
# copy of the dataset stored somewhere else.
data_path = os.environ.get(
    "MULTIHOP_RAG_DATA_DIR", "/mnt/data/wangshu/hcarag/MultiHop-RAG/dataset"
)
corpus_file = os.path.join(data_path, "corpus.json")

# Load corpus.json into a DataFrame.
corpus_df = pd.read_json(corpus_file)

print(corpus_df.shape)

corpus_df.head(2)


(609, 7)


Unnamed: 0,title,author,source,published_at,category,url,body
0,200+ of the best deals from Amazon's Cyber Mon...,,Mashable,2023-11-27 08:45:59+00:00,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire..."
1,ASX set to drop as Wall Street’s September slu...,Stan Choe,The Sydney Morning Herald,2023-09-26 19:11:30+00:00,business,https://www.smh.com.au/business/markets/asx-se...,"ETF provider Betashares, which manages $30 bil..."


In [2]:
def format_row_as_article(row):
    """Render one corpus row as a Markdown-style article block.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'title', 'body', 'author', 'source', 'published_date' and
            'category'.

    Returns:
        str: A "## <title>" heading, a blank line, the body text, a blank
        line, and a one-sentence provenance line followed by a blank line.
        The "authored by" clause is left out when the author is null, and
        the "on <date>" clause is left out when the published date is null
        (for example a value coerced to NaT/None by a failed date parse),
        so the sentence never reads "on NaT" or "on None".
    """
    # Replace each pair of consecutive newlines with a single newline.
    body_text = row['body'].replace('\n\n', '\n')

    # Sentence opening, with or without the author clause.
    if pd.notnull(row['author']):
        opening = f"This article, authored by {row['author']}, was"
    else:
        opening = "This article was"

    # Only mention the date when there is one to mention.
    published_date = row['published_date']
    date_clause = f" on {published_date}" if pd.notnull(published_date) else ""

    return (
        f"## {row['title']}\n\n"
        f"{body_text}\n\n"
        f"{opening} published by {row['source']}{date_clause} "
        f"under the category '{row['category']}'.\n\n"
    )


# Keep only the calendar date of each publication timestamp (the time of day
# is dropped); values that cannot be parsed are coerced to null, not raised.
published_ts = pd.to_datetime(corpus_df["published_at"], errors="coerce")
corpus_df["published_date"] = published_ts.dt.date

# Render every row as article text and store it alongside the raw columns.
corpus_df["content"] = corpus_df.apply(format_row_as_article, axis=1)
corpus_df.head(2)

Unnamed: 0,title,author,source,published_at,category,url,body,published_date,content
0,200+ of the best deals from Amazon's Cyber Mon...,,Mashable,2023-11-27 08:45:59+00:00,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",2023-11-27,## 200+ of the best deals from Amazon's Cyber ...
1,ASX set to drop as Wall Street’s September slu...,Stan Choe,The Sydney Morning Herald,2023-09-26 19:11:30+00:00,business,https://www.smh.com.au/business/markets/asx-se...,"ETF provider Betashares, which manages $30 bil...",2023-09-26,## ASX set to drop as Wall Street’s September ...


In [None]:
# Keep the first article encountered for each distinct title.
title_set = set()
all_title_list = []
for title, content in zip(corpus_df["title"], corpus_df["content"]):
    if title in title_set:
        continue  # a row with this title was already kept
    title_set.add(title)
    all_title_list.append({"title": title, "content": content})

# One row per unique title, numbered sequentially from 0.
corpus_df_new = pd.DataFrame(all_title_list)
corpus_df_new["id"] = range(len(corpus_df_new))
print(corpus_df_new.shape)
corpus_df_new.head(2)


(609, 3)


Unnamed: 0,title,content,id
0,200+ of the best deals from Amazon's Cyber Mon...,## 200+ of the best deals from Amazon's Cyber ...,0
1,ASX set to drop as Wall Street’s September slu...,## ASX set to drop as Wall Street’s September ...,1


In [4]:
# Write the de-duplicated corpus into the dataset directory. The path is built
# from data_path (as the mh-corpus.txt export does) instead of repeating the
# directory literal. orient="records" with lines=True emits JSON Lines:
# one JSON object per line.
save_path = os.path.join(data_path, "rag_multihop_corpus.json")
corpus_df_new.to_json(save_path, orient="records", lines=True)

In [11]:
save_path = os.path.join(data_path, "mh-corpus.txt")
# Render every row as article text and concatenate it into one document.
complete_text = "".join(corpus_df.apply(format_row_as_article, axis=1))

# Write to mh-corpus.txt with an explicit UTF-8 encoding. The corpus contains
# non-ASCII characters, and without `encoding` the result depends on the
# platform locale (and may fail with UnicodeEncodeError).
with open(save_path, "w", encoding="utf-8") as file:
    file.write(complete_text)

print(f"文本已保存到 {save_path} 文件中。")

文本已保存到 /mnt/data/wangshu/hcarag/MultiHop-RAG/dataset/mh-corpus.txt 文件中。


In [2]:
# Analyze each column


def _most_common(series):
    """Return the first modal value of `series`, or None when it has no mode."""
    modes = series.mode()
    # mode() comes back empty for an empty or all-null column; indexing it
    # unguarded would raise, so fall back to None.
    return modes.iloc[0] if not modes.empty else None


def _summarize_column(series, top=True, lengths=False):
    """Build the summary dict for one column.

    Always reports 'count' (non-null values) and 'unique' (distinct non-null
    values). Adds 'top' (most frequent value) when `top` is true and
    'lengths' (describe() of the string lengths) when `lengths` is true, in
    that order.
    """
    summary = {
        'count': series.count(),
        'unique': series.nunique(),
    }
    if top:
        summary['top'] = _most_common(series)
    if lengths:
        summary['lengths'] = series.str.len().describe()
    return summary


# Published At: make sure the column is datetime-typed before taking min/max.
# NOTE: this overwrites corpus_df['published_at'] in place.
corpus_df['published_at'] = pd.to_datetime(corpus_df['published_at'])

analysis_results = {
    'title': _summarize_column(corpus_df['title'], lengths=True),
    'author': _summarize_column(corpus_df['author']),
    'source': _summarize_column(corpus_df['source']),
    'published_at': {
        'min': corpus_df['published_at'].min(),
        'max': corpus_df['published_at'].max(),
        'count': corpus_df['published_at'].count(),
        'unique': corpus_df['published_at'].nunique(),
    },
    'category': _summarize_column(corpus_df['category']),
    'url': _summarize_column(corpus_df['url']),
    # Bodies are effectively unique, so a "top" value is not reported.
    'body': _summarize_column(corpus_df['body'], top=False, lengths=True),
}

# Display the analysis results
for column, stats in analysis_results.items():
    print(f"Analysis for '{column}':")
    for stat, value in stats.items():
        print(f"  {stat}: {value}")
    print()

Analysis for 'title':
  count: 609
  unique: 609
  top: "There are no sacred numbers"
  lengths: count    609.000000
mean      79.067323
std       26.662247
min       24.000000
25%       59.000000
50%       78.000000
75%       96.000000
max      212.000000
Name: title, dtype: float64

Analysis for 'author':
  count: 545
  unique: 300
  top: Natasha Lomas

Analysis for 'source':
  count: 609
  unique: 49
  top: Sporting News

Analysis for 'published_at':
  min: 2023-09-26 05:26:05+00:00
  max: 2023-12-25 23:36:00+00:00
  count: 609
  unique: 606

Analysis for 'category':
  count: 609
  unique: 6
  top: sports

Analysis for 'url':
  count: 609
  unique: 609
  top: https://arizona.rivals.com/news/recap-no-17-arizona-dominates-no-22-utah-42-18-on-senior-day

Analysis for 'body':
  count: 609
  unique: 609
  lengths: count      609.000000
mean     10340.182266
std       7809.231296
min       4770.000000
25%       6112.000000
50%       7836.000000
75%      11471.000000
max      71034.000000
