In [2]:
import pandas as pd

In [3]:
news_agg = pd.read_csv("data/kaggle/uci-news-aggregator.csv")
print(news_agg.shape)
print(news_agg.columns.tolist())

(422419, 8)
['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']


In [4]:
clickbait = pd.read_csv("data/clickbait/clickbait_data.csv")
print(clickbait.shape)
print(clickbait.columns.tolist())

(32000, 2)
['headline', 'clickbait']


In [6]:
liar_train = pd.read_csv("data/liar/train.tsv", sep="\t")
print(liar_train.shape)
print(liar_train.columns.tolist())

(10239, 14)
['2635.json', 'false', 'Says the Annies List political group supports third-trimester abortions on demand.', 'abortion', 'dwayne-bohac', 'State representative', 'Texas', 'republican', '0', '1', '0.1', '0.2', '0.3', 'a mailer']


In [7]:
newsroom_train = pd.read_json("data/newsroom/train.jsonl", lines=True)
print(newsroom_train.shape)
print(newsroom_train.columns.tolist())

(995041, 12)
['url', 'archive', 'title', 'date', 'text', 'summary', 'compression', 'coverage', 'density', 'compression_bin', 'coverage_bin', 'density_bin']


In [None]:
def stats(df, text_column):
    text = df[text_column].astype(str).dropna()
    avg_word_count = text.apply(lambda x: len(x.split())).mean()
    vocab = set(" ".join(text.tolist()).split())
    vocab_size = len(vocab)
    duplicate_count = df[text_column].duplicated().sum()
    
    return {
        "No of Variables": df.shape[1],
        "Document Count": df.shape[0],
        "Avg Word Count": int(avg_word_count),
        "Vocab Size": vocab_size,
        "Dup Count": duplicate_count
    }


In [None]:
stats(pd.read_csv("data/kaggle/uci-news-aggregator.csv"), text_column='TITLE')

{'No of Variables': 8,
 'Document Count': 422419,
 'Avg Word Count': 9,
 'Vocab Size': 176490,
 'Dup Count': 15964}

In [None]:
stats(pd.read_csv("data/clickbait/clickbait_data.csv"), text_column='headline')

{'No of Variables': 2,
 'Document Count': 32000,
 'Avg Word Count': 9,
 'Vocab Size': 35789,
 'Dup Count': 0}

In [None]:
stats(pd.read_csv("data/liar/train.tsv", sep="\t"), text_column=liar.columns[2])

{'No of Variables': 14,
 'Document Count': 10239,
 'Avg Word Count': 18,
 'Vocab Size': 21676,
 'Dup Count': 17}

In [None]:
stats(pd.read_json("data/newsroom/train.jsonl", lines=True, nrows=100000), text_column='text')

{'No of Variables': 12,
 'Document Count': 100000,
 'Avg Word Count': 557,
 'Vocab Size': 1386391,
 'Dup Count': 22}

In [55]:
with open("data/newsroom/train.jsonl", "r", encoding="utf-8") as f:
    print(f"Document Count Newsroom: {sum(1 for _ in f)}")

Document Count Newsroom: 995041
