In [3]:
import re
import json
import nltk
import random
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Preprocessing 

In [6]:
# --- Paths: the two input CSV splits and the JSON-lines file written below ---
train_csv = "train.csv"
test_csv = "test.csv"
output_file = "data.txt"

# --- Text-processing helpers ---
# The stopword corpus has to be fetched before stopwords.words() can read it.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))  # a set, used for membership tests
stemmer = PorterStemmer()
tqdm.pandas()  # enables .progress_apply on pandas objects

# --- Read both splits, stack them, then shuffle the rows reproducibly ---
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
df = (
    pd.concat([train_df, test_df], ignore_index=True)
    .sample(frac=1, random_state=42)  # frac=1 with a fixed seed: a repeatable full shuffle
    .reset_index(drop=True)           # discard the pre-shuffle index
)

# === Preprocessing Function ===
def preprocess(text):
    """Normalize one raw document into a space-separated string of stemmed tokens.

    The text is lower-cased, split into runs of word characters, filtered
    against the module-level ``stop_words`` set, and each surviving token is
    stemmed with the module-level ``stemmer``.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example the
            ``NaN`` pandas produces when a string column is concatenated with a
            missing value, or ``None``) is treated as an empty document.

    Returns:
        str: The stemmed, stopword-free tokens joined by single spaces; ``""``
        when the input is not a string or contains no usable tokens.
    """
    # Missing values reach this function as float NaN / None, which have no
    # .lower(); map them to an empty document instead of raising.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    # Stopwords are removed before stemming, so the comparison is against the
    # lower-cased surface form of each word.
    filtered = [stemmer.stem(w) for w in words if w not in stop_words]
    return ' '.join(filtered)

# === Apply Preprocessing ===
# Missing titles/contents are replaced by "" before concatenation: str + NaN
# yields NaN, which would drop the whole row's text (and hand preprocess a
# float) even when the other field is present.
df['text'] = (
    df['title'].fillna("") + " " + df['content'].fillna("")
).progress_apply(preprocess)

# === Write to Output File ===
# One JSON object per line: {"text": <preprocessed text>, "cluster": <label>}.
# json.dumps escapes non-ASCII characters by default, so the explicit encoding
# only removes the dependence on the platform's default text encoding.
with open(output_file, 'w', encoding='utf-8') as f_out:
    # Iterate over just the two needed columns instead of building a Series
    # per row with iterrows().
    for text, label in zip(df['text'], df['label']):
        record = {
            "text": text,
            "cluster": int(label)  # use original label
        }
        f_out.write(json.dumps(record) + "\n")

print(f"Preprocessing complete. Output saved to: {output_file}")

[nltk_data] Downloading package stopwords to /home/reda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 630000/630000 [02:52<00:00, 3660.44it/s]


Preprocessing complete. Output saved to: data.txt


## Statistics

In [7]:
# Reload the preprocessed corpus: each line is a JSON object with the keys
# "text" and "cluster".
texts = []
clusters = []

with open("data.txt", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        texts.append(record["text"])
        clusters.append(record["cluster"])

n = len(texts)              # number of documents
K = len(set(clusters))      # number of distinct cluster labels

# Tokenize by whitespace: the stored texts are already space-joined tokens.
tokenized_docs = [doc.split() for doc in texts]

# Vocabulary = set of all distinct tokens across the corpus.
vocab = {word for doc in tokenized_docs for word in doc}
vocab_size = len(vocab)

# Mean number of tokens per document; report 0.0 for an empty corpus instead
# of raising ZeroDivisionError.
avg_len = sum(len(doc) for doc in tokenized_docs) / n if n else 0.0

print(f"Number of documents (n): {n}")
print(f"Number of clusters (K): {K}")
print(f"Vocabulary size: {vocab_size}")
print(f"Average document length: {avg_len:.2f} tokens")

Number of documents (n): 630000
Number of clusters (K): 14
Vocabulary size: 666482
Average document length: 32.81 tokens


## Extract a subset of documents

In [10]:

input_file = "data.txt"
min_docs = 100000  # maximum number of documents to keep in the subset

# Reload the preprocessed corpus: each line is a JSON object with the keys
# "text" and "cluster".
texts = []
clusters = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        texts.append(record["text"])
        clusters.append(record["cluster"])

tokenized_docs = [doc.split() for doc in texts]
# Number of distinct tokens in each individual document.
doc_vocab_sizes = [len(set(doc)) for doc in tokenized_docs]

docs_with_stats = list(zip(tokenized_docs, clusters, doc_vocab_sizes))
# sorted() is stable, so documents with equal vocabulary size keep file order.
docs_with_stats_sorted = sorted(docs_with_stats, key=lambda x: x[2])

# Keep the (up to) min_docs documents with the smallest per-document vocabulary.
# This is a greedy heuristic for a small combined vocabulary, not a true minimum.
selected = docs_with_stats_sorted[:min_docs]
selected_docs = [doc for doc, _, _ in selected]
selected_clusters = [cluster for _, cluster, _ in selected]

# The slice may return fewer than min_docs items when the corpus is smaller,
# so every statistic below is based on the real count.
n_selected = len(selected_docs)

# Compute combined vocab size
combined_vocab = {word for doc in selected_docs for word in doc}
combined_vocab_size = len(combined_vocab)

# Average doc length over the documents actually selected (0.0 if none).
avg_len = sum(len(doc) for doc in selected_docs) / n_selected if n_selected else 0.0

print(f"Selected {n_selected} documents")
print(f"Minimal combined vocabulary size (heuristic): {combined_vocab_size}")
print(f"Average document length: {avg_len:.2f} tokens")


Selected 100000 documents
Minimal combined vocabulary size (heuristic): 116154
Average document length: 11.37 tokens
