In [1]:
# Cell 1: Install & Setup
# Amra 'fastparquet' use korbo karon eta disk space bachay ar fast kaj kore
!pip install -q datasets pyarrow fastparquet
import pandas as pd
import os
import gc
import json
from datasets import load_dataset

# FINAL OUTPUT FOLDER (Sob data ekhane joma hobe)
OUTPUT_DIR = "swarabyanjan_final_dataset"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("üöÄ Setup Complete. Output Folder Created: swarabyanjan_final_dataset")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hüöÄ Setup Complete. Output Folder Created: swarabyanjan_final_dataset


In [2]:
# Cell 2: Configuration & Mapping
# Kon dataset er kon column ke 'headline' ar 'body' banabo, tar list

CONFIG = {
    # 21GB Monster File
    'bangla-largest': {'headline': 'headline', 'body': 'body'},
    
    # Common CSVs
    'bdnews24': {'headline': 'title', 'body': 'contents'},
    'bdpratidin': {'headline': 'title', 'body': 'article'},
    'potrika': {'headline': 'Headline', 'body': 'News Article'},
    
    # Furcifer (data.json)
    'data': {'headline': 'title', 'body': 'content'},
    
    # Folder-based CSVs (Sports, Politics etc.)
    'Politics': {'headline': 'Heading', 'body': 'News'},
    'International': {'headline': 'Heading', 'body': 'News'},
    'Sports': {'headline': 'Heading', 'body': 'News'},
    'Science': {'headline': 'Heading', 'body': 'News'},
    'National': {'headline': 'Heading', 'body': 'News'},
    'Entertainment': {'headline': 'Heading', 'body': 'News'},
    'Education': {'headline': 'Heading', 'body': 'News'},
    'Economy': {'headline': 'Heading', 'body': 'News'},
    
    # 40k Dataset (Headless)
    'BalancedDataset': {'headline': None, 'body': 'article'}, 
    '40k': {'headline': None, 'body': 'article'}
}

print("‚úÖ Configuration Loaded. Ready to standarize columns.")

‚úÖ Configuration Loaded. Ready to standarize columns.


In [3]:
# Cell 3: The Optimized Saver Function (CPU Safe)

def save_parquet(df, source_name, part_id):
    """ 
    1. Renames columns to 'headline' & 'body'
    2. Drops unnecessary columns (author, date, tags) to save space
    3. Saves as compressed Parquet file
    """
    # 1. Standardize Column Names
    rename_map = {}
    
    # Find mapping based on source name
    mapping = None
    for key in CONFIG:
        if key in source_name:
            mapping = CONFIG[key]
            break
    
    if not mapping:
        mapping = {'headline': 'title', 'body': 'content'} # Fallback default

    # Rename Headline
    if mapping['headline'] and mapping['headline'] in df.columns:
        rename_map[mapping['headline']] = 'headline'
    
    # Rename Body
    if mapping['body'] in df.columns:
        rename_map[mapping['body']] = 'body'
        
    df = df.rename(columns=rename_map)
    
    # Handle missing headline (e.g. for 40k dataset)
    if 'headline' not in df.columns:
        df['headline'] = '' 
        
    # CRITICAL: If body doesn't exist, skip this chunk
    if 'body' not in df.columns:
        return 

    # 2. Add Source & Select ONLY needed columns (Size Reduction Step)
    df['source'] = source_name
    try:
        df = df[['headline', 'body', 'source']].astype(str)
    except KeyError:
        return # Skip if columns missing
    
    # 3. Save Compressed
    safe_source = source_name.replace(" ", "_").replace("-", "_")
    filename = f"{OUTPUT_DIR}/{safe_source}_part_{part_id}.parquet"
    
    # 'snappy' compression is very fast and reduces size by 70-80%
    df.to_parquet(filename, index=False, compression='snappy')
    print(f"  ‚úÖ Saved: {filename} ({len(df)} rows)")
    
    # Free RAM immediately
    del df; gc.collect()

In [4]:
# Cell 4: Process Local Files (Chunk by Chunk to prevent RAM Crash)
print("üëâ Processing Local Kaggle Files...")

for root, dirs, files in os.walk('/kaggle/input'):
    folder_name = os.path.basename(root)
    
    for file in files:
        file_path = os.path.join(root, file)
        
        # Skip output files or checkpoints
        if file.endswith('.parquet') or 'checkpoint' in file: continue
        
        try:
            # A. HANDLE JSON FILES (Can be huge)
            if file.endswith('.json'):
                print(f"Reading JSON: {file} (Folder: {folder_name})...")
                try:
                    # Attempt to read standard JSON
                    df = pd.read_json(file_path)
                    save_parquet(df, folder_name, 0)
                except ValueError:
                    print(f"  ‚ö†Ô∏è JSON Error in {file} (Format or Size issue). Skipping.")
            
            # B. HANDLE CSV FILES (Read in Chunks)
            elif file.endswith('.csv'):
                print(f"Reading CSV: {file} (Folder: {folder_name})...")
                chunk_no = 0
                # 100k rows at a time = RAM Safe
                for chunk in pd.read_csv(file_path, chunksize=100000, on_bad_lines='skip'):
                    save_parquet(chunk, folder_name, chunk_no)
                    chunk_no += 1
            
            # C. HANDLE JSONL (Prothom Alo)
            elif file.endswith('.jsonl'):
                print(f"Reading JSONL: {file}...")
                data = []
                chunk_count = 0
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    for i, line in enumerate(f):
                        try: 
                            obj = json.loads(line)
                            # Prothom Alo Specific Mapping
                            data.append({'headline': obj.get('Title'), 'body': obj.get('Body')})
                            
                            # Save every 200k lines
                            if len(data) >= 200000:
                                save_parquet(pd.DataFrame(data), 'prothomalo', chunk_count)
                                data = []
                                chunk_count += 1
                        except: continue
                # Save remaining
                if data:
                    save_parquet(pd.DataFrame(data), 'prothomalo', chunk_count)

        except Exception as e:
            print(f"  ‚ùå Error processing {file}: {e}")
            continue

üëâ Processing Local Kaggle Files...
Reading JSONL: prothomalo_articles.jsonl...
  ‚úÖ Saved: swarabyanjan_final_dataset/prothomalo_part_0.parquet (200000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/prothomalo_part_1.parquet (6762 rows)
Reading CSV: bdnews24.csv (Folder: bdnews24-corpus)...
  ‚úÖ Saved: swarabyanjan_final_dataset/bdnews24_corpus_part_0.parquet (100000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/bdnews24_corpus_part_1.parquet (100000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/bdnews24_corpus_part_2.parquet (100000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/bdnews24_corpus_part_3.parquet (100000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/bdnews24_corpus_part_4.parquet (17808 rows)
Reading CSV: somoyer_alo_2020_politics_text.csv (Folder: Politics)...
  ‚úÖ Saved: swarabyanjan_final_dataset/Politics_part_0.parquet (646 rows)
Reading CSV: ittefaq_2019_2020_politics_text.csv (Folder: Politics)...
  ‚úÖ Saved: swarabyanjan_final_dataset/Politics_part_0.parquet (

In [5]:
# Cell 5: Process Hugging Face (Zabir Nabil)
print("\nüëâ Downloading & Processing Hugging Face (zabir-nabil/bangla_newspaper_dataset)...")

try:
    # Stream mode use korchi jate purota ekbare download na hoy
    ds = load_dataset("zabir-nabil/bangla_newspaper_dataset", split="train")
    
    # Convert to Pandas in Batches (RAM Safe)
    batch_size = 200000
    total_rows = len(ds)
    
    for i in range(0, total_rows, batch_size):
        # Get a slice
        df_batch = ds.select(range(i, min(i + batch_size, total_rows))).to_pandas()
        
        # Manual Mapping for HF
        df_batch = df_batch.rename(columns={'title': 'headline', 'content': 'body'})
        
        # Save
        save_parquet(df_batch, 'hf_zabir_nabil', i // batch_size)
        
        del df_batch; gc.collect()
        
    print("‚úÖ Hugging Face Data Processed Successfully!")

except Exception as e:
    print(f"  ‚ùå HF Error: {e}")

print(f"\nüéâ ALL DONE! All files are optimized and merged in '{OUTPUT_DIR}'")


üëâ Downloading & Processing Hugging Face (zabir-nabil/bangla_newspaper_dataset)...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/170M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/170M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/171M [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

data/test_1-00000-of-00001.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

data/test_2-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/265506 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/61274 [00:00<?, ? examples/s]

Generating test_1 split:   0%|          | 0/81691 [00:00<?, ? examples/s]

Generating test_2 split:   0%|          | 0/495 [00:00<?, ? examples/s]

  ‚úÖ Saved: swarabyanjan_final_dataset/hf_zabir_nabil_part_0.parquet (200000 rows)
  ‚úÖ Saved: swarabyanjan_final_dataset/hf_zabir_nabil_part_1.parquet (65506 rows)
‚úÖ Hugging Face Data Processed Successfully!

üéâ ALL DONE! All files are optimized and merged in 'swarabyanjan_final_dataset'
