In [1]:
import os
import re
import pandas as pd
from tqdm import tqdm

In [2]:
def preprocess_text(text, sentence_sep='. '):
    """
    Cleans and preprocesses the text data.

    Steps, in order:
      1. Remove every double-quote character.
      2. Collapse every run of whitespace (spaces, tabs, newlines) into a
         single space and trim both ends.
      3. Drop repeated sentences, keeping the first occurrence and the
         original order.

    Args:
        text: The raw string to clean.
        sentence_sep: Separator used to split the text into sentences and to
            join them again. Defaults to '. ', the separator this function
            has always used. Must not be empty (str.split rejects it).

    Returns:
        The cleaned string.
    """
    # Remove double quotes
    text = text.replace('"', '')

    # Collapse all whitespace runs to one space. After this step no tabs or
    # repeated spaces remain, so no second normalisation pass is needed.
    text = re.sub(r'\s+', ' ', text).strip()

    # NOTE(review): if the corpus is Nepali, sentences presumably end with the
    # danda (U+0964) rather than '.'; pass sentence_sep='\u0964 ' in that
    # case - confirm against the data.
    terminator = sentence_sep.strip()
    sentences = text.split(sentence_sep)

    # Splitting leaves the terminator attached to the final sentence only
    # ("A. B. A." -> ['A', 'B', 'A.']), which used to hide a duplicated last
    # sentence. Detach it for the comparison and re-attach it afterwards.
    ends_with_terminator = bool(terminator) and sentences[-1].endswith(terminator)
    if ends_with_terminator:
        sentences[-1] = sentences[-1][:-len(terminator)]

    # dict.fromkeys() removes duplicates while preserving first-seen order
    unique_sentences = list(dict.fromkeys(sentences))
    processed_text = sentence_sep.join(unique_sentences)

    if ends_with_terminator:
        processed_text += terminator

    return processed_text

In [3]:
def load_and_preprocess_data_by_category(directory, output_dir="../dataset/processed"):
    """
    Loads and preprocesses text data from a given directory and creates separate CSV files for each category.

    Every immediate sub-directory of ``directory`` is treated as one category.
    Each ``.txt`` file inside it is read as UTF-8, cleaned with
    ``preprocess_text`` and stored as one row of ``<output_dir>/<category>.csv``
    with the columns ``category`` and ``text``.

    Args:
        directory: Folder that contains one sub-folder per category.
        output_dir: Folder the CSV files are written to. It is created when
            missing. Defaults to the location this function has always used.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be
            read or written.
        UnicodeDecodeError: If a text file is not valid UTF-8.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the CSV row order) reproducible.
    categories = sorted(
        folder
        for folder in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, folder))
    )

    print(f"Found {len(categories)} categories: {categories}")

    # DataFrame.to_csv() does not create missing parent folders.
    os.makedirs(output_dir, exist_ok=True)

    # Traverse through each category folder
    for category in tqdm(categories, desc="Processing categories", unit="category"):
        category_path = os.path.join(directory, category)

        # Keep regular files only, so a folder named "*.txt" cannot break open().
        txt_files = sorted(
            f
            for f in os.listdir(category_path)
            if f.endswith(".txt") and os.path.isfile(os.path.join(category_path, f))
        )

        print(f" - Found {len(txt_files)} files in category '{category}'.\n")

        category_data = []

        for filename in tqdm(txt_files, desc=f"Processing files in {category}", unit="file", leave=False):
            file_path = os.path.join(category_path, filename)

            # Read file content
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Preprocess the content
            processed_content = preprocess_text(content)

            # Append to category data
            category_data.append({
                "category": category,
                "text": processed_content
            })

        # Explicit columns keep the CSV header even when a category is empty.
        df = pd.DataFrame(category_data, columns=["category", "text"])

        # Save to CSV
        output_file = os.path.join(output_dir, f"{category}.csv")
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Saved {len(category_data)} records to {output_file}")

In [4]:
# Root folder of the raw corpus, given relative to the notebook's working directory
dataset_directory = "../dataset/NepaliNewsDataset"

# Announce the run, process every category, then report completion
print("Starting data loading and preprocessing by category...")
load_and_preprocess_data_by_category(directory=dataset_directory)
print("All category data has been processed and saved to respective CSV files.")

Starting data loading and preprocessing by category...
Found 10 categories: ['ArthaBanijya', 'Bichar', 'Desh', 'Khelkud', 'Manoranjan', 'Prabas', 'Sahitya', 'SuchanaPrabidhi', 'Swasthya', 'Viswa']


Processing categories:   0%|          | 0/10 [00:00<?, ?category/s]

 - Found 1000 files in category 'ArthaBanijya'.



Processing categories:  10%|█         | 1/10 [00:00<00:03,  2.77category/s]

Saved 1000 records to ../dataset/processed/ArthaBanijya.csv
 - Found 1000 files in category 'Bichar'.



Processing categories:  20%|██        | 2/10 [00:01<00:04,  1.64category/s]

Saved 1000 records to ../dataset/processed/Bichar.csv
 - Found 1000 files in category 'Desh'.



Processing categories:  30%|███       | 3/10 [00:01<00:03,  2.29category/s]

Saved 1000 records to ../dataset/processed/Desh.csv
 - Found 1000 files in category 'Khelkud'.



Processing categories:  40%|████      | 4/10 [00:01<00:02,  2.64category/s]

Saved 1000 records to ../dataset/processed/Khelkud.csv
 - Found 1000 files in category 'Manoranjan'.



Processing categories:  50%|█████     | 5/10 [00:01<00:01,  2.91category/s]

Saved 1000 records to ../dataset/processed/Manoranjan.csv
 - Found 1000 files in category 'Prabas'.



Processing categories:  60%|██████    | 6/10 [00:02<00:01,  3.09category/s]

Saved 1000 records to ../dataset/processed/Prabas.csv
 - Found 1000 files in category 'Sahitya'.



Processing categories:  70%|███████   | 7/10 [00:02<00:00,  3.09category/s]

Saved 1000 records to ../dataset/processed/Sahitya.csv
 - Found 1000 files in category 'SuchanaPrabidhi'.



Processing categories:  80%|████████  | 8/10 [00:02<00:00,  3.16category/s]

Saved 1000 records to ../dataset/processed/SuchanaPrabidhi.csv
 - Found 1000 files in category 'Swasthya'.



Processing categories:  90%|█████████ | 9/10 [00:03<00:00,  3.07category/s]

Saved 1000 records to ../dataset/processed/Swasthya.csv
 - Found 1000 files in category 'Viswa'.



Processing categories: 100%|██████████| 10/10 [00:03<00:00,  2.88category/s]

Saved 1000 records to ../dataset/processed/Viswa.csv
All category data has been processed and saved to respective CSV files.



