In [None]:
import os
import csv
from collections import Counter

# Folder holding the Nepali corpus; every file ending in ".txt" is read as UTF-8.
folder_path = "data/corpus"

# Running word -> frequency tally across all files.
word_counter = Counter()

# os.listdir() returns entries in arbitrary order, and Counter.most_common()
# orders equal counts by first appearance. Sorting the file names keeps the
# row order of unigram.csv reproducible from run to run and machine to machine.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines). Nothing else is stripped, so punctuation that touches a
    # word stays part of that token.
    tokens = text.split()
    word_counter.update(tokens)

# Save to unigram.csv, most frequent word first.
with open("unigram.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Word", "Count"])  # header
    for word, count in word_counter.most_common():
        writer.writerow([word, count])

print("✅ unigram.csv created successfully!")

In [None]:
import os

def count_characters_in_folder(folder_path, include_spaces=True):
    """Print per-file and total character counts for the ``.txt`` files in a folder.

    Only the immediate entries of ``folder_path`` whose names end in ``.txt``
    are considered. Each file is read as UTF-8 and measured with ``len()``,
    i.e. in Unicode code points, so a Devanagari base letter and each of its
    combining signs count separately.

    Args:
        folder_path: Directory to scan (not recursive).
        include_spaces: When True, count every character in the file. When
            False, drop all whitespace (spaces, tabs, newlines and any other
            character ``str.split()`` treats as whitespace, e.g. no-break
            space) before counting.

    Returns:
        None. The report is written to stdout.

    Raises:
        OSError: If ``folder_path`` itself cannot be listed. Files that cannot
            be read or decoded are reported on stdout and skipped instead.
    """
    total_characters = 0
    file_character_counts = {}

    # os.listdir() order is arbitrary; sort so the report is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {e}")
            continue

        if not include_spaces:
            # split() with no argument cuts on every whitespace run, so
            # re-joining the pieces removes all whitespace, not only the
            # space, newline and tab characters.
            content = "".join(content.split())

        char_count = len(content)
        file_character_counts[filename] = char_count
        total_characters += char_count

    print("Character count per file:")
    for file, count in file_character_counts.items():
        print(f"{file}: {count} characters")

    print(f"\nTotal characters across all .txt files: {total_characters}")

# Demo run: set folder_path to the directory that holds your .txt files.
# include_spaces=True counts every character, whitespace included.
folder_path = "data/corpus"
count_characters_in_folder(folder_path=folder_path, include_spaces=True)

In [None]:
import os
import csv
from collections import Counter

folder_path = "data/corpus"

# Read every .txt file into memory first. The file names are sorted because
# os.listdir() returns them in arbitrary order, and Counter.most_common()
# orders equal counts by first appearance -- without sorting, the row order
# of unigram.csv could differ between runs or machines.
texts = []
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            texts.append(f.read())

# Join with a space so the last word of one file cannot fuse with the first
# word of the next, then tokenize in a single pass.
# NOTE: texts, all_text and tokens each hold the whole corpus in memory.
all_text = " ".join(texts)
tokens = all_text.split()  # splits on any whitespace run, not only spaces

# Count words
word_counter = Counter(tokens)

# Write to CSV, most frequent word first.
with open("unigram.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Word", "Count"])
    writer.writerows(word_counter.most_common())

print("✅ unigram.csv created successfully!")

In [None]:
import csv

input_file = "unigram.csv"
output_file = "unigram_sorted.csv"

# Read CSV. newline="" hands raw line endings to the csv module, which is how
# its documentation says files for csv.reader should be opened.
with open(input_file, "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    header = next(reader)  # keep header
    # csv.reader yields [] for a blank line; skip those so the sort key below
    # cannot raise IndexError.
    rows = [row for row in reader if row]

# Sort by word (column 0). Strings compare by Unicode code point, so this is
# code-point order, not locale-aware dictionary order.
rows.sort(key=lambda x: x[0])

# Write sorted CSV
with open(output_file, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print("✅ unigram_sorted.csv created (sorted by Word).")


In [None]:
import pandas as pd
import re
from collections import Counter

# Read the CSV file
df = pd.read_csv('corpus.csv')

# Concatenate all content into a single string.
# .astype(str) turns non-string cells (e.g. missing values, which become the
# ASCII text "nan") into strings so the join cannot fail; such ASCII text is
# ignored by the Devanagari-only pattern below.
all_text = ' '.join(df['content'].astype(str))

# A token is a run of Devanagari characters (U+0900-U+097F) excluding both
# sentence-ending marks: danda '।' (U+0964) and double danda '॥' (U+0965).
# Zero-width non-joiner / joiner (U+200C / U+200D) are allowed *inside* a
# token, because Nepali spellings such as "गर्‍यो" contain them and would
# otherwise be cut into two pieces. Joiners at the start or end of a run are
# not part of the token.
word_pattern = re.compile(
    r'[\u0900-\u0963\u0966-\u097F]+'
    r'(?:[\u200C\u200D]+[\u0900-\u0963\u0966-\u097F]+)*'
)
tokens = word_pattern.findall(all_text)

# Count words
word_counter = Counter(tokens)

# Convert the counter to a DataFrame, most frequent word first.
word_counts_df = pd.DataFrame(word_counter.most_common(), columns=['Word', 'Count'])

# Write to CSV
word_counts_df.to_csv("unigram_re.csv", index=False, encoding="utf-8")

print("✅ unigram_re.csv created successfully!")

In [None]:
import pandas as pd

# Load the Word/Count table from unigram_re.csv.
df = pd.read_csv("unigram_re.csv")

# Order the rows by the 'Word' column, ascending; the result is a new frame
# and df itself is left as loaded.
df_sorted = df.sort_values("Word")

# Write the ordered table to its own file, without the index column.
df_sorted.to_csv("unigram_re_sorted.csv", encoding="utf-8", index=False)

print("✅ unigram_re_sorted.csv created successfully!")