In [48]:
# Dictionary mapping Unicode Nepali characters to Preeti font characters
unicodeToPreetiDict = {
    "अ": "c", "आ": "cf", "ा": "f", "इ": "O", "ई": "O{", "र्": "{",
    "उ": "p", "ए": "P", "े": "]", "ै": "}", "ो": "f]", "ौ": "f}",
    "ओ": "cf]", "औ": "cf}", "ं": "+", "ँ": "F", "ि": "l", "ी": "L",
    "ु": "'", "ू": '"', "क": "s", "ख": "v", "ग": "u", "घ": "3",
    "ङ": "ª", "च": "r", "छ": "5", "ज": "h", "झ": "´", "ञ": "`",
    "ट": "6", "ठ": "7", "ड": "8", "ढ": "9", "ण": "0f", "त": "t",
    "थ": "y", "द": "b", "ध": "w", "न": "g", "प": "k", "फ": "km",
    "ब": "a", "भ": "e", "म": "d", "य": "o", "र": "/", "रू": "?",
    "ृ": "[", "ल": "n", "व": "j", "स": ";", "श": "z", "ष": "if",
    "ज्ञ": "1", "ह": "x", "१": "!", "२": "@", "३": "#", "४": "$",
    "५": "%", "६": "^", "७": "&", "८": "*", "९": "(", "०": ")",
    "।": ".", "्": "\\", "ऊ": "pm", "-": " ", "(": "-", ")": "_"
}

def normalize_unicode(unicode_text):
    """
    Normalizes unicode text to handle special cases in Nepali text
    before converting to Preeti.
    """
    index = -1
    normalized = ''
    
    while index + 1 < len(unicode_text):
        index += 1
        character = unicode_text[index]
        
        try:
            try:
                if character != 'र':
                    if (index + 2 < len(unicode_text) and 
                        unicode_text[index+1] == '्' and 
                        unicode_text[index+2] not in [' ', '।', ',']):
                        
                        if unicode_text[index+2] != 'र':
                            if unicodeToPreetiDict[character] in list('wertyuxasdghjkzvn'):
                                normalized += chr(ord(unicodeToPreetiDict[character]) - 32)
                                index += 1
                                continue
                            elif character == 'स':
                                normalized += ':'
                                index += 1
                                continue
                            elif character == 'ष':
                                normalized += 'i'
                                index += 1
                                continue
                                
                if (index > 0 and unicode_text[index-1] != 'र' and 
                    character == '्' and 
                    index + 1 < len(unicode_text) and 
                    unicode_text[index+1] == 'र'):
                    
                    if unicode_text[index-1] not in ['ट', 'ठ', 'ड']:
                        normalized += '|'  # for sign as in क्रम
                    else:
                        normalized += '«'  # for sign as in ट्रक
                    index += 1
                    continue
                    
            except IndexError:
                pass
            
            normalized += character
            
        except KeyError:
            normalized += character
    
    # Replace specific combinations
    normalized = normalized.replace('त|', 'q')  # for त्र
    return normalized

def convert_to_preeti(normalized_unicode):
    """
    Converts normalized unicode text to Preeti font format.
    """
    converted = ''
    index = -1
    
    while index + 1 < len(normalized_unicode):
        index += 1
        character = normalized_unicode[index]
        
        # Skip BOM character if present
        if character == '\ufeff':
            continue
            
        try:
            try:
                # Handle special cases for 'ि' (hraswo ukaar)
                if index + 1 < len(normalized_unicode) and normalized_unicode[index+1] == 'ि':
                    if character == 'q':
                        converted += 'l' + character
                    else:
                        converted += 'l' + unicodeToPreetiDict[character]
                    index += 1
                    continue
                
                # Handle constructs like त्ति
                if (index + 2 < len(normalized_unicode) and 
                    normalized_unicode[index+2] == 'ि'):
                    if character in list('WERTYUXASDGHJK:ZVN'):
                        if normalized_unicode[index+1] != 'q':
                            converted += 'l' + character + unicodeToPreetiDict[normalized_unicode[index+1]]
                            index += 2
                            continue
                        else:
                            converted += 'l' + character + normalized_unicode[index+1]
                            index += 2
                            continue
                
                # Handle reph as in वार्ता
                if (index + 1 < len(normalized_unicode) and 
                    normalized_unicode[index+1] == '्' and 
                    character == 'र'):
                    
                    if (index + 3 < len(normalized_unicode) and 
                        normalized_unicode[index+3] in ['ा', 'ो', 'ौ', 'े', 'ै', 'ी']):
                        
                        converted += (unicodeToPreetiDict[normalized_unicode[index+2]] + 
                                      unicodeToPreetiDict[normalized_unicode[index+3]] + '{')
                        index += 3
                        continue
                        
                    elif index + 3 < len(normalized_unicode) and normalized_unicode[index+3] == 'ि':
                        converted += (unicodeToPreetiDict[normalized_unicode[index+3]] + 
                                      unicodeToPreetiDict[normalized_unicode[index+2]] + '{')
                        index += 3
                        continue
                        
                    converted += unicodeToPreetiDict[normalized_unicode[index+2]] + '{'
                    index += 2
                    continue
                
                # Handle the likes of ष्ट्रिय
                if (index + 3 < len(normalized_unicode) and 
                    normalized_unicode[index+3] == 'ि' and 
                    (normalized_unicode[index+2] == '|' or normalized_unicode[index+2] == '«')):
                    
                    if character in list('WERTYUXASDGHJK:ZVNIi'):
                        converted += ('l' + character + 
                                      unicodeToPreetiDict[normalized_unicode[index+1]] + 
                                      normalized_unicode[index+2])
                        index += 3
                        continue
                
            except IndexError:
                pass
            
            converted += unicodeToPreetiDict[character]
            
        except KeyError:
            converted += character
    
    # Replace specific combinations with their Preeti equivalents
    replacements = [
        ('Si', 'I'),        # Si in preeti is aadha ka aadha ष, so replace with I which is aadha क्ष
        ('H`', '1'),        # H` is the product of composite nature of unicode ज्ञ
        ('b\\w', '4'),      # b\w means in preeti द halanta ध, so replace the composite
        ('z|', '>'),        # composite for श्र
        ("/'", '?'),        # composite for रु
        ('/"', '¿'),        # composite for रू
        ('Tt', 'Q'),        # composite for त्त
        ('b\\lj', 'lå'),    # composite for द्वि
        ('b\\j', 'å'),      # composite for द्व
        ('0f\\\\', '0'),    # composite for ण् to get the aadha ण in say गण्डक
        ('`\\\\', '~')      # composite for aadha ञ्
    ]
    
    for old, new in replacements:
        converted = converted.replace(old, new)
        
    return converted

def convert_unicode_to_preeti(input_file, output_file):
    """
    Read Unicode Nepali words from input CSV and convert to Preeti
    """
    import pandas as pd
    
    # Read the input CSV file
    df = pd.read_csv(input_file, encoding='utf-8')
    
    # Prepare output data
    output_data = []
    
    # Convert each word
    for word in df['unicode']:
        # Normalize and convert the word
        normalized_word = normalize_unicode(word)
        preeti_word = convert_to_preeti(normalized_word)
        
        # Add to output data
        output_data.append([word, preeti_word])
    
    # Create output DataFrame
    output_df = pd.DataFrame(output_data, columns=['Unicode Word', 'Preeti Conversion'])
    
    # Save to CSV
    output_df.to_csv(output_file, index=False, encoding='utf-8')
    
    print(f"Conversion complete. Output saved to {output_file}")

# Example usage
input_file = 'output_short_phrases.csv'
output_file = 'phrase_dataset_unicode_to_preeti.csv'

# Ensure your input CSV has a column named 'Word'
# Example input CSV content:
# Word
# नमस्ते
# धन्यवाद
# स्वागत
# माफ गर्नुहोस्
# कृपया

convert_unicode_to_preeti(input_file, output_file)

Conversion complete. Output saved to phrase_dataset_unicode_to_preeti.csv


In [42]:
import pandas as pd

input_csv = 'output_words.csv'
df = pd.read_csv(input_csv, encoding='utf-8')

# Display the first few rows
print(df.head())

# Get the total number of elements (rows × columns)
print("Total elements (rows × columns):", df.size)

# If you want number of rows instead, use:
print("Total number of rows:", len(df))

  Word
0    अ
1    आ
2    इ
3    ई
4    उ
Total elements (rows × columns): 761029
Total number of rows: 761029


In [44]:
import pandas as pd

def extract_words(input_file, input_column, output_file, max_words=20000):
    df = pd.read_csv(input_file, encoding='utf-8')
    
    # Flatten all words from the sentences
    all_words = []
    for sentence in df[input_column].dropna():
        all_words.extend(sentence.split())
        if len(all_words) >= max_words:
            break
    
    # Trim to exactly 5000
    all_words = all_words[:max_words]
    
    # Create DataFrame
    words_df = pd.DataFrame(all_words, columns=['unicode'])
    
    # Save to CSV
    words_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Saved {len(words_df)} words to '{output_file}'")

# Example usage
extract_words('unicode_preeti_dataset.csv', 'unicode', 'unicode_words_5000.csv')

Saved 20000 words to 'unicode_words_5000.csv'


In [45]:
import pandas as pd

# Read the input CSV (only one column: 'unicode')
df = pd.read_csv('unicode_words_5000.csv', encoding='utf-8')

# Drop NaN and convert to list
words = df['unicode'].dropna().tolist()

# Convert using your function
converted_words = convert_word_list(words)


print(converted_words[:5])

[('अ', 'c'), ('आ', 'cf'), ('इ', 'O'), ('ई', 'O{'), ('उ', 'p')]


In [38]:

# Save to new CSV
output_df = pd.DataFrame({'preeti': converted_words})
output_df.to_csv('converted_preeti_words.csv', index=False, encoding='utf-8')

print("Saved converted words to 'converted_preeti_words.csv'")

Saved converted words to 'converted_preeti_words.csv'


In [11]:
import os
import glob
import csv

def read_and_convert_file(file_path):
    """
    Read a single text file and convert its content to Preeti
    
    Args:
        file_path (str): Path to the input text file
    
    Returns:
        list: List of converted words/lines
    """
    try:
        # Read the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().strip()
            
        # Split content into lines or words
        lines = content.split('\n')
        
        # Convert each line
        converted_lines = []
        for line in lines:
            # Normalize and convert the line
            normalized_line = normalize_unicode(line)
            preeti_line = convert_to_preeti(normalized_line)
            converted_lines.append((line, preeti_line))
        
        return converted_lines
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return []

def batch_convert_files(input_folder, output_csv):
    """
    Convert all .txt files in a folder to Preeti and save to CSV
    
    Args:
        input_folder (str): Path to folder containing .txt files
        output_csv (str): Path to save the output CSV file
    """
    # Find all .txt files in the input folder
    txt_files = glob.glob(os.path.join(input_folder, '*.txt'))
    
    # Prepare CSV for writing
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Write CSV header
        csv_writer.writerow(['File Name', 'Original Text', 'Preeti Text'])
        
        # Process each file
        for file_path in txt_files:
            # Get just the filename
            filename = os.path.basename(file_path)
            
            # Convert file contents
            converted_lines = read_and_convert_file(file_path)
            
            # Write converted lines to CSV
            for original, preeti in converted_lines:
                csv_writer.writerow([filename, original, preeti])
            
            print(f"Processed: {filename}")
    
    print(f"\nConversion complete! Results saved to {output_csv}")

# Interactive usage in Jupyter or Python script
def main():
    # Prompt for input folder and output file
    input_folder = input("Enter the path to the folder containing .txt files: ")
    output_csv = input("Enter the path to save the output CSV file: ")
    
    # Validate input folder
    if not os.path.isdir(input_folder):
        print("Error: Invalid folder path")
        return
    
    # Perform batch conversion
    batch_convert_files(input_folder, output_csv)

# For Jupyter Lab with widgets
def create_conversion_widget():
    import ipywidgets as widgets
    from IPython.display import display
    
    # Input folder selection
    input_folder_widget = widgets.Text(
        value='',
        placeholder='Enter path to input folder',
        description='Input Folder:',
        style={'description_width': 'initial'},
        layout={'width': '500px'}
    )
    
    # Output CSV file selection
    output_file_widget = widgets.Text(
        value='',
        placeholder='Enter path to output CSV file',
        description='Output CSV:',
        style={'description_width': 'initial'},
        layout={'width': '500px'}
    )
    
    # Status output
    status_widget = widgets.Output()
    
    def on_convert_clicked(b):
        with status_widget:
            status_widget.clear_output()
            try:
                print("Starting conversion...")
                batch_convert_files(input_folder_widget.value, output_file_widget.value)
                print("Conversion completed successfully!")
            except Exception as e:
                print(f"Error during conversion: {e}")
    
    convert_button = widgets.Button(description="Convert Files")
    convert_button.on_click(on_convert_clicked)
    
    # Display widgets
    display(widgets.HTML("<h2>Batch Unicode to Preeti Converter</h2>"))
    display(input_folder_widget)
    display(output_file_widget)
    display(convert_button)
    display(status_widget)

# Run methods based on environment
if __name__ == "__main__":
    main()

Enter the path to the folder containing .txt files:  /Users/sanjokdangol/Downloads/archive/SuchanaPrabidhi/SuchanaPrabidhi
Enter the path to save the output CSV file:  infotech.csv


Processed: 289.txt
Processed: 504.txt
Processed: 262.txt
Processed: 276.txt
Processed: 510.txt
Processed: 538.txt
Processed: 909.txt
Processed: 921.txt
Processed: 935.txt
Processed: 706.txt
Processed: 712.txt
Processed: 29.txt
Processed: 869.txt
Processed: 15.txt
Processed: 855.txt
Processed: 699.txt
Processed: 841.txt
Processed: 114.txt
Processed: 672.txt
Processed: 666.txt
Processed: 100.txt
Processed: 128.txt
Processed: 896.txt
Processed: 882.txt
Processed: 470.txt
Processed: 316.txt
Processed: 302.txt
Processed: 464.txt
Processed: 458.txt
Processed: 459.txt
Processed: 303.txt
Processed: 465.txt
Processed: 471.txt
Processed: 317.txt
Processed: 883.txt
Processed: 129.txt
Processed: 897.txt
Processed: 667.txt
Processed: 101.txt
Processed: 115.txt
Processed: 673.txt
Processed: 840.txt
Processed: 698.txt
Processed: 854.txt
Processed: 14.txt
Processed: 868.txt
Processed: 28.txt
Processed: 713.txt
Processed: 707.txt
Processed: 934.txt
Processed: 920.txt
Processed: 908.txt
Processed: 539.t

In [None]:
# Clean data remove empty

In [14]:
import pandas as pd
import numpy as np
import re

def clean_csv_file(input_csv, output_csv=None, 
                   original_text_column='unicode', 
                   preeti_text_column='preeti'):
    """
    Clean CSV file by:
    1. Removing rows with empty, null, or specific content
    2. Removing leading tabs from original and preeti text columns
    
    Parameters:
    -----------
    input_csv : str
        Path to the input CSV file
    output_csv : str, optional
        Path to save the cleaned CSV file. 
        If None, will overwrite the input file
    original_text_column : str, optional
        Name of the original text column
    preeti_text_column : str, optional
        Name of the preeti text column
    
    Returns:
    --------
    pandas.DataFrame
        Cleaned DataFrame
    """
    try:
        # Read the CSV file
        df = pd.read_csv(input_csv, encoding='utf-8')
        
        # Print initial information
        print(f"Initial DataFrame Shape: {df.shape}")
        
        # Function to check if a value is considered empty
        def is_empty(value):
            # Check for various types of "emptiness"
            if pd.isna(value):  # Checks for NaN, None
                return True
            if isinstance(value, str):
                # Check for empty string, string with only spaces, or specific empty markers
                return (value.strip() == '' or 
                        value.lower() in ['', 'nan', 'null', '"', "'", '""'] or
                        len(value.strip()) <= 2)
            return False
        
        # Remove rows where specified column is empty
        df_cleaned = df[~df[original_text_column].apply(is_empty)]
        
        # Function to remove leading tabs and whitespaces
        def remove_leading_tabs_and_spaces(text):
            if isinstance(text, str):
                # Remove leading tabs and whitespaces
                return text.lstrip('\t ')
            return text
        
        # Apply tab removal to both original and preeti text columns
        df_cleaned[original_text_column] = df_cleaned[original_text_column].apply(remove_leading_tabs_and_spaces)
        df_cleaned[preeti_text_column] = df_cleaned[preeti_text_column].apply(remove_leading_tabs_and_spaces)
        
        # Print information after cleaning
        print(f"Cleaned DataFrame Shape: {df_cleaned.shape}")
        print(f"Rows Removed: {df.shape[0] - df_cleaned.shape[0]}")
        
        # Save the cleaned DataFrame
        if output_csv is None:
            output_csv = input_csv  # Overwrite input file
        
        df_cleaned.to_csv(output_csv, index=False, encoding='utf-8')
        print(f"Cleaned CSV saved to: {output_csv}")
        
        return df_cleaned
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Comprehensive analysis function
def analyze_cleaned_data(df, 
                          original_text_column='Original Text', 
                          preeti_text_column='Preeti Text'):
    """
    Perform detailed analysis on the cleaned DataFrame
    """
    print("\n--- Data Cleaning Analysis ---")
    print(f"Total Rows: {len(df)}")
    
    # Original Text Analysis
    print("\n--- Original Text Analysis ---")
    print("Original Text Length Distribution:")
    df['Original Text Length'] = df[original_text_column].str.len()
    print(df['Original Text Length'].describe())
    
    # Preeti Text Analysis
    print("\n--- Preeti Text Analysis ---")
    print("Preeti Text Length Distribution:")
    df['Preeti Text Length'] = df[preeti_text_column].str.len()
    print(df['Preeti Text Length'].describe())
    
    # Unique value analysis
    print("\nUnique Values:")
    print(f"Unique Original Texts: {df[original_text_column].nunique()}")
    print(f"Unique Preeti Texts: {df[preeti_text_column].nunique()}")
    
    # Check for remaining tabs
    def contains_tab(text):
        return '\t' in str(text)
    
    # Check tabs in both columns
    original_text_tabs = df[original_text_column].apply(contains_tab).sum()
    preeti_text_tabs = df[preeti_text_column].apply(contains_tab).sum()
    
    print("\n--- Tab Character Check ---")
    print(f"Rows with Tab in Original Text: {original_text_tabs}")
    print(f"Rows with Tab in Preeti Text: {preeti_text_tabs}")

# Optional extra analysis (uncomment if needed)
def additional_text_analysis(df, 
                              original_text_column='Original Text', 
                              preeti_text_column='Preeti Text'):
    """
    Perform additional detailed text analysis
    """
    print("\n--- Advanced Text Analysis ---")
    
    # Check for special characters
    def count_special_chars(text):
        return len(re.findall(r'[^a-zA-Z0-9\s]', str(text)))
    
    df['Original Text Special Chars'] = df[original_text_column].apply(count_special_chars)
    df['Preeti Text Special Chars'] = df[preeti_text_column].apply(count_special_chars)
    
    print("Special Characters Distribution:")
    print("Original Text Special Chars:")
    print(df['Original Text Special Chars'].describe())
    print("\nPreeti Text Special Chars:")
    print(df['Preeti Text Special Chars'].describe())

# Example usage in Jupyter Lab
# Specify your input and output file paths
input_file = 'infotech.csv'
output_file = 'infotech_cleaned.csv'

# Clean the CSV file
cleaned_df = clean_csv_file(input_file, output_file)

# Perform analysis if cleaning is successful
if cleaned_df is not None:
    # Display first few rows
    print("\nFirst few rows of cleaned DataFrame:")
    display(cleaned_df.head())
    
    # Perform comprehensive analysis
    analyze_cleaned_data(cleaned_df)
    
    # Optional: Run additional text analysis
    additional_text_analysis(cleaned_df)

# Optional: Print column names to verify
print("\nColumns in the cleaned DataFrame:")
print(cleaned_df.columns.tolist())

Initial DataFrame Shape: (8536, 3)
Cleaned DataFrame Shape: (8536, 3)
Rows Removed: 0
Cleaned CSV saved to: infotech_cleaned.csv

First few rows of cleaned DataFrame:


Unnamed: 0,File Name,Original Text,Preeti Text
0,289.txt,कम्प्युटरबाट तेस्रो विश्वयुद्धअमेरिकाले तेस्रो...,sDKo'6/af6 t];|f] ljZjo'4cd]l/sfn] t];|f] ljZj...
1,289.txt,अमेरिकाले तेस्रो विश्वयुद्ध जमिनमा नभएर इन्टरन...,cd]l/sfn] t];|f] ljZjo'4 hldgdf geP/ OG6/g]6 ;...
2,289.txt,अमेरिकामा साइबर आक्रमण जारी रहेको उनले बताए प...,cd]l/sfdf ;fOa/ cfs|d0f hf/L /x]sf] pgn] atfP ...
3,504.txt,सामाजिक सञ्जालबाट अपराधफेसबुक र टि्वटरजस्ता सम...,;fdflhs ;`\hfnaf6 ck/fwkm];a's / l6\j6/h:tf ;d...
4,504.txt,फेसबुक र टि्वटरजस्ता समाजिक सञ्जालसँग जोडिएका ...,km];a's / l6\j6/h:tf ;dflhs ;`\hfn;Fu hf]l8Psf...



--- Data Cleaning Analysis ---
Total Rows: 8536

--- Original Text Analysis ---
Original Text Length Distribution:
count    8536.000000
mean      282.691073
std       213.929541
min         4.000000
25%       171.000000
50%       262.000000
75%       362.000000
max      8873.000000
Name: Original Text Length, dtype: float64

--- Preeti Text Analysis ---
Preeti Text Length Distribution:
count    8536.000000
mean      278.147376
std       209.460347
min         4.000000
25%       169.000000
50%       259.000000
75%       355.000000
max      8551.000000
Name: Preeti Text Length, dtype: float64

Unique Values:
Unique Original Texts: 8380
Unique Preeti Texts: 8378

--- Tab Character Check ---
Rows with Tab in Original Text: 2
Rows with Tab in Preeti Text: 2

--- Advanced Text Analysis ---
Special Characters Distribution:
Original Text Special Chars:
count    8536.000000
mean      237.032099
std       179.778375
min         4.000000
25%       144.000000
50%       220.000000
75%       303.00

In [16]:
import pandas as pd
import glob
import os

# Folder path where your CSVs are located
folder_path = 'rawdata/'

# Get all CSV file paths
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Read and concatenate all CSVs, dropping the "File Name" column if it exists
merged_df = pd.concat(
    [pd.read_csv(f).drop(columns=["File Name"], errors='ignore') for f in csv_files],
    ignore_index=True
)

# Save to a new merged CSV file
merged_df.to_csv('unicode_preeti_dataset.csv', index=False)

print(f"Merged {len(csv_files)} CSV files into 'merged_output.csv' without 'File Name' column.")

Merged 2 CSV files into 'merged_output.csv' without 'File Name' column.
