Merge the before-election and after-election datasets

In [None]:
import pandas as pd
# Read the two exports: df1 from "Balen_election_month", df2 from "Balen_after".
df1 = pd.read_csv(filepath_or_buffer="Balen_election_month")
df2 = pd.read_csv(filepath_or_buffer="Balen_after")

# Print the column / dtype / non-null summary of df2.
df2.info()

In [None]:
# Tag each frame with its collection period so rows stay distinguishable
# once the two frames are stacked. The column is added to df1/df2 in place.
df1['time_period'] = 'before_election'
df2['time_period'] = 'after_election'

# Stack the rows of both frames; ignore_index=True renumbers them 0..n-1.
df_combined = pd.concat([df1, df2], ignore_index=True)

# Check for inconsistencies in column names or data types.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would also emit a stray "None".
df_combined.info()

Detect the language and script of each text

In [None]:
from langdetect import detect
import re

# Matches any code point in the Unicode Devanagari block (U+0900-U+097F).
_DEVANAGARI_CHARS = re.compile(r'[\u0900-\u097F]')
# Matches basic Latin letters, the alphabet used by romanized text.
_LATIN_LETTERS = re.compile(r'[a-zA-Z]')


def classify_script_langdetect(text):
    """Classify a text as 'English', 'Romanized Nepali' or 'Devanagari'.

    The script is decided from the characters themselves before any language
    detection runs: a text whose Devanagari characters are at least as
    numerous as its Latin letters is 'Devanagari'. This keeps Devanagari
    text from being mislabelled when the detector reports a language other
    than 'ne', or when the text merely contains a few Latin letters (a
    hashtag, a URL, a name). Everything else is handed to langdetect.

    NOTE(review): per the langdetect README, detection can vary between runs
    unless ``DetectorFactory.seed`` is set -- confirm whether reproducible
    labels are required for this dataset.

    Args:
        text: Cell value to classify; expected to be a string, but missing
            values (None / NaN) and other non-strings are tolerated.

    Returns:
        str: 'Devanagari', 'English' or 'Romanized Nepali'. 'Romanized
        Nepali' is also the fallback for non-string or blank input, for any
        other detected language, and for detection errors.
    """
    # Nothing to detect in a missing / non-string / blank value; use the
    # same fallback label the error path returns.
    if not isinstance(text, str) or not text.strip():
        return 'Romanized Nepali'

    devanagari_count = len(_DEVANAGARI_CHARS.findall(text))
    latin_count = len(_LATIN_LETTERS.findall(text))

    # Predominantly Devanagari characters: the script is known without
    # asking the language detector.
    if devanagari_count and devanagari_count >= latin_count:
        return 'Devanagari'

    try:
        lang = detect(text)
    except Exception:
        # Deliberate best-effort: any detection failure gets the fallback.
        return 'Romanized Nepali'

    if lang == 'en':
        return 'English'
    if lang == 'ne' and not latin_count:
        # Detected as Nepali and contains no Latin letters.
        return 'Devanagari'
    # Nepali written with Latin letters, or any other detected language.
    return 'Romanized Nepali'

# Label every row: run the classifier on each value of the 'text' column
# and store the returned label in a new 'script' column.
df_combined['script'] = df_combined['text'].apply(func=classify_script_langdetect)

# Check the result



Separate into three DataFrames: English, Romanized Nepali, and Devanagari

In [None]:
# Split the combined frame into one frame per 'script' label. Each boolean
# selection is followed by .copy() so the result is an independent frame:
# columns can then be added to it without pandas raising a
# SettingWithCopyWarning about writing to a slice of df_combined.
df_english = df_combined[df_combined['script'] == 'English'].copy()
df_romanized_nepali = df_combined[df_combined['script'] == 'Romanized Nepali'].copy()
df_devanagari = df_combined[df_combined['script'] == 'Devanagari'].copy()

# Bare expression: displays the Devanagari frame when run as a notebook cell.
df_devanagari

Transliterate Romanized Nepali into Devanagari

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
import time
import re

# Work on the Romanized Nepali subset. This binds a second name to the same
# DataFrame object (no copy is made), so columns added to df also appear on
# df_romanized_nepali.
df = df_romanized_nepali  




def roman_to_devanagari(text):
    """Convert Romanized Nepali text to Devanagari via Google Translate.

    Non-ASCII characters (emojis, symbols, any existing non-Latin script) are
    stripped before the request is made.

    NOTE(review): the call below is ``GoogleTranslator(source='en',
    target='ne').translate(...)``, i.e. an English-to-Nepali translation
    request rather than a dedicated transliteration -- confirm that its
    output is what the downstream analysis expects.

    Args:
        text: Cell value to convert; expected to be a string, but missing
            values (None / NaN / pd.NA) and other non-strings are tolerated.

    Returns:
        str: The translator's output, or "" when the input is missing or not
        a string, when nothing but whitespace remains after stripping
        non-ASCII characters, or when the translation request fails.
    """
    # Missing values and other non-string cells have nothing to translate.
    if not isinstance(text, str):
        return ""

    # Remove emojis and other non-ASCII characters that may cause issues
    # with the translation request.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)

    # Skip the network request entirely when nothing translatable is left
    # (empty, whitespace-only, or emoji-only input).
    if not ascii_only.strip():
        return ""

    try:
        return GoogleTranslator(source='en', target='ne').translate(ascii_only)
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the batch going.
        print(f"Error during translation: {e}")
        return ""

def estimate_translation_time(df, batch_size=10):
    """Time the transliteration of a small sample and extrapolate to all rows.

    Applies ``roman_to_devanagari`` to the 'text' column of the first
    ``batch_size`` rows, prints how long that took, and scales the elapsed
    time linearly to ``len(df)`` rows.

    Args:
        df: DataFrame with a 'text' column. It is not modified.
        batch_size: Maximum number of leading rows to sample.

    Returns:
        tuple: ``(df_batch, estimated_time)`` where ``df_batch`` is a copy of
        the sampled rows with a 'text_devanagari' column added, and
        ``estimated_time`` is the projected duration in seconds for all rows
        (0.0 when no rows were sampled).
    """
    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the caller's DataFrame.
    df_batch = df.head(batch_size).copy()

    # df may hold fewer rows than batch_size; extrapolate from the number of
    # rows that were actually timed.
    rows_timed = len(df_batch)

    start_time = time.time()
    df_batch['text_devanagari'] = df_batch['text'].apply(roman_to_devanagari)
    batch_time = time.time() - start_time
    print(f"Time taken for {rows_timed} rows: {batch_time:.2f} seconds")

    # Linear extrapolation to the full frame; an empty sample yields 0.0
    # instead of a division by zero.
    total_rows = len(df)
    if rows_timed:
        estimated_time = (total_rows / rows_timed) * batch_time
    else:
        estimated_time = 0.0
    estimated_minutes = estimated_time / 60  # Convert to minutes
    print(f"Estimated time for {total_rows} rows: {estimated_minutes:.2f} minutes")

    return df_batch, estimated_time

# Dry run: time the first 5 rows to get a rough runtime estimate before the
# full pass.
df_batch, estimated_time = estimate_translation_time(df, 5)

# Full pass: one roman_to_devanagari call per value of the 'text' column,
# stored in a new 'text_devanagari' column.
df['text_devanagari'] = df['text'].apply(func=roman_to_devanagari)

# Optionally, save the updated DataFrame back to a CSV file
# df.to_csv('transliterated_output.csv', index=False)

# Preview the source text next to its converted form (first 5 rows).
print(df[['text', 'text_devanagari']].head(5))
