In [1]:
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate as indic_transliterate

def is_english_with_malayalam_meaning(text):
    """
    Check if a given English text has a meaningful transliteration in Malayalam.

    Parameters:
    - text (str): The input English text.

    Returns:
    - bool: True if there is a meaningful transliteration in Malayalam, False otherwise.
    """
    # Transliterate the text to Malayalam
    malayalam_transliteration = indic_transliterate(text, sanscript.ITRANS, sanscript.MALAYALAM)

    # Check if the original text and transliterated text are different
    return text.lower() != malayalam_transliteration.lower()

def filter_english_with_malayalam_meaning(dataset_path, output_path):
    """
    Filter entries from a dataset, keeping only those with Malayalam meanings.

    Parameters:
    - dataset_path (str): Path to the input CSV dataset.
    - output_path (str): Path to save the filtered CSV dataset.
    """
    # Read the dataset into a Pandas DataFrame
    df = pd.read_csv(dataset_path)

    # Filter the DataFrame to include only entries with Malayalam meanings
    english_with_malayalam_meaning_df = df[df['commentText'].apply(is_english_with_malayalam_meaning)]

    # Save the filtered DataFrame to a new CSV file
    english_with_malayalam_meaning_df.to_csv(output_path, index=False)

if __name__ == "__main__":
    dataset_path = "english_entries.csv"  # Update with the actual path to your dataset
    output_path = "english_with_malayalam_meaning.csv"  # Update with the desired output path

    filter_english_with_malayalam_meaning(dataset_path, output_path)

  df = pd.read_csv(dataset_path)


TypeError: object of type 'float' has no len()