# In [3]:
import pandas as pd
import langid

def classify_language(text):
    """Return the language code langid assigns to *text*.

    Args:
        text: A single cell value, normally a string. Missing values
            (anything ``pd.isna`` reports as missing, e.g. ``None`` or NaN)
            are accepted.

    Returns:
        ``'na'`` for missing values; otherwise the first element of the
        tuple returned by ``langid.classify``.
    """
    # Missing cells cannot be classified; report them with a sentinel label.
    if pd.isna(text):
        return 'na'

    # A column can hold non-string cells (e.g. numbers). Stringify them so
    # the classifier always receives text instead of failing on the value.
    # NOTE(review): assumes langid.classify only accepts str/bytes - confirm.
    if not isinstance(text, (str, bytes)):
        text = str(text)

    # langid.classify returns a pair; only the language label is needed here.
    lang, _ = langid.classify(text)
    return lang

def filter_malayalam_and_mixed(df):
    """Return the rows of *df* whose ``commentText`` is classified as Malayalam.

    Each value in the ``commentText`` column is passed to
    ``classify_language``; rows labelled ``'ml'`` or ``'ml-en'`` are kept.

    The input frame is not modified: the language labels live in a
    temporary Series rather than in a column added to *df*.

    Args:
        df: DataFrame containing a ``commentText`` column.

    Returns:
        A new DataFrame holding only the matching rows, with the same
        columns as *df*.

    Raises:
        KeyError: If *df* has no ``commentText`` column.
    """
    # Classify every comment without writing a helper column into the
    # caller's frame.
    languages = df['commentText'].apply(classify_language)

    # NOTE(review): 'ml-en' is kept for compatibility with the original
    # filter. langid presumably reports single language codes only, so this
    # label may never occur - confirm against the langid documentation.
    keep = languages.isin(['ml', 'ml-en'])

    # .copy() hands the caller an independent frame.
    return df[keep].copy()

def save_to_csv(df, output_path):
    """Write *df* to *output_path* as a UTF-8 encoded CSV without the index."""
    df.to_csv(
        path_or_buf=output_path,
        encoding='utf-8',
        index=False,
    )

# Example usage: read the comments dataset, keep only the rows selected by
# filter_malayalam_and_mixed, and write them to a new CSV file.
# Both paths are relative, so they resolve against the current working directory.
input_dataset_path = 'comments1.csv'
output_csv_path = 'output_malayalam_mixed.csv'

# Read the dataset into a DataFrame
df = pd.read_csv(input_dataset_path, low_memory=False)  # low_memory=False is set to avoid pandas' mixed-type warnings while reading

# Keep only the rows classified as Malayalam or Malayalam-English mixed
filtered_df = filter_malayalam_and_mixed(df)

# Save the filtered dataframe to a new CSV file (UTF-8, no index column)
save_to_csv(filtered_df, output_csv_path)

# Report where the result was written
print(f"Malayalam and Malayalam-English mixed data saved to {output_csv_path}.")


# Output: Malayalam and Malayalam-English mixed data saved to output_malayalam_mixed.csv.
