In [1]:
import pandas as pd

# Read merged.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv("merged.csv")

In [2]:
# Show the frame's (rows, columns) shape.
df.shape

(7898, 5)

In [3]:
# Columns every usable row must provide. Selecting them explicitly already
# discards 'id' (and any other extra column), so no separate drop() is needed.
# The former df.drop(columns=['id']) raised KeyError whenever this cell was
# re-run, because 'id' was gone from the already-trimmed frame.
required_columns = ['type', 'content', 'question', 'answer']
df = df[required_columns]

# Drop rows where any required column is missing (NaN).
df = df.dropna(subset=required_columns)

# Flag rows where any required field is empty or whitespace-only. The check runs
# column by column (vectorised) instead of calling a Python lambda once per row.
# astype(str) keeps non-string cells (e.g. numbers) from breaking the .str accessor.
has_blank_field = pd.Series(False, index=df.index, dtype=bool)
for column in required_columns:
    has_blank_field |= df[column].astype(str).str.strip().eq('')

# Keep only the fully populated rows and renumber them from 0.
df = df[~has_blank_field].reset_index(drop=True)

In [4]:
# Show the shape again to see how many rows and columns remain.
df.shape

(3417, 4)

In [5]:
# Count the missing (NaN/None) values in each column.
df.isnull().sum()

type        0
content     0
question    0
answer      0
dtype: int64

In [6]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

0

In [8]:
# Write the frame to click.csv, omitting the index column.
df.to_csv("click.csv", index=False)

In [10]:
import json

def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

def save_json(data, path):
    """Serialise ``data`` as JSON into the file at ``path`` (overwriting it).

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are stored as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, mode='w', encoding='utf-8') as sink:
        json.dump(
            data,
            sink,
            ensure_ascii=False,
            indent=2,
        )

def _entry_text(entry, key):
    """Return ``entry[key]`` stripped of surrounding whitespace, or "" if unusable.

    A missing key, a JSON null (None) or any non-string value all yield "",
    so the caller can treat them uniformly as "field not provided".
    """
    value = entry.get(key, "")
    return value.strip() if isinstance(value, str) else ""


def preprocess_to_alpaca_format(data, instruction="এই লেখাটির ভিত্তিতে প্রশ্নের উত্তর দাও।"):
    """Convert raw QA entries into Alpaca-style instruction records.

    Args:
        data: Iterable of dicts that may carry "content", "question" and
            "answer" keys.
        instruction: Text stored under "instruction" in every record. The
            default is Bengali for, roughly, "Answer the question based on
            this text."

    Returns:
        A list of dicts with the keys "instruction", "input" and "output".
        "input" combines the question and the content under Bengali labels;
        "output" is the answer. Entries are skipped when any of the three
        fields is missing, null, not a string or blank, or when the answer is
        identical to the content or to the question.
    """
    processed = []

    for entry in data:
        # dict.get's default only covers a *missing* key; a key that is present
        # with a null value used to reach .strip() and raise AttributeError.
        content = _entry_text(entry, "content")
        question = _entry_text(entry, "question")
        answer = _entry_text(entry, "answer")

        # Skip incomplete entries and answers that merely echo another field.
        if not (question and answer and content):
            continue
        if answer == content or answer == question:
            continue

        processed.append({
            "instruction": instruction,
            "input": f"প্রশ্ন: {question}\n\nলেখা:\n{content}",
            "output": answer,
        })

    return processed

def main(input_path="merged_data.json", output_path="qa_alpaca.json"):
    """Convert a raw QA JSON file into an Alpaca-style JSON file.

    Args:
        input_path: JSON file with the raw entries (dicts carrying "content",
            "question" and "answer"). Defaults to the path this notebook
            originally hard-coded; pass another path to reuse the step.
        output_path: Destination JSON file for the converted records.

    Prints how many usable entries were produced and where they were saved.
    """
    raw_data = load_json(input_path)
    processed_data = preprocess_to_alpaca_format(raw_data)

    print(f"✅ Total usable entries: {len(processed_data)}")
    save_json(processed_data, output_path)
    print(f"🚀 Saved preprocessed data to: {output_path}")


# Run the conversion with the default paths when this cell/script is executed directly.
if __name__ == "__main__":
    main()


✅ Total usable entries: 3363
🚀 Saved preprocessed data to: qa_alpaca.json


In [11]:
import json
import re

def load_json(path):
    """Open ``path`` as UTF-8 text, parse it as JSON and return the result."""
    with open(path, mode='r', encoding='utf-8') as source:
        document = json.load(source)
    return document

def save_json(data, path):
    """Write ``data`` to ``path`` as UTF-8 JSON, replacing any existing file.

    Output uses a two-space indent and keeps non-ASCII characters unescaped.
    """
    with open(path, mode='w', encoding='utf-8') as sink:
        json.dump(
            data,
            sink,
            ensure_ascii=False,
            indent=2,
        )

def is_bengali(text):
    """Return True if ``text`` contains at least one character in U+0980..U+09FF.

    That range is the Unicode Bengali block; a single hit is enough, so mixed
    text counts as Bengali. Empty text yields False.
    """
    for symbol in text:
        if '\u0980' <= symbol <= '\u09FF':
            return True
    return False

def clean_text(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space and leading/trailing whitespace is removed. No other
    characters are altered.

    Args:
        text: The string to normalise, or None.

    Returns:
        The normalised string. None is treated as an absent value and yields
        "", since a JSON null fetched with ``dict.get`` would otherwise make
        ``re.sub`` raise TypeError.
    """
    if text is None:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_denoise(data, min_len=5, max_len=1024):
    """Filter Alpaca-style records down to the ones considered clean.

    Each record's "instruction", "input" and "output" values are passed
    through ``clean_text`` first. A record is then dropped when:

    * any of the three cleaned fields is empty,
    * the output has fewer than ``min_len`` or more than ``max_len``
      whitespace-separated words,
    * the output occurs verbatim inside the input, or
    * ``is_bengali`` reports no Bengali character in the output.

    Args:
        data: Iterable of dicts that may carry the three keys above.
        min_len: Minimum number of words allowed in the output.
        max_len: Maximum number of words allowed in the output.

    Returns:
        A list of new dicts holding the cleaned "instruction", "input" and
        "output" of every record that survived the filters.
    """
    field_names = ("instruction", "input", "output")
    kept = []

    for record in data:
        # Clean the fields in a fixed order; the resulting dict is what gets stored.
        fields = {name: clean_text(record.get(name, "")) for name in field_names}

        # Any empty field disqualifies the record.
        if not (fields["instruction"] and fields["input"] and fields["output"]):
            continue

        answer = fields["output"]

        # Length filter is measured in whitespace-separated words of the answer.
        word_count = len(answer.split())
        if word_count < min_len or word_count > max_len:
            continue

        # Answers copied verbatim from the input are discarded.
        if answer in fields["input"]:
            continue

        # The answer must contain at least one Bengali character.
        if not is_bengali(answer):
            continue

        kept.append(fields)

    return kept

def main(input_path="qa_alpaca.json", output_path="qa_alpaca_cleaned.json"):
    """Denoise an Alpaca-style JSON dataset and save the result.

    Args:
        input_path: JSON file with the records to clean (dicts carrying
            "instruction", "input" and "output"). Defaults to the path this
            notebook originally hard-coded; pass another path to reuse the step.
        output_path: Destination JSON file for the cleaned records.

    Prints the record counts before and after cleaning and the output path.
    """
    raw_data = load_json(input_path)
    cleaned_data = preprocess_denoise(raw_data)

    print(f"✅ Entries before cleaning: {len(raw_data)}")
    print(f"🧹 Entries after cleaning: {len(cleaned_data)}")

    save_json(cleaned_data, output_path)
    print(f"🚀 Cleaned dataset saved to: {output_path}")


# Run the cleaning step with the default paths when this cell/script is executed directly.
if __name__ == "__main__":
    main()


✅ Entries before cleaning: 3363
🧹 Entries after cleaning: 3290
🚀 Cleaned dataset saved to: qa_alpaca_cleaned.json
