# Import


In [None]:
import pandas as pd


# Load Data


In [None]:

# Read the augmented, preprocessed dataset into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='data_after_aug_preprocess.csv')



# Small Data Preprocessing


In [None]:
# Instruction column before filtering: dtype first, then the number of missing entries
print("Before removing nulls:")
print(df['instruction_augmented'].dtype, df['instruction_augmented'].isna().sum(), sep="\n")

# Drop every row whose 'instruction_augmented' value is missing
df = df.dropna(axis=0, subset=['instruction_augmented'])

# Same two figures after filtering
print("\nAfter removing nulls:")
print(df['instruction_augmented'].dtype, df['instruction_augmented'].isna().sum(), sep="\n")


# Preview the first rows of the filtered `df`
print(df.head())


# Identify Placeholders


In [None]:
import pandas as pd
import re
from collections import Counter

# Flag each row whose instruction contains at least one {{...}} placeholder
df['has_placeholder'] = df['instruction_augmented'].str.contains(
    pat=r'\{\{.*?\}\}', regex=True
)

# Summing the boolean flags gives the number of flagged rows
rows_with_placeholders = df['has_placeholder'].sum()

# Collect the inner text of every {{...}} placeholder in a string
def extract_placeholders(text):
    """Return the contents of each ``{{...}}`` placeholder in *text*, in order of appearance.

    The inner text is returned verbatim (no stripping or case folding).
    """
    return [match.group(1) for match in re.finditer(r'\{\{(.*?)\}\}', text)]

# From the flagged rows only, extract the placeholder names and flatten
# the per-row lists into one long Series (one placeholder per element)
placeholders = (
    df.loc[df['has_placeholder'], 'instruction_augmented']
    .apply(extract_placeholders)
    .explode()
)

# Tally how often each distinct placeholder name occurs
placeholder_counts = Counter(placeholders)

# Tabulate (placeholder, frequency) pairs, most frequent first
placeholder_df = pd.DataFrame(
    list(placeholder_counts.items()),
    columns=['placeholder', 'frequency'],
).sort_values(by='frequency', ascending=False)

# Summary figures followed by the full frequency table
print(f"Rows with placeholders: {rows_with_placeholders}")
print(f"Total unique placeholders: {len(placeholder_df)}\n")
print("Distribution of Placeholders (Frequency Count):")
print(placeholder_df)



# Generate Fake Data for Placeholders and Save the Data


In [None]:
import re
import pandas as pd
from faker import Faker

# Single Faker instance shared by every generator below
fake = Faker()

# Fixed pools the categorical generators draw from
_ACCOUNT_TYPES = (
    'Personal', 'Business', 'Corporate', 'Enterprise', 'VIP',
    'Premium', 'Standard', 'Basic', 'Student', 'Non-Profit', 'Government'
)
_ACCOUNT_CATEGORIES = (
    'Retail', 'E-commerce', 'Technology', 'Finance', 'Healthcare',
    'Education', 'Real Estate', 'Hospitality', 'Manufacturing',
    'Legal', 'Entertainment', 'Consulting', 'Logistics'
)

# Lower-cased placeholder name -> zero-argument callable producing a synthetic value.
# Order and invoice numbers are drawn through `fake.unique`.
data_generators = {
    'order number': lambda: fake.unique.bothify(text='ORD-#######'),
    'invoice number': lambda: fake.unique.bothify(text='INV#####'),
    'person name': lambda: fake.name(),
    'account type': lambda: fake.random_element(elements=_ACCOUNT_TYPES),
    'account category': lambda: fake.random_element(elements=_ACCOUNT_CATEGORIES),
    'refund amount': lambda: f"${fake.random_number(digits=3)}.00",
    'currency symbol': lambda: fake.currency_symbol(),
    'delivery city': lambda: fake.city(),
    'delivery country': lambda: fake.country(),
}

# Function to replace placeholders and track NER labels
def replace_placeholders(text, data_generators):
    """Fill ``{{...}}`` placeholders in *text* with generated values.

    Args:
        text: Instruction string. Null values (as judged by ``pd.isnull``)
            are returned unchanged.
        data_generators: Mapping from normalized placeholder name (stripped
            and lower-cased) to a zero-argument callable that returns the
            replacement value.

    Returns:
        A ``(new_text, ner_labels)`` tuple. ``ner_labels`` contains one
        ``{'text': value, 'label': name}`` dict per replaced placeholder, in
        order of appearance, where ``name`` is the normalized placeholder
        name with spaces turned into underscores. Placeholders without a
        matching generator stay in the text and produce no label.
    """
    if pd.isnull(text):
        return text, []  # Nothing to substitute; no NER labels

    ner_labels = []  # Filled in by `substitute` as placeholders are replaced

    def substitute(match):
        name = match.group(1).strip().lower()  # Normalize case and padding
        if name not in data_generators:
            return match.group(0)  # Unknown placeholder: keep it verbatim
        # str() lets generators return non-string values (e.g. numbers).
        value = str(data_generators[name]())
        ner_labels.append({'text': value, 'label': name.replace(" ", "_")})
        return value

    # One left-to-right pass over the ORIGINAL text. Already-inserted values
    # are never re-scanned, so a generated value that happens to contain
    # "{{...}}" cannot be mistaken for a later placeholder.
    new_text = re.sub(r'\{\{(.*?)\}\}', substitute, text)
    return new_text, ner_labels

# Run the replacement on every instruction. Each call yields a
# (text, labels) pair, which pd.Series spreads across the two target columns.
def _replace_row(instruction):
    return pd.Series(replace_placeholders(instruction, data_generators))


df[['instruction_augmented', 'ner_labels_only']] = df['instruction_augmented'].apply(_replace_row)


# Rows not flagged as containing a placeholder always get an empty label list
def _labels_or_empty(row):
    if row['has_placeholder']:
        return row['ner_labels_only']
    return []


df['ner_labels_only'] = df.apply(_labels_or_empty, axis=1)

print(df.head())

# Persist the filled-in dataset without the index column
df.to_csv("synthetic_data.csv", index=False)


# Create BIO Tags (FORMAT) and Tokenize


In [None]:
import pandas as pd

# Read the synthetic dataset back from disk
df = pd.read_csv(filepath_or_buffer='synthetic_data.csv')

# Instruction column before filtering: dtype first, then the number of missing entries
print("Before removing nulls:")
print(df['instruction_augmented'].dtype, df['instruction_augmented'].isna().sum(), sep="\n")

# Drop every row whose 'instruction_augmented' value is missing
df = df.dropna(axis=0, subset=['instruction_augmented'])

# Same two figures after filtering
print("\nAfter removing nulls:")
print(df['instruction_augmented'].dtype, df['instruction_augmented'].isna().sum(), sep="\n")


# Convert CSV to JSON and Save Final Data for Training


In [None]:
import pandas as pd
import json

# Read the synthetic dataset that is about to be converted to JSON
df = pd.read_csv(filepath_or_buffer="synthetic_data.csv")

# Ensure the `ner_labels_only` column is properly formatted as JSON
def fix_json_format(value):
    """Parse one 'ner_labels_only' cell into a Python object.

    The cell text is tried, in order, as:

    1. strict JSON;
    2. a Python literal (e.g. ``str()`` of a list of dicts, which uses
       single-quoted strings and switches to double quotes for strings
       that contain an apostrophe);
    3. JSON after swapping every single quote for a double quote (the
       historical behaviour, kept for inputs such as ``{'a': null}``).

    Args:
        value: Raw cell value; anything that is not a ``str`` (e.g. NaN or
            ``None``) is treated as "no labels".

    Returns:
        The parsed object, or ``[]`` when *value* is not a string or cannot
        be parsed by any of the strategies above.
    """
    # Local import so this block does not depend on another cell's imports.
    import ast

    if not isinstance(value, str):
        return []

    try:
        return json.loads(value)
    except json.JSONDecodeError:
        pass  # Not strict JSON; try the Python-literal form next

    try:
        # literal_eval only evaluates literals, so it is safe on data text and
        # correctly handles apostrophes inside values such as "O'Brien".
        return ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        pass  # Not a Python literal either

    try:
        # Last resort: blanket quote swap. This corrupts values containing
        # apostrophes, which is why it is no longer the first strategy.
        return json.loads(value.replace("'", "\""))
    except json.JSONDecodeError:
        return []  # Unparseable cell: default to an empty label list

# Parse every stored label cell into a Python object
df["ner_labels_only"] = df["ner_labels_only"].map(fix_json_format)

# Write the frame as a list of row records, indented by 4 spaces
df.to_json(path_or_buf="fixed_dataset.json", orient="records", indent=4)

print("CSV successfully converted to valid JSON format.")
