# PIP


In [None]:
# Install third-party packages into the environment of the running kernel.
# NOTE(review): neither package is used in the cells visible here — presumably
# they are needed by another part of the workflow; confirm before removing.
%pip install faker
%pip install easynmt #install package


# Data Cleaning


## Load Dataset


In [None]:
# Read the dataset from CSV into the `data` DataFrame and preview it.

import pandas as pd

# Relative path, so the file is resolved against the current working directory.
dataset_path = 'augmented_dataset_v4.csv'
data = pd.read_csv(dataset_path)

# The 'flags' column is currently kept; re-enable the line below to drop it.
# data = data.drop('flags', axis=1)

# Show the first rows as a quick sanity check of the load.
preview = data.head()
print(preview)


## Lowercase


In [None]:
# Lowercase every object-dtype column of the `data` DataFrame in place.
object_columns = data.select_dtypes(include=['object']).columns
for name in object_columns:
    lowered = data[name].str.lower()
    data[name] = lowered

# Preview the first rows after the conversion.
print(data.head())


## Check missing values


In [None]:
# Count the missing (null) values in each column of `data` and print the totals.
missing_per_column = data.isnull().sum()
print(missing_per_column)


## Drop missing values


In [None]:
# Discard the rows whose 'instruction_augmented' value is missing, then
# re-count the nulls per column to confirm the result.
required_columns = ['instruction_augmented']
data = data.dropna(subset=required_columns)
remaining_missing = data.isnull().sum()
print(remaining_missing)


## Remove symbols


# Function to clean the text

import re  # `re` is otherwise only imported further down the file


def clean_text(text):
    """Strip symbols from *text* while keeping placeholder braces.

    Every character that is not an ASCII letter, a digit, a curly brace
    or whitespace is removed, so ``{{placeholder}}`` markers survive the
    cleaning.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned_text = re.sub(r"[^a-zA-Z0-9\{\}\s]", "", text)
    return cleaned_text
# Run clean_text over every value of the 'instruction_augmented' column.
augmented_text = data['instruction_augmented']
data['instruction_augmented'] = augmented_text.apply(clean_text)


In [None]:

import re  # `re` is otherwise only imported further down the file


def clean_placeholders(text):
    """Remove filler words from inside ``{{...}}`` placeholders.

    Within each placeholder, the fillers "uh", "hmm" and "well" are
    replaced by a single space, but only where they are surrounded by a
    space on both sides. Text outside placeholders is left untouched.
    Because ``.`` does not match a newline, a placeholder that spans
    several lines is not matched.

    Args:
        text: The string whose placeholders should be cleaned.

    Returns:
        The string with cleaned placeholders.
    """
    def _strip_fillers(match):
        # Work on the text between the braces, then restore the braces.
        inner = match.group(1)
        for filler in (" uh ", " hmm ", " well "):
            inner = inner.replace(filler, " ")
        return "{{" + inner + "}}"

    return re.sub(r'\{\{(.*?)\}\}', _strip_fillers, text)

# Run clean_placeholders over every value of the 'instruction_augmented' column.
augmented_text = data['instruction_augmented']
data['instruction_augmented'] = augmented_text.apply(clean_placeholders)


# Check Distribution


## Category, Intent, NER


In [None]:

# Relative frequency of each value in the 'intent' and 'category' columns.
intent_share = data['intent'].value_counts(normalize=True)
print("Intent Distribution:\n", intent_share)
category_share = data['category'].value_counts(normalize=True)
print("\nCategory Distribution:\n", category_share)

# Check for NER patterns in the 'instruction' column
import re

def find_ner_patterns(text):
    """Return the contents of every ``{{...}}`` placeholder in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list with the text found between each pair of double braces,
        in order of appearance; empty when there is no placeholder.
    """
    return [found.group(1) for found in re.finditer(r'\{\{(.*?)\}\}', text)]

from collections import Counter

# Store, for each row, the list of placeholder names found in its instruction.
data['ner_patterns'] = data['instruction_augmented'].apply(find_ner_patterns)

# Optional: display only the rows that contain at least one placeholder.
# print("\nNER Patterns in 'instruction' column:")
# print(data[data['ner_patterns'].apply(lambda x: len(x) > 0)])

# Flatten the per-row lists into one list, then tally each placeholder name.
all_ner_patterns = []
for row_patterns in data['ner_patterns']:
    all_ner_patterns.extend(row_patterns)
ner_pattern_counts = Counter(all_ner_patterns)
print("\nNER Pattern Counts:")
# Bare expression so the notebook renders the Counter as the cell output.
ner_pattern_counts


## Augmentation Distribution


In [None]:
# Overall use of each augmentation technique: raw counts first, then the
# counts divided by the total number of rows.
aug_counts = data['augmentation_technique'].value_counts()
print("\nAugmentation Technique Distribution:\n", aug_counts)
total_rows = len(data)
print("\nAugmentation Technique Distribution (normalized):\n", aug_counts / total_rows)

# Intent x technique table of row counts; combinations that never occur are 0.
pair_sizes = data.groupby(['intent', 'augmentation_technique']).size()
aug_intent_distribution = pair_sizes.unstack(fill_value=0)
print("\nAugmentation Technique Distribution per Intent:\n", aug_intent_distribution)

# Within each intent, turn the counts into shares of that intent's total.
for intent, intent_counts in aug_intent_distribution.iterrows():
    intent_distribution = intent_counts / intent_counts.sum()
    print(f"\nDistribution for intent '{intent}':\n{intent_distribution}")


## Save Data


In [None]:

# Write the processed DataFrame to disk in two formats.

# CSV without the index column.
csv_path = "data_after_aug_preprocess.csv"
data.to_csv(csv_path, index=False)

# JSON Lines: one JSON record per row, one row per line.
json_lines_path = "data_after_aug_preprocess.json"
data.to_json(json_lines_path, orient="records", lines=True)
