In [2]:
import json
import pandas as pd

In [None]:
# Path to the raw dataset, relative to the notebook's working directory.
input_file_path = '../Dataset_Raw.json'

# Read the raw JSON file into the working DataFrame used by every later cell.
df = pd.read_json(path_or_buf=input_file_path)

# Report (rows, columns) before any cleaning so later removals can be compared.
print(f"Initial shape of the dataset: {df.shape}")

Initial shape of the dataset: (2154, 9)


In [4]:
def standardize_labels(label_list):
    """Title-case every entry of a label list.

    Args:
        label_list: A list of labels, or any other value (e.g. NaN/None for
            a missing annotation).

    Returns:
        A new list in which each entry has been converted with ``str()`` and
        then ``.title()`` (first letter of each word upper-cased, the rest
        lower-cased). Any non-list input is returned unchanged.
    """
    # Guard clause: missing or otherwise non-list values pass straight through.
    if not isinstance(label_list, list):
        return label_list

    # str() first, so non-string entries are title-cased too instead of raising.
    return [str(entry).title() for entry in label_list]

# Names of the seven label columns: 'A_1' through 'A_7'.
label_columns = ['A_' + str(number) for number in range(1, 8)]

# Normalise the casing of the labels in each of those columns, one column at a time.
for col in label_columns:
    standardized_column = df[col].apply(standardize_labels)
    df[col] = standardized_column

In [5]:
# Row count before removing rows with missing labels.
initial_rows = df.shape[0]

# Drop, in place, every row that has a null in at least one label column.
df.dropna(subset=label_columns, how='any', inplace=True)

# Row count afterwards; the difference is the number of rows removed.
rows_after_na = df.shape[0]

print(f"\n Removed {initial_rows - rows_after_na} rows with null labels.")
print(f"Shape after dropping nulls: {df.shape}")


 Removed 4 rows with null labels.
Shape after dropping nulls: (2150, 9)


In [6]:
# Row count before de-duplication.
rows_before_duplicates = df.shape[0]

# Rows are duplicates when their 'text' values are equal; the first occurrence
# is kept and later ones are dropped in place.
df.drop_duplicates(subset='text', keep='first', inplace=True)

# Row count afterwards; the difference is the number of duplicates removed.
rows_after_duplicates = df.shape[0]

print(f"\nRemoved {rows_before_duplicates - rows_after_duplicates} duplicate text entries.")
print(f"Shape after dropping duplicates: {df.shape}")


Removed 214 duplicate text entries.
Shape after dropping duplicates: (1936, 9)


In [7]:
initial_rows = len(df)


def _is_login_issue_level1(labels):
    """Return True when `labels` is a non-empty list whose first element is 'Login Issue'."""
    return isinstance(labels, list) and len(labels) > 0 and labels[0] == 'Login Issue'


# Apply the check to every cell of the label columns, then flag each row in
# which at least one label column matched. This replaces a row-by-row
# df.iterrows() loop with column-wise operations while selecting the same rows.
login_issue_mask = (
    df[label_columns]
    .apply(lambda column: column.map(_is_login_issue_level1))
    .any(axis=1)
    .astype(bool)  # keeps the mask boolean even when the frame is empty
)

# Index labels of the flagged rows (kept under the original variable name).
rows_to_drop = login_issue_mask[login_issue_mask].index.tolist()

# Drop the identified rows
df.drop(index=rows_to_drop, inplace=True)
rows_after_dropping_login_issue = len(df)

print(f"\nRemoved {initial_rows - rows_after_dropping_login_issue} rows where 'Login Issue' was the Level 1 label in any annotator column.")
print(f"Shape after dropping 'Login Issue' rows: {df.shape}")


Removed 1 rows where 'Login Issue' was the Level 1 label in any annotator column.
Shape after dropping 'Login Issue' rows: (1935, 9)


In [8]:
# Discard the old index (left with gaps by the row removals above) and replace
# it with a fresh 0..n-1 index.
df.reset_index(drop=True, inplace=True)

# Overwrite 'id' with sequential, zero-padded 4-digit strings starting at '0001'.
df['id'] = [f'{number:04d}' for number in range(1, len(df) + 1)]

print("\nStep 4: Re-indexed the 'id' column to be sequential.")


Step 4: Re-indexed the 'id' column to be sequential.


In [None]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
output_file_path = '../Dataset_Clean.json'

# Write the cleaned frame as a JSON array with one object per row, indented by 4 spaces.
df.to_json(path_or_buf=output_file_path, orient='records', indent=4)

# Final summary of the cleaning run.
print("\n--- Cleaning Process Complete ---")
print(f"Final clean dataset has {len(df)} samples.")
print(f"Cleaned data successfully saved to: {output_file_path}")


--- Cleaning Process Complete ---
Final clean dataset has 1935 samples.
Cleaned data successfully saved to: Dataset_Clean.json
