In [1]:
# %% [markdown]
# # Research Data Cleaning Demo
# 
# This notebook demonstrates the advanced data cleaning functions.

# %%
import pandas as pd
import numpy as np
from data_cleaner import clean_data, handle_outliers, standardize_dates
import ipywidgets as widgets
from IPython.display import display

# %% [markdown]
# ## 1. Load Sample Data

# %%
# Create sample data
data = {
    'Participant': ['John Doe', 'Jane Smith', 'Bob Johnson', 'Alice Brown', 'Eve White'],
    'Age': [32, 28, 45, 36, 150],  # Contains outlier (150)
    'Test_Score': [85.5, 92.0, 78.5, 88.0, 200],  # Contains outlier (200)
    'Join_Date': ['2023-01-15', '15/02/2023', 'March 3 2023', '04-04-23', '2023/05/10'],
    'Comments': ['Good progress', '  Needs follow-up  ', 'On track', '', np.nan]
}

df = pd.DataFrame(data)
df

# %% [markdown]
# ## 2. Interactive Cleaning Demo

# %%
# Create interactive widgets
outlier_method = widgets.Dropdown(
    options=['clip', 'remove', 'median'],
    value='clip',
    description='Outlier Method:'
)

clean_button = widgets.Button(description="Clean Data")
output = widgets.Output()

def on_clean_button_clicked(b):
    with output:
        output.clear_output()
        print("Cleaning data...")
        
        # Make a copy for demo
        demo_df = df.copy()
        
        # Clean data
        cleaned_df = clean_data(demo_df)
        
        # Handle outliers with selected method
        for col in ['Age', 'Test_Score']:
            cleaned_df = handle_outliers(cleaned_df, col, method=outlier_method.value)
        
        # Standardize dates
        cleaned_df = standardize_dates(cleaned_df, 'Join_Date')
        
        print("\nBefore Cleaning:")
        display(df)
        
        print("\nAfter Cleaning:")
        display(cleaned_df)

clean_button.on_click(on_clean_button_clicked)

display(outlier_method, clean_button, output)

# %% [markdown]
# ## 3. Step-by-Step Cleaning Process

# %%
# Show each step individually
print("Original Data:")
display(df)

# %%
print("\n1. After Text Cleaning:")
text_cleaned = df.copy()
for col in text_cleaned.select_dtypes(include=['object']).columns:
    text_cleaned[col] = text_cleaned[col].apply(clean_text)
display(text_cleaned)

# %%
print("\n2. After Handling Outliers (clip method):")
outlier_handled = text_cleaned.copy()
for col in ['Age', 'Test_Score']:
    outlier_handled = handle_outliers(outlier_handled, col, method='clip')
display(outlier_handled)

# %%
print("\n3. After Date Standardization:")
date_standardized = outlier_handled.copy()
date_standardized = standardize_dates(date_standardized, 'Join_Date')
display(date_standardized)

# %% [markdown]
# ## 4. Export Options
# 
# The cleaned data can be exported to various formats:

# %%
def export_data(format):
    cleaned_df = clean_data(df)
    filename = f'cleaned_data.{format}'
    
    if format == 'csv':
        cleaned_df.to_csv(filename, index=False)
    elif format == 'excel':
        cleaned_df.to_excel(filename, index=False)
    elif format == 'json':
        cleaned_df.to_json(filename, orient='records')
    
    print(f"Data exported to {filename}")

export_format = widgets.Dropdown(
    options=['csv', 'excel', 'json'],
    value='csv',
    description='Export Format:'
)

export_button = widgets.Button(description="Export Data")

def on_export_clicked(b):
    export_data(export_format.value)

export_button.on_click(on_export_clicked)

display(export_format, export_button)

Dropdown(description='Outlier Method:', options=('clip', 'remove', 'median'), value='clip')

Button(description='Clean Data', style=ButtonStyle())

Output()

Original Data:


Unnamed: 0,Participant,Age,Test_Score,Join_Date,Comments
0,John Doe,32,85.5,2023-01-15,Good progress
1,Jane Smith,28,92.0,15/02/2023,Needs follow-up
2,Bob Johnson,45,78.5,March 3 2023,On track
3,Alice Brown,36,88.0,04-04-23,
4,Eve White,150,200.0,2023/05/10,



1. After Text Cleaning:


NameError: name 'clean_text' is not defined