In [1]:
# Import necessary libraries and create sample data
import pandas as pd
import sys
import os

# Adjust the path to include the src directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from data_preprocessing.data_preprocessing import DataPreprocessor

# Create a directory for test data if it doesn't exist
if not os.path.exists('test_data'):
    os.makedirs('test_data')

# Create a sample CSV file with missing values and categorical data for testing
data_with_categorical = {
    'Category': ['A', 'B', 'A', None, 'C'],
    'Value': [1, 2, None, 4, 5],
    'OtherValue': [10, 20, 30, None, 50]
}

df_with_categorical = pd.DataFrame(data_with_categorical)
df_with_categorical.to_csv('test_data/data_with_categorical.csv', index=False)
print("Sample data with categorical and missing values created for testing.")


Sample data with categorical and missing values created for testing.


In [2]:
# Cell: Test Handling Missing Data

# Initialize the DataPreprocessor with the sample file path
data_preprocessor = DataPreprocessor('test_data/data_with_categorical.csv')

# Ensure data is loaded successfully
if data_preprocessor.load_data():
    # Display data before handling readiness check
    print("Data before checking readiness for missing data:")
    print(data_preprocessor.df)

    # Test the readiness check method with parameters instead of interactive input
    is_ready = data_preprocessor.check_data_readiness(fill_missing='yes', convert_categorical='no')

    # Display results after attempting to handle missing data
    print(f"Data Readiness Check (Handling Missing Data): {'Passed' if is_ready else 'Failed'}")
    print("Data after checking readiness for missing data:")
    print(data_preprocessor.df)

    # Validate that missing data has been handled
    no_missing_data = not data_preprocessor.df.isnull().values.any()
    print(f"Test Handling Missing Data: {'Passed' if no_missing_data else 'Failed'}")
    assert no_missing_data, "All missing data should be handled."
else:
    print("Failed to load data. Cannot proceed with the test.")


Data loaded successfully.
Data before checking readiness for missing data:
  Category  Value  OtherValue
0        A    1.0        10.0
1        B    2.0        20.0
2        A    NaN        30.0
3      NaN    4.0         NaN
4        C    5.0        50.0
Data is not ready: Missing values found.
Columns with missing values:
Category      1
Value         1
OtherValue    1
dtype: int64
Missing data handled.
Missing values filled.
Data contains categorical/string columns: ['Category']
Please convert categorical data and try again.
Data Readiness Check (Handling Missing Data): Failed
Data after checking readiness for missing data:
  Category  Value  OtherValue
0        A    1.0        10.0
1        B    2.0        20.0
2        A    3.0        30.0
3        A    4.0        27.5
4        C    5.0        50.0
Test Handling Missing Data: Passed


In [3]:
# Cell: Test Conversion of Categorical Variables to Dummy Variables (Corrected)

# Reload the data for a fresh start
data_preprocessor = DataPreprocessor('test_data/data_with_categorical.csv')

# Ensure data is loaded successfully
if data_preprocessor.load_data():
    # Display data before handling missing values
    print("Data before handling missing values:")
    print(data_preprocessor.df)

    # Handle missing data first
    is_ready_after_missing_data = data_preprocessor.check_data_readiness(fill_missing='yes', convert_categorical='no')

    if not is_ready_after_missing_data:
        print("Failed to handle missing data. Cannot proceed with converting categorical variables.")
    else:
        # Display data after handling missing values
        print("Data after handling missing values:")
        print(data_preprocessor.df)

        # Convert categorical data to dummy variables
        is_ready_after_conversion = data_preprocessor.check_data_readiness(fill_missing='no', convert_categorical='yes')

        # Display results after conversion
        print(f"Data Readiness Check (Conversion of Categorical Variables): {'Passed' if is_ready_after_conversion else 'Failed'}")
        print("Data after converting categorical variables:")
        print(data_preprocessor.df)

        # Validate that categorical columns have been converted to dummy variables
        categorical_columns = ['Category']
        all_converted = all(col not in data_preprocessor.df.columns for col in categorical_columns) and \
                        any(col.startswith('Category_') for col in data_preprocessor.df.columns)
        print(f"Test Conversion of Categorical Variables: {'Passed' if all_converted else 'Failed'}")
        assert all_converted, "All categorical variables should be converted to dummy variables."
else:
    print("Failed to load data. Cannot proceed with the test.")



Data loaded successfully.
Data before handling missing values:
  Category  Value  OtherValue
0        A    1.0        10.0
1        B    2.0        20.0
2        A    NaN        30.0
3      NaN    4.0         NaN
4        C    5.0        50.0
Data is not ready: Missing values found.
Columns with missing values:
Category      1
Value         1
OtherValue    1
dtype: int64
Missing data handled.
Missing values filled.
Data contains categorical/string columns: ['Category']
Please convert categorical data and try again.
Failed to handle missing data. Cannot proceed with converting categorical variables.
