In [8]:
# Test_Data_Preprocessing.ipynb

import os
import sys

import pandas as pd

# Adjust the path to include the src directory
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from data_preprocessing.data_preprocessing import DataPreprocessor

In [9]:
# Cell: Create sample data for testing
#
# Writes two small CSV fixtures under ./test_data that the later cells load:
#   - valid_file.csv   : complete data, no missing values
#   - invalid_file.csv : same columns, with three missing values

# Create the fixture directory. exist_ok=True makes the cell idempotent on
# re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs('test_data', exist_ok=True)

# Create a valid CSV file for testing
valid_data = {
    'Median_House_Value': [100000, 150000, 200000],
    'Median_Income': [2.5, 3.0, 3.5],
    'Median_Age': [30, 40, 50],
    'Tot_Rooms': [5, 6, 7],
    'Tot_Bedrooms': [2, 3, 4],
    'Population': [100, 150, 200]
}

valid_df = pd.DataFrame(valid_data)
# Guard the fixture itself so a later edit cannot silently introduce gaps.
assert not valid_df.isna().any().any(), "The valid fixture must not contain missing values."
valid_df.to_csv('test_data/valid_file.csv', index=False)

# Create a CSV file with missing data
invalid_data = {
    'Median_House_Value': [100000, None, 200000],  # Missing value in the second row
    'Median_Income': [2.5, 3.0, 3.5],
    'Median_Age': [30, 40, 50],
    'Tot_Rooms': [5, None, 7],  # Missing value in the second row
    'Tot_Bedrooms': [2, 3, 4],
    'Population': [100, 150, None]  # Missing value in the third row
}

invalid_df = pd.DataFrame(invalid_data)
# The missing-data test is only meaningful if this fixture really has gaps.
assert invalid_df.isna().any().any(), "The invalid fixture must contain missing values."
invalid_df.to_csv('test_data/invalid_file.csv', index=False)


In [10]:
# Cell: Verify that a well-formed CSV loads

# Point a preprocessor at the complete fixture file.
data_preprocessor = DataPreprocessor('test_data/valid_file.csv')

# The result of load_data() is used below as a truthy/falsy success flag.
data_loaded = data_preprocessor.load_data()

# Report the outcome in the notebook output before enforcing it.
if data_loaded:
    load_valid_status = 'Passed'
else:
    load_valid_status = 'Failed'
print(f"Test Load Valid Data: {load_valid_status}")

assert data_loaded, "The data should load successfully."


Data loaded successfully.
Test Load Valid Data: Passed


In [11]:
# Cell: Test loading an invalid file path

# Precondition: the path must really be absent, otherwise this test would
# fail (or pass) for the wrong reason.
assert not os.path.exists("test_data/non_existent_file.csv"), \
    "Precondition failed: the 'non-existent' file actually exists."

# Build a fresh preprocessor instead of re-pointing the instance created by
# the previous cell. That keeps this cell runnable on its own and avoids
# reusing an object that has already loaded another file.
# NOTE(review): assumes the constructor only records the path and does not
# validate it — confirm against DataPreprocessor.__init__.
data_preprocessor = DataPreprocessor("test_data/non_existent_file.csv")

# Test loading an invalid CSV file path; load_data() is expected to return a
# falsy value rather than raise.
data_loaded = data_preprocessor.load_data()
print(f"Test Load Invalid Data Path: {'Passed' if not data_loaded else 'Failed'}")
assert not data_loaded, "Loading data should fail with an invalid file path."


Error: The file 'test_data/non_existent_file.csv' does not exist.
Test Load Invalid Data Path: Passed


In [12]:
# Cell: Test checking data readiness

# Load valid data and check readiness
data_preprocessor = DataPreprocessor('test_data/valid_file.csv')

# Do not ignore the load result: if loading fails, stop here with a clear
# message instead of running the readiness check on an unloaded preprocessor.
readiness_input_loaded = data_preprocessor.load_data()
assert readiness_input_loaded, "Valid data must load before readiness can be checked."

# check_data_readiness() is treated as returning a truthy value when the
# loaded data is usable.
data_ready = data_preprocessor.check_data_readiness()
print(f"Test Data Readiness (Valid Data): {'Passed' if data_ready else 'Failed'}")
assert data_ready, "Data should be ready for the ML process."


Data loaded successfully.
Data is ready for the machine learning process.
Test Data Readiness (Valid Data): Passed


In [14]:
# Cell: Test handling missing data

# Load data with missing values and handle it
data_preprocessor = DataPreprocessor('test_data/invalid_file.csv')

# A failed load must fail the test. Previously it only printed a message,
# so the cell finished "green" without having tested anything.
missing_data_loaded = data_preprocessor.load_data()
assert missing_data_loaded, "Failed to load data. Cannot proceed with the test."

# Display data before handling missing values
print("Data before handling missing values:")
print(data_preprocessor.df)

# Precondition: the fixture must actually contain gaps, otherwise the final
# "no missing values" check would pass vacuously.
assert data_preprocessor.df.isnull().values.any(), \
    "Precondition failed: the fixture contains no missing values to handle."

# Handle missing data using the method in the DataPreprocessor class
data_preprocessor.fill_missing_data()

# Display data after handling missing values
print("Data after handling missing values:")
print(data_preprocessor.df)

# Verify that there are no missing values left
no_missing_data = not data_preprocessor.df.isnull().values.any()
print(f"Test Handling Missing Data: {'Passed' if no_missing_data else 'Failed'}")
assert no_missing_data, "All missing data should be handled."


Data loaded successfully.
Data before handling missing values:
   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            100000.0            2.5          30        5.0             2   
1                 NaN            3.0          40        NaN             3   
2            200000.0            3.5          50        7.0             4   

   Population  
0       100.0  
1       150.0  
2         NaN  
Missing data handled.
Data after handling missing values:
   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            100000.0            2.5          30        5.0             2   
1            150000.0            3.0          40        6.0             3   
2            200000.0            3.5          50        7.0             4   

   Population  
0       100.0  
1       150.0  
2       125.0  
Test Handling Missing Data: Passed
