In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
# Step 1: Load Data and Handle Missing Values
def load_data_with_missing_values(file_path, missing_marker=1.00000000000000e+99):
    """Read a header-less Excel sheet into a DataFrame with NaN for missing cells.

    Cells that hold comma-separated text (e.g. ``"1.5,2,1e99"``) are split
    into one float column per token; every other cell is kept as a single
    value.  Afterwards each value equal to ``missing_marker`` is replaced by
    ``NaN``.

    Parameters
    ----------
    file_path : str or path-like
        Workbook location, forwarded unchanged to ``pandas.read_excel``.
    missing_marker : float, default 1e99
        Sentinel that marks a missing entry in the file.

    Returns
    -------
    pandas.DataFrame
        The expanded table.  Columns are relabelled ``0..n-1`` so that
        several expanded text columns never produce duplicate labels.

    Raises
    ------
    ValueError
        If a comma-separated token cannot be parsed by ``float``.
    """
    raw = pd.read_excel(file_path, header=None)

    def _cell_to_values(cell):
        # A text cell carries several comma-separated numbers; anything else
        # is already a single value.
        if isinstance(cell, str):
            return [float(token) for token in cell.split(',')]
        return [cell]

    pieces = []
    for col in raw.columns:
        column = raw[col]
        if column.map(lambda cell: isinstance(cell, str)).any():
            # Build the expanded frame from plain lists instead of returning a
            # pd.Series per cell from Series.apply: this also works when the
            # column mixes text and numeric cells, and rows with fewer tokens
            # are padded with missing values.
            pieces.append(
                pd.DataFrame(
                    [_cell_to_values(cell) for cell in column],
                    index=column.index,
                )
            )
        else:
            pieces.append(column)

    if not pieces:
        # An empty sheet has no columns; pd.concat([]) would raise.
        return pd.DataFrame(index=raw.index)

    # ignore_index=True along axis=1 relabels the columns 0..n-1.
    combined = pd.concat(pieces, axis=1, ignore_index=True)

    # Return a new frame rather than mutating in place.
    return combined.replace(missing_marker, np.nan)

In [None]:
# Step 2: Impute Missing Values
def impute_missing_values(data, method='mean'):
    """Fill the missing entries of ``data`` with the selected strategy.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Table handed to the imputer's ``fit_transform``.
    method : {'mean', 'median', 'knn'}, default 'mean'
        ``'mean'`` and ``'median'`` use ``SimpleImputer`` with that strategy;
        ``'knn'`` uses ``KNNImputer`` with 5 neighbours.

    Returns
    -------
    pandas.DataFrame
        A frame built from the imputer output; no index or column labels are
        passed on, so pandas assigns its defaults.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Each entry pairs a method name with a factory, so only the requested
    # imputer is ever constructed.
    strategies = (
        ('mean', lambda: SimpleImputer(strategy='mean')),
        ('median', lambda: SimpleImputer(strategy='median')),
        ('knn', lambda: KNNImputer(n_neighbors=5)),
    )
    build_imputer = next(
        (factory for name, factory in strategies if method == name),
        None,
    )
    if build_imputer is None:
        raise ValueError("Invalid method. Choose 'mean', 'median', or 'knn'.")

    filled_values = build_imputer().fit_transform(data)
    return pd.DataFrame(filled_values)

In [None]:
# Step 3: Train a Classifier
def train_classifier(train_data, train_labels):
    """Fit a random-forest classifier and return it.

    Parameters
    ----------
    train_data : array-like or pandas.DataFrame
        Feature table passed to ``fit`` unchanged.
    train_labels : pandas.DataFrame or pandas.Series
        Labels; ``.values.ravel()`` flattens them to the 1-D array that
        ``fit`` receives.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator, created with ``random_state=42``.
    """
    flat_labels = train_labels.values.ravel()
    forest = RandomForestClassifier(random_state=42)
    forest.fit(train_data, flat_labels)
    return forest

In [None]:
# Step 4: Predict Test Labels
def predict(model, test_data):
    """Return whatever ``model.predict`` yields for ``test_data``.

    Parameters
    ----------
    model : object
        Any object exposing a ``predict`` method.
    test_data : object
        Input forwarded unchanged to ``model.predict``.
    """
    predicted_labels = model.predict(test_data)
    return predicted_labels

In [None]:
# Step 5: Save Predictions
def save_predictions(predictions, output_file):
    """Write ``predictions`` to ``output_file`` as CSV and report the path.

    The values are wrapped in a DataFrame and written without header row and
    without index column; a confirmation line is printed afterwards.

    Parameters
    ----------
    predictions : array-like
        Values accepted by the ``pandas.DataFrame`` constructor.
    output_file : str or path-like
        Destination handed to ``DataFrame.to_csv``.
    """
    prediction_table = pd.DataFrame(predictions)
    prediction_table.to_csv(output_file, header=False, index=False)
    print(f"Predictions saved to {output_file}")

In [None]:
# Step 6: Save Imputed Data
def save_imputed_data(imputed_data, output_file):
    """Write ``imputed_data`` to an Excel file and report the path.

    The table is written through its ``to_excel`` method without header row
    and without index column; a confirmation line is printed afterwards.

    Parameters
    ----------
    imputed_data : pandas.DataFrame
        Table to store.
    output_file : str or path-like
        Destination handed to ``to_excel``.
    """
    excel_options = {'index': False, 'header': False}
    imputed_data.to_excel(output_file, **excel_options)
    print(f"Imputed data saved to {output_file}")

In [1]:
# Main Workflow
# Runs the whole pipeline once per dataset.  The helper functions from the
# cells above must be defined first, so execute the notebook top to bottom
# (Restart Kernel -> Run All).
DATASET_IDS = range(1, 7)    # datasets 1-6
MAX_MISSING_DATA_ID = 3      # missing-value estimation is skipped for datasets > 3

for i in DATASET_IDS:
    input_file = f'./Dataset/output_MissingData{i}.xlsx'
    train_file = f'./Dataset/output_TrainData{i}.xlsx'
    label_file = f'./Dataset/output_TrainLabel{i}.xlsx'
    test_file = f'./Dataset/output_TestData{i}.xlsx'
    imputed_output_file = f'./MissingData/BleMissingResult{i}.xlsx'
    predictions_output_file = f'./Classification/BleClassification{i}.txt'

    # The save_* helpers only call to_excel / to_csv and do not create
    # directories, so make sure the output folders exist.
    for target in (imputed_output_file, predictions_output_file):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    # Load and impute the stand-alone missing-data file
    if i <= MAX_MISSING_DATA_ID:
        data_with_missing_values = load_data_with_missing_values(input_file)
        imputed_data = impute_missing_values(data_with_missing_values, method='mean')
        save_imputed_data(imputed_data, imputed_output_file)

    # Load training data, labels and test data
    train_data = load_data_with_missing_values(train_file)
    train_labels = pd.read_excel(label_file, header=None)
    test_data = load_data_with_missing_values(test_file)

    # Fill the gaps before the features reach the classifier.
    # NOTE: impute_missing_values fits a fresh imputer on whatever it is given
    # and does not return it, so the test features are imputed with statistics
    # of the test set itself rather than those of the training set.
    imputed_train_data = impute_missing_values(train_data, method='mean')
    imputed_test_data = impute_missing_values(test_data, method='mean')

    # Train the model
    model = train_classifier(imputed_train_data, train_labels)

    # Make predictions on the loaded test features (not on the file path)
    predictions = predict(model, imputed_test_data)

    # Save predictions
    save_predictions(predictions, predictions_output_file)


NameError: name 'load_data_with_missing_values' is not defined