In [1]:
import pandas as pd

# Read the prepared dataset from the working directory
df = pd.read_csv('camp_teach_cleaned_standardized_final.csv')

# Mean and standard deviation of each continuous measurement column,
# stored as {column: {'mean': ..., 'std': ...}}
summary_columns = ['age_rz', 'PREFEV', 'POSFEV', 'PREFVC', 'POSFVC']
mean_std_values = {
    column: {'mean': df[column].mean(), 'std': df[column].std()}
    for column in summary_columns
}

print(mean_std_values)


{'age_rz': {'mean': 4.028813742522333e-16, 'std': 1.0000502702023633}, 'PREFEV': {'mean': 1.614382811719942e-16, 'std': 1.0000502702023717}, 'POSFEV': {'mean': 4.314545213623208e-16, 'std': 1.0000502702023666}, 'PREFVC': {'mean': -3.4859239474306714e-16, 'std': 1.0000502702023724}, 'POSFVC': {'mean': 7.857615455274054e-17, 'std': 1.000050270202367}}


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import tkinter as tk
from tkinter import messagebox

# Load the correct dataset
df = pd.read_csv('camp_teach_cleaned_standardized_final.csv')

# Assign correct column names based on your dataset.
# NOTE(review): this relabels the columns purely by position. If the CSV's
# column order ever differs from this list the data is silently mislabelled
# (pandas only raises when the column COUNT differs) -- confirm the order.
df.columns = [
    'TX', 'TG', 'id', 'age_rz', 'GENDER', 'ETHNIC', 'PREFEV', 'PREFVC', 'PREFF', 'PREPF',
    'POSFEV', 'POSFVC', 'POSFF', 'POSPF', 'PREFEVPP', 'PREFVCPP', 'POSFEVPP', 'POSFVCPP',
    'visitc', 'fdays', 'FEV1_Change'
]

# Encode categorical variables as integer codes. Each fitted encoder is kept
# in label_encoders so the code <-> original value mapping stays available.
categorical_columns = ['TX', 'TG', 'GENDER', 'ETHNIC']
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Define the target column name correctly
target_column = 'FEV1_Change'  # This is a continuous variable

# Select only the features needed for training and prediction
selected_features = ['age_rz', 'PREFEV', 'POSFEV', 'PREFVC', 'POSFVC', 'TX', 'GENDER', 'ETHNIC']

# Decide which rows are train and which are test BEFORE imputing, so that the
# imputer can be fitted on training rows only. Fitting it on the full frame
# (as this cell used to) lets test rows act as neighbours for training rows,
# i.e. information leaks from the test set into the training data.
# Splitting row positions with the same test_size/random_state should select
# the same rows as splitting X and y directly did.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing values: KNN imputation learned from the training rows and
# then applied to every row.
# NOTE(review): the imputer still sees all columns, including the target, so a
# missing FEV1_Change would be filled in and used as a label -- confirm the
# cleaned file has no missing targets, or drop such rows instead.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(df.iloc[train_rows])
df_imputed = pd.DataFrame(imputer.transform(df), columns=df.columns)

# Split the dataset into features and target
X = df_imputed[selected_features]
y = df_imputed[target_column]

# Training and testing sets, restricted to the selected features
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Mean and standard deviation used to standardize the numeric form inputs.
# NOTE(review): the printed output of the first cell shows these columns
# already have mean ~0 and std ~1 in this file, so standardizing raw-unit
# inputs (years, litres) with these values leaves them essentially unchanged.
# The statistics of the ORIGINAL, unstandardized data are probably what is
# needed here -- confirm and replace.
numeric_columns = ['age_rz', 'PREFEV', 'POSFEV', 'PREFVC', 'POSFVC']
mean_std_values = {
    column: {'mean': df[column].mean(), 'std': df[column].std()}
    for column in numeric_columns
}

# Define a regressor pipeline (PCA removed)
def create_pipeline(regressor):
    """Wrap ``regressor`` in a one-step Pipeline whose only step is named 'reg'.

    The step name is what the 'reg__' prefix in hyperparameter grids refers to.
    """
    steps = [('reg', regressor)]
    return Pipeline(steps)

# Base estimators, both created with the same fixed seed
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)

# One named single-step pipeline per base estimator
regressors = [
    (label, create_pipeline(estimator))
    for label, estimator in (('dt', dt), ('rf', rf))
]

# Search space for the random forest. The 'reg__' prefix addresses the
# pipeline step named 'reg'. Kept deliberately small for faster training.
param_grid_rf = dict(
    reg__n_estimators=[50, 100],
    reg__max_depth=[10, 20],
    reg__min_samples_split=[2, 10],
    reg__min_samples_leaf=[1, 2],
)

# Randomized search (instead of an exhaustive grid search): 10 sampled
# settings, 3-fold cross-validation, scored by negative mean squared error
random_search_rf = RandomizedSearchCV(
    estimator=create_pipeline(rf),
    param_distributions=param_grid_rf,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1,
)
random_search_rf.fit(X_train, y_train)

# Report the winning setting and its cross-validated score
# (the score is a negated MSE, so values closer to zero are better)
print(random_search_rf.best_params_)
print(random_search_rf.best_score_)

# The refitted best pipeline is the model used for all later predictions
model = random_search_rf.best_estimator_

# Predict the held-out test rows
y_pred = model.predict(X_test)

# Test-set error and explained variance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Function to classify health based on predicted FEV1 change
def classify_health(fev1_change, threshold=0.2):
    """
    Classify health based on the predicted FEV1 change.

    Args:
        fev1_change: Predicted FEV1 change, on the same scale as the model's
            target.
        threshold: Non-negative half-width of the "stable" band around zero.
            Defaults to 0.2, the cut-off that used to be hard-coded.

    Returns:
        "Improved Health" when fev1_change > threshold,
        "Stable Health" when -threshold <= fev1_change <= threshold,
        "Deteriorated Health" otherwise. A NaN input fails both comparisons
        and therefore also ends up as "Deteriorated Health".

    Raises:
        ValueError: if threshold is negative (the stable band would be empty).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if fev1_change > threshold:
        return "Improved Health"
    if -threshold <= fev1_change <= threshold:
        return "Stable Health"
    return "Deteriorated Health"

# Main application window
root = tk.Tk()
root.title("Childhood Asthma Treatment Prediction")

# Captions for the eight input fields, in the order the model expects its
# features. The numeric ranges are given in unstandardized units.
# NOTE(review): the category codes shown to the user are hard-coded here,
# while the training data was encoded with LabelEncoder -- confirm that the
# codes listed below match the encoders' actual mapping.
labels = [
    "Age at Randomization (5-13 years)",
    "Pre-Treatment FEV1 (1.0-3.5 L)",
    "Post-Treatment FEV1 (1.0-3.5 L)",
    "Pre-Treatment FVC (1.0-4.0 L)",
    "Post-Treatment FVC (1.0-4.0 L)",
    "Treatment Group (0: Budesonide, 1: Nedocromil, 2: Placebo)",
    "Gender (0: Male, 1: Female)",
    "Ethnicity (0: White, 1: Black, 2: Hispanic, 3: Other)"
]

# One caption (column 0) and one text box (column 1) per row; the text boxes
# are collected in `entries` in display order
entries = []
for row, caption in enumerate(labels):
    tk.Label(root, text=caption).grid(row=row, column=0)
    field = tk.Entry(root)
    field.grid(row=row, column=1)
    entries.append(field)

# Standardization function to convert user input from unstandardized to standardized values
def standardize_input(value, mean, std):
    """Return the z-score of ``value``: its offset from ``mean`` divided by ``std``."""
    centered = value - mean
    return centered / std

# Create a predict button with debugging
def predict():
    """Read the form, build one feature row, and display the model's prediction.

    Button callback; takes no arguments and returns nothing. It reads the
    module-level ``entries``, ``mean_std_values``, ``selected_features`` and
    ``model`` and writes the outcome to ``result_label``.

    The leading fields (one per key of ``mean_std_values``, in that order) are
    parsed as finite floats and standardized; the remaining fields are parsed
    as integer category codes. An invalid field shows an error dialog and
    aborts the prediction. Any other failure is printed and reported through a
    generic error dialog instead of propagating into the Tk event loop.
    """
    import math  # local import: only this handler needs it

    try:
        inputs = []
        # The numeric fields appear in the same order as the keys of
        # mean_std_values, so resolve each field's column name once.
        numeric_columns = list(mean_std_values)

        # Iterate through the entries and convert them to floats
        for i, column in enumerate(numeric_columns):
            entry_value = entries[i].get().strip()  # Strip any extra whitespace
            print(f"Processing entry {i}: '{entry_value}' (Type: {type(entry_value)})")  # Debug print
            try:
                float_value = float(entry_value)
                # float() happily accepts 'nan' and 'inf'; treat them as bad
                # input here rather than letting the model fail later.
                if not math.isfinite(float_value):
                    raise ValueError("value must be a finite number")
                stats = mean_std_values[column]
                standardized_value = standardize_input(float_value, stats['mean'], stats['std'])
                print(f"Standardized value for entry {i}: {standardized_value}")
                inputs.append(standardized_value)
            except ValueError as ve:
                print(f"Error converting entry {i}: '{entry_value}' to float. Error: {ve}")
                messagebox.showerror("Input Error", f"Please enter a valid numeric value for field {i+1}.")
                return

        # Directly append categorical variables (already in the correct form)
        for i in range(len(numeric_columns), len(entries)):
            entry_value = entries[i].get().strip()  # Strip any extra whitespace
            print(f"Processing categorical entry {i}: '{entry_value}' (Type: {type(entry_value)})")  # Debug print
            try:
                int_value = int(entry_value)
                print(f"Converted categorical entry {i}: {int_value}")
                inputs.append(int_value)
            except ValueError as ve:
                print(f"Error converting entry {i}: '{entry_value}' to int. Error: {ve}")
                messagebox.showerror("Input Error", f"Please enter a valid integer code for field {i+1}.")
                return

        print(f"Final input list: {inputs}")
        # The model was fitted on a DataFrame, so predict with the same column
        # names: this keeps scikit-learn's feature-name check quiet and makes
        # a mismatch in the number of inputs fail loudly.
        feature_row = pd.DataFrame([inputs], columns=selected_features)
        prediction = model.predict(feature_row)[0]
        health_status = classify_health(prediction)
        result_label.config(text=f"Predicted FEV1 Change: {prediction:.4f}\nHealth Status: {health_status}")
    except Exception as e:
        print(f"Unexpected error: {e}")  # Print any unexpected error
        messagebox.showerror("Input Error", "An unexpected error occurred. Please try again.")

# "Predict" button below the eight input rows, spanning both columns;
# clicking it runs predict()
predict_button = tk.Button(root, command=predict, text="Predict")
predict_button.grid(column=0, row=8, columnspan=2)

# Initially empty label underneath the button; predict() writes its result here
result_label = tk.Label(root, text="")
result_label.grid(column=0, row=9, columnspan=2)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()


{'reg__n_estimators': 100, 'reg__min_samples_split': 2, 'reg__min_samples_leaf': 1, 'reg__max_depth': 20}
-0.0016839897979816984
Mean Squared Error: 0.0006930811365694203
R^2 Score: 0.9762327707731441
Processing entry 0: '6' (Type: <class 'str'>)
Standardized value for entry 0: 5.999698393947618
Processing entry 1: '1.8' (Type: <class 'str'>)
Standardized value for entry 1: 1.79990951818427
Processing entry 2: '2.8' (Type: <class 'str'>)
Standardized value for entry 2: 2.799859250508878
Processing entry 3: '3.1' (Type: <class 'str'>)
Standardized value for entry 3: 3.0998441702062416
Processing entry 4: '2.9' (Type: <class 'str'>)
Standardized value for entry 4: 2.8998542237413374
Processing categorical entry 5: '1' (Type: <class 'str'>)
Converted categorical entry 5: 1
Processing categorical entry 6: '0' (Type: <class 'str'>)
Converted categorical entry 6: 0
Processing categorical entry 7: '2' (Type: <class 'str'>)
Converted categorical entry 7: 2
Final input list: [5.999698393947618,

