In [None]:
import pandas as pd
import joblib
import numpy as np
import pandas as pd

# Load the feature ranking CSV file.
# header=None keeps the first CSV row as data, so it can be read back below
# as the list of feature names.
df = pd.read_csv('/kaggle/working/day_2048/featureranking.csv', header=None)

# Row 0 is read as the feature names. The last row is assumed to contain the
# aggregate feature importance scores (TODO confirm against whatever writes
# featureranking.csv); astype(float) fails loudly if that row is not numeric.
feature_names = df.iloc[0].tolist()
importance_scores = df.iloc[-1].astype(float).tolist()

# Pair every feature name with its importance score.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_scores
})

# Sort the features by importance in descending order
sorted_features = feature_importance_df.sort_values(by='Importance', ascending=False)

# Extract the top 8 features
top_8_features = sorted_features['Feature'].head(8).tolist()

print("Top 8 features based on importance:")
# Iterate over the list built above. The loop previously referenced
# `top_4_features`, a name this cell never defines, which raises NameError
# on a fresh kernel.
for i, feature in enumerate(top_8_features, start=1):
    print(f"{i}. {feature}")


Top 8 features based on importance:
1. km Z5-T1-T2.6
2. perceived trainingSuccess.6
3. perceived recovery
4. perceived exertion.4
5. perceived exertion.3
6. km sprinting.5
7. nr. sessions.5
8. km Z5-T1-T2.4


In [None]:
# Load the persisted preprocessing artifacts and the trained model(s).
# NOTE: joblib.load deserializes pickled objects, so these files should only
# ever come from a trusted source.
feature_order = joblib.load("/kaggle/working/stores/stats/feature_order.pkl")  # per the original note: list of 70 features
means = joblib.load("/kaggle/working/stores/stats/input_train_means.pkl")
stds = joblib.load("/kaggle/working/stores/stats/input_train_std.pkl")

# File names of the serialized models, resolved against /kaggle/working/stores/.
model_filenames = ["xgb_model_8061.joblib"]

# Deserialize every listed model into `models`, in the same order as the names.
models = [
    joblib.load(f"/kaggle/working/stores/{model_file}")
    for model_file in model_filenames
]

# Define preprocessing
def preprocess_input_1(user_input_dict):
    """Build one standardized feature row, filling gaps from ``means``.

    Args:
        user_input_dict: Mapping of feature name to raw value. A feature that
            is absent falls back to ``means.get(name, 0)``.

    Returns:
        A 2-D NumPy array with a single row whose columns follow
        ``feature_order``; each entry is ``(value - means) / stds``.
    """
    # Collect one raw value per feature, in the module-level column order.
    raw_values = []
    for name in feature_order:
        fallback = means.get(name, 0)
        raw_values.append(user_input_dict.get(name, fallback))

    # Start from an empty frame with the expected columns and add row 0.
    frame = pd.DataFrame(columns=feature_order)
    frame.loc[0] = raw_values

    # Standardize with the module-level statistics: centre first, then scale.
    centered = frame - means[feature_order]
    scaled = centered / stds[feature_order]

    return scaled.to_numpy()

# Define preprocessing
def preprocess_input(user_input_dict):
    """Build one standardized feature row, filling gaps with a raw 0.

    Args:
        user_input_dict: Mapping of feature name to raw value. A feature that
            is absent is given the raw value 0 (not the training mean), so
            after standardization it becomes ``(0 - mean) / std``.

    Returns:
        A 2-D NumPy array with a single row whose columns follow
        ``feature_order``; each entry is ``(value - means) / stds``.
    """
    # One single-element column per feature, in the module-level column order.
    columns = {}
    for name in feature_order:
        columns[name] = [user_input_dict.get(name, 0)]
    frame = pd.DataFrame(columns)

    # Standardize with the module-level statistics: centre first, then scale.
    centered = frame - means[feature_order]
    scaled = centered / stds[feature_order]

    return scaled.to_numpy()
# Predict injury risk
def predict_injury_risk(input_vec):
    """Average the models' class-1 probability for the first input row.

    Args:
        input_vec: Array-like passed unchanged to each model's
            ``predict_proba``.

    Returns:
        ``np.mean`` of ``predict_proba(input_vec)[0][1]`` taken over every
        entry of the module-level ``models`` list.
    """
    class_one_probs = []
    for estimator in models:
        # Only the first row is used; index 1 selects the second class column.
        first_row = estimator.predict_proba(input_vec)[0]
        class_one_probs.append(first_row[1])
    return np.mean(class_one_probs)

# --- Example usage ---
# Raw values for eight features (the top-8 list printed by the feature-ranking
# cell). preprocess_input() gives every feature missing from this dict a raw
# value of 0 before standardizing; it does NOT substitute the training mean
# (preprocess_input_1() is the variant that falls back to the means).
user_input_dict = {
    "km Z5-T1-T2.6": 20.7,
    "perceived trainingSuccess.6": 32.8,
    "perceived recovery": 0.83,
    "perceived exertion.4": 12.9,
    "perceived exertion.3": 1.6,
    "km sprinting.5": 9.0,
    "nr. sessions.5": 9.8,
    "km Z5-T1-T2.4": 9.6
}

# Run prediction: standardize the inputs, then average the loaded models'
# probabilities into a single score.
x_input = preprocess_input(user_input_dict)
injury_risk = predict_injury_risk(x_input)
# A score strictly above 0.5 is reported as high risk, anything else as low.
print(f"🔍 Injury Risk Prediction: {injury_risk:.2f} → {'⚠️ High' if injury_risk > 0.5 else '✅ Low'}")


🔍 Injury Risk Prediction: 0.69 → ⚠️ High
