In [8]:
import pandas as pd

# Load the raw merged dataset (the cleaned copy is written at the end of this cell)
file_path = "merged.csv"
df = pd.read_csv(file_path)

# Normalise column names: trim whitespace, lower-case, spaces -> underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Columns that must be numeric for the modelling cell
numeric_columns = [
    "player_height_inches", "player_weight", "min", "usg_pct", "pace", "poss",
    "fga_pg", "drives", "drive_fga", "drive_passes", "dist_miles", "avg_speed",
    "pull_up_fga", "pull_up_fg3a", "touches", "front_ct_touches", "avg_sec_per_touch",
    "avg_drib_per_touch", "elbow_touches", "post_touches", "paint_touches", "days_missed"
]

# Fail early with a readable message instead of a bare KeyError part-way through
# the conversion when the input file does not have the expected schema.
missing_columns = [col for col in numeric_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Expected numeric columns not found in {file_path}: {missing_columns}"
    )

# Convert to numeric in one pass; values that cannot be parsed become NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Summary of missing values per column after conversion (most-missing first)
missing_summary = df[numeric_columns].isnull().sum().sort_values(ascending=False)

# Small preview of the cleaned data
df_preview = df.head(3)

# Save the cleaned dataset to a new file
new_file_path = "cleaned_merged.csv"
df.to_csv(new_file_path, index=False)

# Must be the LAST expression of the cell: a bare expression in the middle of a
# cell is evaluated and discarded, so the summary would never be displayed.
missing_summary, df_preview


In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Load the cleaned dataset
df = pd.read_csv("cleaned_merged.csv")

# Step 1: Create the injury label ('yes' when `injured_on` is populated, else 'no')
df['injured'] = np.where(df['injured_on'].notna(), 'yes', 'no')

# Step 2: Select relevant features - these should include biomechanical and performance metrics
feature_columns = [
    'player_height_inches', 'player_weight', 'min', 'usg_pct', 'pace', 'poss',
    'fga_pg', 'drives', 'drive_fga', 'drive_passes', 'dist_miles', 'avg_speed',
    'pull_up_fga', 'pull_up_fg3a', 'touches', 'front_ct_touches', 'avg_sec_per_touch',
    'avg_drib_per_touch', 'elbow_touches', 'post_touches', 'paint_touches', 'gp'
]

features_raw = df[feature_columns]

# Get the target variable
y = df['injured']

# Step 3: Split FIRST, so that no statistic of the test rows (column means,
# standard deviations) can leak into the preprocessing fitted below.
# The split itself depends only on the row count, `y` and `random_state`,
# so the same rows land in train/test as when splitting preprocessed data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.25, random_state=45, stratify=y)

# Step 4: Preprocess the data - fit on the training rows only, then apply to both
imputer = SimpleImputer(strategy='mean')   # handle missing values
scaler = StandardScaler()                  # scale the features

X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full-dataset matrices kept under their original names for any later cells;
# they are produced with the train-fitted preprocessing, not re-fitted.
X = imputer.transform(features_raw)
X_scaled = scaler.transform(X)

# Step 5: Build and evaluate the model
model = RandomForestClassifier(n_estimators=200, random_state=76)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 6: Feature importance analysis (for understanding what factors contribute to injuries)
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Save the model and the fitted preprocessing for future use.
# The imputer is persisted as well: without it the exact preprocessing used at
# training time cannot be reproduced for new data that contains missing values.
import joblib
joblib.dump(model, 'injury_prediction_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(imputer, 'imputer.pkl')

# For IoT integration (conceptual example)
class InjuryMonitoringSystem:
    """Score new player data with a persisted classifier and scaler.

    The loaded model must provide ``predict``, ``predict_proba``, ``classes_``
    and ``feature_importances_``; the loaded scaler must provide ``transform``.
    Only the first row of the data passed in is reported.
    """

    def __init__(self, model_path, scaler_path, feature_names=None,
                 positive_label='yes', importance_threshold=0.05):
        """Load the persisted model and scaler.

        Args:
            model_path: Path of the joblib file holding the fitted classifier.
            scaler_path: Path of the joblib file holding the fitted scaler.
            feature_names: Optional feature names, in the positional order of
                the model inputs. When omitted, a module-level
                ``feature_columns`` list is used if one exists, otherwise
                generic ``feature_<i>`` names are generated.
            positive_label: Class label whose probability is reported as the
                injury probability.
            importance_threshold: A feature is reported as a risk factor only
                when its importance is strictly greater than this value.
        """
        # NOTE(security): joblib files are pickle-based, so loading one can
        # execute arbitrary code. Only load files from a trusted source.
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)
        self.feature_names = list(feature_names) if feature_names is not None else None
        self.positive_label = positive_label
        self.importance_threshold = importance_threshold

    def predict_injury_risk(self, player_data):
        """Predict injury risk for new player data.

        Args:
            player_data: 2-D array-like, one row per observation, with the
                feature values in the model's positional order.

        Returns:
            dict with the predicted label of the first row
            (``injury_prediction``), the probability of ``positive_label``
            for that row (``injury_probability``) and the list produced by
            ``get_risk_factors`` (``risk_factors``).

        Raises:
            ValueError: If ``positive_label`` is not one of the model's classes.
        """
        # Preprocess the new data
        processed_data = self.scaler.transform(player_data)
        # Make prediction
        prediction = self.model.predict(processed_data)
        probabilities = self.model.predict_proba(processed_data)

        # Columns of predict_proba follow model.classes_, so look the positive
        # class up instead of assuming it sits in column 1.
        positive_index = self._positive_class_index()

        return {
            'injury_prediction': prediction[0],
            'injury_probability': probabilities[0][positive_index],
            'risk_factors': self.get_risk_factors(player_data)
        }

    def get_risk_factors(self, player_data):
        """List the first row's values for the model's most important features.

        The importances are the model's global ``feature_importances_``; they
        are not specific to the row being scored.

        Returns:
            List of ``{'feature', 'value', 'importance'}`` dicts for features
            whose importance exceeds the threshold, most important first.
        """
        importance = self.model.feature_importances_
        # np.asarray makes positional row access work for DataFrames and
        # nested lists as well as for ndarrays.
        first_row = np.asarray(player_data)[0]
        names = self._resolve_feature_names(len(first_row))

        top_risks = [
            {'feature': names[i], 'value': val, 'importance': importance[i]}
            for i, val in enumerate(first_row)
            if importance[i] > self.importance_threshold  # only significant factors
        ]
        return sorted(top_risks, key=lambda x: x['importance'], reverse=True)

    def _positive_class_index(self):
        """Return the predict_proba column that corresponds to ``positive_label``."""
        classes = list(self.model.classes_)
        if self.positive_label not in classes:
            raise ValueError(
                f"Positive label {self.positive_label!r} not found in model classes {classes}"
            )
        return classes.index(self.positive_label)

    def _resolve_feature_names(self, n_features):
        """Pick feature names: explicit list, module-level list, or generic names."""
        if self.feature_names is not None:
            return self.feature_names
        # Backward compatibility: earlier usage relied on a module-level
        # `feature_columns` list defined alongside the training code.
        module_level = globals().get('feature_columns')
        if module_level is not None:
            return module_level
        return [f"feature_{i}" for i in range(n_features)]

# Example usage (conceptual): load the persisted artefacts and score one sample
monitoring_system = InjuryMonitoringSystem(
    model_path='injury_prediction_model.pkl',
    scaler_path='scaler.pkl',
)

# In a real system this row would come from IoT/wearable devices.
# Values are positional: one per model input feature, as a single-row 2-D array.
new_player_data = np.atleast_2d([
    48, 4000, 30.1, 0.806, 100.06, 3828,
    11.0, 3.2, 1.5, 0.8, 7.14, 3.99,
    1.6, 0.9, 46.7, 27.3, 2.94, 2.11,
    1.8, 2.1, 7.8, 61,
])

prediction = monitoring_system.predict_injury_risk(new_player_data)
print("\nExample Prediction for New Player Data:", prediction, sep="\n")

Model Accuracy: 0.886021505376344

Classification Report:
              precision    recall  f1-score   support

          no       0.88      0.98      0.93      1091
         yes       0.90      0.54      0.67       304

    accuracy                           0.89      1395
   macro avg       0.89      0.76      0.80      1395
weighted avg       0.89      0.89      0.87      1395


Feature Importance:
                 feature  importance
21                    gp    0.064605
5                   poss    0.061781
4                   pace    0.061632
10            dist_miles    0.059508
15      front_ct_touches    0.055411
14               touches    0.054597
6                 fga_pg    0.054568
2                    min    0.053940
3                usg_pct    0.050591
11             avg_speed    0.048369
17    avg_drib_per_touch    0.044394
16     avg_sec_per_touch    0.043795
20         paint_touches    0.040095
7                 drives    0.038045
13          pull_up_fg3a    0.038001
12