In [None]:
# ==================================================
# INSTALL PACKAGES
# ==================================================
!pip install -q catboost
!pip install -q gradio joblib openpyxl

# ==================================================
# IMPORTS
# ==================================================
import pandas as pd
import numpy as np
import zipfile, os, joblib
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Import CatBoostClassifier here, after it's been installed
from catboost import CatBoostClassifier

import gradio as gr
import warnings
warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# If nothing was uploaded the mapping is empty; fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("No file was uploaded")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Persist the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as fh:
    fh.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Extract into a directory that belongs to this archive and search only
    # there. Walking the shared base_dir would also pick up workbooks left
    # behind by earlier uploads.
    extract_dir = file_path + "_extracted"
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, dirs, files_ in os.walk(extract_dir):
        dirs.sort()  # deterministic traversal order
        for name in sorted(files_):
            if name.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, name))
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose header row contains the target column.
selected_file = None
for candidate in excel_files:
    try:
        # Reading five rows is enough to inspect the column names.
        tmp = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as e:
        print(f"Skipping {candidate}, cannot read as Excel. Error: {e}")
        continue
    if "days_missed" in tmp.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No valid Excel file with 'days_missed' column found. "
                     "Please upload a proper .xlsx file or a ZIP containing it.")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Bucket a days-missed value into an injury-duration class.

    Returns 0 when ``v <= 5``, 1 when ``v <= 15``, and 2 for every other
    value (including values for which both comparisons are false).
    """
    if v <= 5:
        return 0
    if v <= 15:
        return 1
    return 2

# Target: bucket each row's days_missed into a class label (0, 1 or 2).
y = df["days_missed"].apply(categorize_days)
# Features: everything except the raw target and player_id; errors="ignore"
# tolerates a dataset without a player_id column.
X = df.drop(columns=["days_missed", "player_id"], errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Columns whose name contains "date" are parsed as datetimes (unparseable
# values become NaT) and replaced by the day offset from the column minimum.
for c in X.columns:
    if "date" in c.lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

X = pd.get_dummies(X, drop_first=True)

# Column layout the preprocessor is fitted on. It is persisted below because
# the saved preprocessor can only transform rows that have this exact layout.
all_features = X.columns.tolist()

# ==================================================
# TRAIN TEST SPLIT
# ==================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ==================================================
# PREPROCESSING
# ==================================================
# Median imputation followed by standard scaling, fitted on the training
# split only and then applied to the test split.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# The pipeline returns plain arrays, so the column labels are re-attached.
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=all_features)
X_test = pd.DataFrame(prep.transform(X_test), columns=all_features)

# ==================================================
# FEATURE SELECTION (RF + LASSO)
# ==================================================
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Stage 1: keep features with importance above 0.01; if none qualify, fall
# back to the five most important ones.
rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp[rf_imp > 0.01].index.tolist()
if not rf_features:
    rf_features = rf_imp.sort_values(ascending=False).head(5).index.tolist()

X_train = X_train[rf_features]
X_test = X_test[rf_features]

# Stage 2: LassoCV (a regressor) is fitted on the integer class labels and
# used only to keep the features with non-zero coefficients.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

final_features = X_train.columns[np.abs(lasso.coef_) > 0].tolist()
if not final_features:
    final_features = X_train.columns.tolist()

X_train = X_train[final_features]
X_test = X_test[final_features]

print("Final selected features:", final_features)

# ==================================================
# MODEL TRAINING (CATBOOST)
# ==================================================
model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=0
)

model.fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
preds = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, preds))
print(classification_report(
    y_test, preds,
    target_names=["Short-term", "Medium-term", "Long-term"]
))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
joblib.dump(model, "catboost_model.joblib")
joblib.dump(prep, "preprocessor.joblib")
joblib.dump(final_features, "features.joblib")
# Without the full column list the saved preprocessor cannot be fed
# correctly: it expects every encoded column, not just the selected ones.
joblib.dump(all_features, "all_features.joblib")

print("\nModel & artifacts saved")

# ==================================================
# GRADIO FRONTEND
# ==================================================
# Human-readable names for the three class labels produced by the model.
labels = {
    0: "Short-term Injury",
    1: "Medium-term Injury",
    2: "Long-term Injury"
}

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.
            ``None`` (an empty field) is treated as missing.

    Returns:
        The label text from ``labels`` for the predicted class.
    """
    # prep was fitted on every encoded column of X, so it must be given a row
    # with that full layout. Passing only the selected features fails its
    # input validation. Columns the UI does not expose stay NaN and are
    # filled by the imputer.
    all_columns = list(X.columns)
    row = dict.fromkeys(all_columns, np.nan)
    row.update(
        (name, np.nan if value is None else value)
        for name, value in zip(final_features, inputs)
    )
    frame = pd.DataFrame([row], columns=all_columns)

    # The pipeline returns a plain array; re-attach labels so the columns the
    # model was trained on can be selected by name.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_columns)

    # Flatten first: the prediction may come back as a nested array, and
    # int() on a non-scalar array is deprecated in NumPy.
    pred = np.ravel(model.predict(scaled[final_features]))[0]
    return labels[int(pred)]

# One numeric field per selected feature, in the order predict_ui expects.
inputs = [gr.Number(label=f) for f in final_features]

ui = gr.Interface(
    fn=predict_ui,
    inputs=inputs,
    outputs=gr.Textbox(label="Prediction"),
    title="üè• Injury Duration Prediction",
    description="CatBoost-based ML system with feature selection"
)

ui.launch()

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 6.6 MB/s eta 0:00:00
Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final selected features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']


NameError: name 'CatBoostClassifier' is not defined

In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# If nothing was uploaded the mapping is empty; fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("No file was uploaded")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Persist the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as fh:
    fh.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Extract into a directory that belongs to this archive and search only
    # there. Walking the shared base_dir would also pick up workbooks left
    # behind by earlier uploads.
    extract_dir = file_path + "_extracted"
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, dirs, files_ in os.walk(extract_dir):
        dirs.sort()  # deterministic traversal order
        for name in sorted(files_):
            if name.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, name))
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose header row contains the target column.
selected_file = None
for candidate in excel_files:
    try:
        # Reading five rows is enough to inspect the column names.
        tmp = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as e:
        # Report unreadable workbooks instead of silently swallowing every
        # error (a bare except would also hide KeyboardInterrupt).
        print(f"Skipping {candidate}, cannot read as Excel. Error: {e}")
        continue
    if "days_missed" in tmp.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Return the duration class for a days-missed value.

    0 for ``v <= 5``, 1 for ``v <= 15``, otherwise 2.
    """
    return 0 if v <= 5 else (1 if v <= 15 else 2)

# Target: bucket each row's days_missed into a class label (0, 1 or 2).
y = df["days_missed"].apply(categorize_days)
# Features: everything except the raw target and player_id; errors="ignore"
# tolerates a dataset without a player_id column.
X = df.drop(columns=["days_missed", "player_id"], errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Columns whose name contains "date" are parsed as datetimes (unparseable
# values become NaT) and replaced by the day offset from the column minimum.
for c in X.columns:
    if "date" in c.lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

X = pd.get_dummies(X, drop_first=True)

# Column layout the preprocessor is fitted on. It is persisted below because
# the saved preprocessor can only transform rows that have this exact layout.
all_features = X.columns.tolist()

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Median imputation followed by standard scaling, fitted on the training
# split only and then applied to the test split.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# The pipeline returns plain arrays, so the column labels are re-attached.
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=all_features)
X_test = pd.DataFrame(prep.transform(X_test), columns=all_features)

# ==================================================
# FEATURE SELECTION
# ==================================================
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Stage 1: keep features with importance above 0.01; an empty result is
# falsy, so the five most important features are used instead.
rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp[rf_imp > 0.01].index.tolist() or rf_imp.nlargest(5).index.tolist()

X_train = X_train[rf_features]
X_test = X_test[rf_features]

# Stage 2: LassoCV (a regressor) is fitted on the integer class labels and
# used only to keep the features with non-zero coefficients.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

final_features = X_train.columns[np.abs(lasso.coef_) > 0].tolist() or rf_features

X_train = X_train[final_features]
X_test = X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=0
)

model.fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE
# ==================================================
joblib.dump(model, "catboost_model.joblib")
joblib.dump(prep, "preprocessor.joblib")
joblib.dump(final_features, "features.joblib")
# Without the full column list the saved preprocessor cannot be fed
# correctly: it expects every encoded column, not just the selected ones.
joblib.dump(all_features, "all_features.joblib")

print("‚úÖ Model & artifacts saved")

# ==================================================
# GRADIO UI
# ==================================================
# Human-readable names for the three class labels produced by the model.
labels = {0: "Short-term Injury", 1: "Medium-term Injury", 2: "Long-term Injury"}

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.
            ``None`` (an empty field) is treated as missing.

    Returns:
        The label text from ``labels`` for the predicted class.
    """
    # prep was fitted on every encoded column of X, so it must be given a row
    # with that full layout. Passing only the selected features fails its
    # input validation. Columns the UI does not expose stay NaN and are
    # filled by the imputer.
    all_columns = list(X.columns)
    row = dict.fromkeys(all_columns, np.nan)
    row.update(
        (name, np.nan if value is None else value)
        for name, value in zip(final_features, inputs)
    )
    frame = pd.DataFrame([row], columns=all_columns)

    # The pipeline returns a plain array; re-attach labels so the columns the
    # model was trained on can be selected by name.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_columns)

    # Flatten first: the prediction may come back as a nested array, and
    # int() on a non-scalar array is deprecated in NumPy.
    pred = np.ravel(model.predict(scaled[final_features]))[0]
    return labels[int(pred)]

# One numeric field per selected feature, in the order predict_ui expects.
inputs = [gr.Number(label=f) for f in final_features]

ui = gr.Interface(
    fn=predict_ui,
    inputs=inputs,
    outputs=gr.Textbox(),
    title=" Duration Prediction"
)

ui.launch()


Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP (1).zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

‚úÖ Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook de



In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# If nothing was uploaded the mapping is empty; fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("No file was uploaded")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Persist the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as fh:
    fh.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Extract into a directory that belongs to this archive and search only
    # there. Walking the shared base_dir would also pick up workbooks left
    # behind by earlier uploads.
    extract_dir = file_path + "_extracted"
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, dirs, files_ in os.walk(extract_dir):
        dirs.sort()  # deterministic traversal order
        for name in sorted(files_):
            if name.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, name))
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose header row contains the target column.
selected_file = None
for candidate in excel_files:
    try:
        # Reading five rows is enough to inspect the column names.
        tmp = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as e:
        # Report unreadable workbooks instead of silently swallowing every
        # error (a bare except would also hide KeyboardInterrupt).
        print(f"Skipping {candidate}, cannot read as Excel. Error: {e}")
        continue
    if "days_missed" in tmp.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Map a days-missed value to class 0, 1 or 2.

    The class is the position of the first upper bound in (5, 15) that ``v``
    does not exceed; values matching neither bound get class 2.
    """
    for label, upper_bound in enumerate((5, 15)):
        if v <= upper_bound:
            return label
    return 2

# Target: bucket each row's days_missed into a class label via categorize_days.
y = df["days_missed"].apply(categorize_days)
# Features: everything except the raw target and player_id; errors="ignore"
# means a missing player_id column is not an error.
X = df.drop(columns=["days_missed", "player_id"], errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime (values that
# fail to parse are coerced to NaT) and replaced by the number of days since
# that column's earliest value.
# NOTE(review): the minimum is taken over the full dataset, before the split,
# and is not persisted with the other artifacts.
for c in X.columns:
    if "date" in c.lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

# One-hot encode; drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
# This is the column layout `prep` is fitted on; predict_ui rebuilds a full
# row in this layout before calling prep.transform.
all_features = X.columns.tolist()

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# 80/20 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Median imputation followed by standard scaling.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Fit on the training split only, then apply to the test split. The pipeline
# output is wrapped back into DataFrames labelled with all_features.
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=all_features)
X_test  = pd.DataFrame(prep.transform(X_test), columns=all_features)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: random-forest importances on the scaled training data.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Keep features with importance above 0.01; an empty list is falsy, so the
# `or` falls back to the five most important features.
rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp[rf_imp > 0.01].index.tolist() or rf_imp.nlargest(5).index.tolist()

X_train = X_train[rf_features]
X_test  = X_test[rf_features]

# Stage 2: LassoCV (5-fold) is fitted on the integer class labels and used
# only to identify features with non-zero coefficients.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

# If every coefficient is zero, keep the stage-1 feature list unchanged.
final_features = X_train.columns[np.abs(lasso.coef_) > 0].tolist() or rf_features

X_train = X_train[final_features]
X_test  = X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost model trained on the scaled, selected features.
model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=0
)

model.fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Accuracy and per-class report on the held-out test split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Model, fitted preprocessor, selected feature names, and the full column
# layout the preprocessor expects, written to the working directory.
joblib.dump(model, "catboost_model.joblib")
joblib.dump(prep, "preprocessor.joblib")
joblib.dump(final_features, "features.joblib")
joblib.dump(all_features, "all_features.joblib")

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED – NO ERRORS)
# ==================================================
# Human-readable names for the three class labels produced by the model.
labels = {0: "Short-term Injury", 1: "Medium-term Injury", 2: "Long-term Injury"}

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.
            ``None`` (a cleared field) is treated as missing.

    Returns:
        The label text from ``labels`` for the predicted class.
    """
    # Build a row in the full layout the preprocessor was fitted on. Columns
    # the UI does not expose stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(
        (name, np.nan if value is None else value)
        for name, value in zip(final_features, inputs)
    )
    frame = pd.DataFrame([row], columns=all_features)

    # Apply preprocessing, then re-attach labels so the columns the model was
    # trained on can be selected by name.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)

    # Flatten first: the prediction may come back as a nested array, and
    # int() on a non-scalar array is deprecated in NumPy.
    pred = np.ravel(model.predict(scaled[final_features]))[0]
    return labels[int(pred)]

# One numeric field per selected feature, in the order predict_ui expects.
inputs = [gr.Number(label=f, value=0) for f in final_features]

ui = gr.Interface(
    fn=predict_ui,
    inputs=inputs,
    outputs=gr.Textbox(label="Prediction"),
    title=" Injury Duration Prediction"
)

ui.launch()


Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP (2).zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detec



In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# If nothing was uploaded the mapping is empty; fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("No file was uploaded")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Persist the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as fh:
    fh.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Extract into a directory that belongs to this archive and search only
    # there. Walking the shared base_dir would also pick up workbooks left
    # behind by earlier uploads.
    extract_dir = file_path + "_extracted"
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, dirs, files_ in os.walk(extract_dir):
        dirs.sort()  # deterministic traversal order
        for name in sorted(files_):
            if name.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, name))
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose header row contains the target column.
selected_file = None
for candidate in excel_files:
    try:
        # Reading five rows is enough to inspect the column names.
        tmp = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as e:
        # Report unreadable workbooks instead of silently swallowing every
        # error (a bare except would also hide KeyboardInterrupt).
        print(f"Skipping {candidate}, cannot read as Excel. Error: {e}")
        continue
    if "days_missed" in tmp.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Classify a days-missed value as 0 (<= 5), 1 (<= 15) or 2 (anything else)."""
    duration_class = 2  # default when neither bound is satisfied
    if v <= 5:
        duration_class = 0
    elif v <= 15:
        duration_class = 1
    return duration_class

# Target: bucket each row's days_missed into a class label via categorize_days.
y = df["days_missed"].apply(categorize_days)
# Features: everything except the raw target and player_id; errors="ignore"
# means a missing player_id column is not an error.
X = df.drop(columns=["days_missed", "player_id"], errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime (values that
# fail to parse are coerced to NaT) and replaced by the number of days since
# that column's earliest value.
# NOTE(review): the minimum is taken over the full dataset, before the split,
# and is not persisted with the other artifacts.
for c in X.columns:
    if "date" in c.lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

# One-hot encode; drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
# This is the column layout `prep` is fitted on; predict_ui rebuilds a full
# row in this layout before calling prep.transform.
all_features = X.columns.tolist()

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# 80/20 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Median imputation followed by standard scaling.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Fit on the training split only, then apply to the test split. The pipeline
# output is wrapped back into DataFrames labelled with all_features.
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=all_features)
X_test  = pd.DataFrame(prep.transform(X_test), columns=all_features)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: random-forest importances on the scaled training data.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Keep features with importance above 0.01; an empty list is falsy, so the
# `or` falls back to the five most important features.
rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp[rf_imp > 0.01].index.tolist() or rf_imp.nlargest(5).index.tolist()

X_train = X_train[rf_features]
X_test  = X_test[rf_features]

# Stage 2: LassoCV (5-fold) is fitted on the integer class labels and used
# only to identify features with non-zero coefficients.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

# If every coefficient is zero, keep the stage-1 feature list unchanged.
final_features = X_train.columns[np.abs(lasso.coef_) > 0].tolist() or rf_features

X_train = X_train[final_features]
X_test  = X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost model trained on the scaled, selected features.
model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=0
)

model.fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Accuracy and per-class report on the held-out test split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Model, fitted preprocessor, selected feature names, and the full column
# layout the preprocessor expects, written to the working directory.
joblib.dump(model, "catboost_model.joblib")
joblib.dump(prep, "preprocessor.joblib")
joblib.dump(final_features, "features.joblib")
joblib.dump(all_features, "all_features.joblib")

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED – NO ERRORS)
# ==================================================
# Human-readable names for the three class labels produced by the model.
labels = {0: "Short-term Injury", 1: "Medium-term Injury", 2: "Long-term Injury"}

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.
            ``None`` (a cleared field) is treated as missing.

    Returns:
        The label text from ``labels`` for the predicted class.
    """
    # Build a row in the full layout the preprocessor was fitted on. Columns
    # the UI does not expose stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(
        (name, np.nan if value is None else value)
        for name, value in zip(final_features, inputs)
    )
    frame = pd.DataFrame([row], columns=all_features)

    # Apply preprocessing, then re-attach labels so the columns the model was
    # trained on can be selected by name.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)

    # Flatten first: the prediction may come back as a nested array, and
    # int() on a non-scalar array is deprecated in NumPy.
    pred = np.ravel(model.predict(scaled[final_features]))[0]
    return labels[int(pred)]

# One numeric field per selected feature, in the order predict_ui expects.
inputs = [gr.Number(label=f, value=0) for f in final_features]

ui = gr.Interface(
    fn=predict_ui,
    inputs=inputs,
    outputs=gr.Textbox(label="Prediction"),
    title=" Duration Prediction"
)

ui.launch()


Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP (3).zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detec



In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# If nothing was uploaded the mapping is empty; fail with a clear message
# instead of an IndexError on the first-key lookup below.
if not uploaded:
    raise ValueError("No file was uploaded")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Persist the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as fh:
    fh.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Extract into a directory that belongs to this archive and search only
    # there. Walking the shared base_dir would also pick up workbooks left
    # behind by earlier uploads.
    extract_dir = file_path + "_extracted"
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, dirs, files_ in os.walk(extract_dir):
        dirs.sort()  # deterministic traversal order
        for name in sorted(files_):
            if name.lower().endswith((".xlsx", ".xls")):
                excel_files.append(os.path.join(root, name))
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose header row contains the target column.
selected_file = None
for candidate in excel_files:
    try:
        # Reading five rows is enough to inspect the column names.
        tmp = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as e:
        # Report unreadable workbooks instead of silently swallowing every
        # error (a bare except would also hide KeyboardInterrupt).
        print(f"Skipping {candidate}, cannot read as Excel. Error: {e}")
        continue
    if "days_missed" in tmp.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Return 0, 1 or 2 for a days-missed value.

    The result is the index of the first limit in (5, 15) with ``v <= limit``;
    when no limit matches, the result is 2.
    """
    limits = (5, 15)
    return next((idx for idx, limit in enumerate(limits) if v <= limit), 2)

# Target: bucket each row's days_missed into a class label via categorize_days.
y = df["days_missed"].apply(categorize_days)
# Features: everything except the raw target and player_id; errors="ignore"
# means a missing player_id column is not an error.
X = df.drop(columns=["days_missed", "player_id"], errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime (values that
# fail to parse are coerced to NaT) and replaced by the number of days since
# that column's earliest value.
# NOTE(review): the minimum is taken over the full dataset, before the split,
# and is not persisted with the other artifacts.
for c in X.columns:
    if "date" in c.lower():
        X[c] = pd.to_datetime(X[c], errors="coerce")
        X[c] = (X[c] - X[c].min()).dt.days

# One-hot encode; drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
# This is the column layout `prep` is fitted on; predict_ui rebuilds a full
# row in this layout before calling prep.transform.
all_features = X.columns.tolist()

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# 80/20 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Median imputation followed by standard scaling.
prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Fit on the training split only, then apply to the test split. The pipeline
# output is wrapped back into DataFrames labelled with all_features.
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=all_features)
X_test  = pd.DataFrame(prep.transform(X_test), columns=all_features)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: random-forest importances on the scaled training data.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Keep features with importance above 0.01; an empty list is falsy, so the
# `or` falls back to the five most important features.
rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp[rf_imp > 0.01].index.tolist() or rf_imp.nlargest(5).index.tolist()

X_train = X_train[rf_features]
X_test  = X_test[rf_features]

# Stage 2: LassoCV (5-fold) is fitted on the integer class labels and used
# only to identify features with non-zero coefficients.
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)

# If every coefficient is zero, keep the stage-1 feature list unchanged.
final_features = X_train.columns[np.abs(lasso.coef_) > 0].tolist() or rf_features

X_train = X_train[final_features]
X_test  = X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost model trained on the scaled, selected features.
model = CatBoostClassifier(
    loss_function="MultiClass",
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=0
)

model.fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Accuracy and per-class report on the held-out test split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Model, fitted preprocessor, selected feature names, and the full column
# layout the preprocessor expects, written to the working directory.
joblib.dump(model, "catboost_model.joblib")
joblib.dump(prep, "preprocessor.joblib")
joblib.dump(final_features, "features.joblib")
joblib.dump(all_features, "all_features.joblib")

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED – NO ERRORS)
# ==================================================
labels = {0: "Short-term Injury", 1: "Medium-term Injury", 2: "Long-term Injury"}

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Relies on the module-level ``all_features``, ``final_features``,
    ``prep``, ``model`` and ``labels`` objects.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.

    Returns:
        The text from ``labels`` for the predicted class.
    """
    # Start from an all-missing row over every training column so the fitted
    # preprocessor receives the column layout it was fitted on; columns the
    # UI does not collect stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(zip(final_features, inputs))

    # Pin the column order explicitly and cast to float so an empty input
    # (which may arrive as None) becomes NaN and is imputed like any other
    # missing value.
    frame = pd.DataFrame([row], columns=all_features).astype(float)

    # Apply the fitted preprocessing, then keep only the model's features.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)
    selected = scaled[final_features]

    # predict() can return a nested array (one row per sample). Flatten it
    # before taking the first element so int() receives a scalar; calling
    # int() on a 1-element array is deprecated in recent NumPy releases.
    pred = np.asarray(model.predict(selected)).ravel()[0]
    return labels[int(pred)]

# One numeric field per selected feature, pre-filled with 0. Widget order
# follows final_features, which is the order predict_ui pairs values with.
inputs = [gr.Number(value=0, label=name) for name in final_features]

# Wire the prediction function to the inputs and a single text output.
ui = gr.Interface(
    predict_ui,
    inputs,
    gr.Textbox(label="Prediction"),
    title=" Duration Prediction",
)

ui.launch()

Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP (1).zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detec



In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when the upload result
# is empty (e.g. the dialog was dismissed without choosing a file).
if not uploaded:
    raise ValueError("No file uploaded")

file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Write the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as f:
    f.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Unpack into a fresh directory so the search below only sees this
    # archive's contents, not files left in base_dir by earlier uploads.
    import tempfile

    extract_dir = tempfile.mkdtemp(prefix="extracted_", dir=base_dir)
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, _, files_ in os.walk(extract_dir):
        excel_files.extend(
            os.path.join(root, name)
            for name in files_
            if name.lower().endswith((".xlsx", ".xls"))
        )
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose columns include the target; only a few rows
# are read for this probe.
selected_file = None
for candidate in excel_files:
    try:
        preview = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as exc:
        # Best-effort probe: an unreadable workbook is skipped, but the
        # reason is reported instead of being silently swallowed.
        print(f"Skipping {os.path.basename(candidate)}: {exc}")
        continue
    if "days_missed" in preview.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Map a days-missed count to a class index.

    Returns 0 when ``v <= 5``, 1 when ``v <= 15``, and 2 otherwise. A value
    for which both comparisons are false (such as NaN) therefore yields 2.
    """
    # Test the upper bound first via negation so that non-comparable values
    # (NaN) still land in the final bucket.
    if not v <= 15:
        return 2
    return 0 if v <= 5 else 1

# Bucket the raw day counts into the three target classes.
y = df["days_missed"].apply(categorize_days)
# Predictors: everything except the target and the player identifier
# (errors="ignore" tolerates either column being absent).
X = df.drop(["days_missed", "player_id"], axis=1, errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime
# (unparseable entries become NaT) and replaced by the number of whole days
# since that column's earliest value.
for date_col in [name for name in X.columns if "date" in name.lower()]:
    parsed = pd.to_datetime(X[date_col], errors="coerce")
    X[date_col] = (parsed - parsed.min()).dt.days

# One-hot encode, dropping the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
all_features = list(X.columns)

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42,
)

# Median imputation followed by standardisation. The pipeline is fitted on
# the training split only and then reused unchanged for the test split.
prep = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
prep.fit(X_train)

# Transform both splits and re-wrap the resulting arrays as DataFrames
# labelled with the full training column list.
X_train, X_test = (
    pd.DataFrame(prep.transform(split), columns=all_features)
    for split in (X_train, X_test)
)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: rank features with a random forest and keep those whose importance
# exceeds 0.01; if none do, fall back to the five most important.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp.loc[rf_imp.gt(0.01)].index.tolist()
if not rf_features:
    rf_features = rf_imp.nlargest(5).index.tolist()

X_train, X_test = X_train[rf_features], X_test[rf_features]

# Stage 2: fit a cross-validated Lasso on the survivors and keep only the
# features with a non-zero coefficient; if all are zero, keep the stage-1 set.
lasso = LassoCV(cv=5).fit(X_train, y_train)

final_features = [
    name
    for name, coef in zip(X_train.columns, lasso.coef_)
    if np.abs(coef) > 0
]
if not final_features:
    final_features = rf_features

X_train, X_test = X_train[final_features], X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost classifier trained on the selected features.
# fit() hands back the estimator, so construction and training are chained.
model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    verbose=0,
).fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Score the held-out split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Persist the model, the fitted preprocessor and both feature lists to the
# current working directory.
for artifact, target in (
    (model, "catboost_model.joblib"),
    (prep, "preprocessor.joblib"),
    (final_features, "features.joblib"),
    (all_features, "all_features.joblib"),
):
    joblib.dump(artifact, target)

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED - NO ERRORS)
# ==================================================
# Class index -> text shown in the UI.
labels = dict(enumerate(
    ("Short-term Injury", "Medium-term Injury", "Long-term Injury")
))

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Relies on the module-level ``all_features``, ``final_features``,
    ``prep``, ``model`` and ``labels`` objects.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.

    Returns:
        The text from ``labels`` for the predicted class.
    """
    # Start from an all-missing row over every training column so the fitted
    # preprocessor receives the column layout it was fitted on; columns the
    # UI does not collect stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(zip(final_features, inputs))

    # Pin the column order explicitly and cast to float so an empty input
    # (which may arrive as None) becomes NaN and is imputed like any other
    # missing value.
    frame = pd.DataFrame([row], columns=all_features).astype(float)

    # Apply the fitted preprocessing, then keep only the model's features.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)
    selected = scaled[final_features]

    # predict() can return a nested array (one row per sample). Flatten it
    # before taking the first element so int() receives a scalar; calling
    # int() on a 1-element array is deprecated in recent NumPy releases.
    pred = np.asarray(model.predict(selected)).ravel()[0]
    return labels[int(pred)]

# One numeric field per selected feature, pre-filled with 0. Widget order
# follows final_features, which is the order predict_ui pairs values with.
inputs = [gr.Number(value=0, label=name) for name in final_features]

# Wire the prediction function to the inputs and a single text output.
ui = gr.Interface(
    predict_ui,
    inputs,
    gr.Textbox(label="Prediction"),
    title=" Duration Prediction",
)

ui.launch()

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hUpload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected.



In [None]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when the upload result
# is empty (e.g. the dialog was dismissed without choosing a file).
if not uploaded:
    raise ValueError("No file uploaded")

file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Write the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as f:
    f.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Unpack into a fresh directory so the search below only sees this
    # archive's contents, not files left in base_dir by earlier uploads.
    import tempfile

    extract_dir = tempfile.mkdtemp(prefix="extracted_", dir=base_dir)
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, _, files_ in os.walk(extract_dir):
        excel_files.extend(
            os.path.join(root, name)
            for name in files_
            if name.lower().endswith((".xlsx", ".xls"))
        )
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose columns include the target; only a few rows
# are read for this probe.
selected_file = None
for candidate in excel_files:
    try:
        preview = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as exc:
        # Best-effort probe: an unreadable workbook is skipped, but the
        # reason is reported instead of being silently swallowed.
        print(f"Skipping {os.path.basename(candidate)}: {exc}")
        continue
    if "days_missed" in preview.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Map a days-missed count to a class index.

    Returns 0 when ``v <= 5``, 1 when ``v <= 15``, and 2 otherwise. A value
    for which both comparisons are false (such as NaN) therefore yields 2.
    """
    # Walk the inclusive upper bounds in ascending order; the position of the
    # first bound that holds is the class index.
    for class_index, upper_bound in enumerate((5, 15)):
        if v <= upper_bound:
            return class_index
    return 2

# Bucket the raw day counts into the three target classes.
y = df["days_missed"].apply(categorize_days)
# Predictors: everything except the target and the player identifier
# (errors="ignore" tolerates either column being absent).
X = df.drop(["days_missed", "player_id"], axis=1, errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime
# (unparseable entries become NaT) and replaced by the number of whole days
# since that column's earliest value.
for date_col in [name for name in X.columns if "date" in name.lower()]:
    parsed = pd.to_datetime(X[date_col], errors="coerce")
    X[date_col] = (parsed - parsed.min()).dt.days

# One-hot encode, dropping the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
all_features = list(X.columns)

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42,
)

# Median imputation followed by standardisation. The pipeline is fitted on
# the training split only and then reused unchanged for the test split.
prep = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
prep.fit(X_train)

# Transform both splits and re-wrap the resulting arrays as DataFrames
# labelled with the full training column list.
X_train, X_test = (
    pd.DataFrame(prep.transform(split), columns=all_features)
    for split in (X_train, X_test)
)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: rank features with a random forest and keep those whose importance
# exceeds 0.01; if none do, fall back to the five most important.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp.loc[rf_imp.gt(0.01)].index.tolist()
if not rf_features:
    rf_features = rf_imp.nlargest(5).index.tolist()

X_train, X_test = X_train[rf_features], X_test[rf_features]

# Stage 2: fit a cross-validated Lasso on the survivors and keep only the
# features with a non-zero coefficient; if all are zero, keep the stage-1 set.
lasso = LassoCV(cv=5).fit(X_train, y_train)

final_features = [
    name
    for name, coef in zip(X_train.columns, lasso.coef_)
    if np.abs(coef) > 0
]
if not final_features:
    final_features = rf_features

X_train, X_test = X_train[final_features], X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost classifier trained on the selected features.
# fit() hands back the estimator, so construction and training are chained.
model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    verbose=0,
).fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Score the held-out split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Persist the model, the fitted preprocessor and both feature lists to the
# current working directory.
for artifact, target in (
    (model, "catboost_model.joblib"),
    (prep, "preprocessor.joblib"),
    (final_features, "features.joblib"),
    (all_features, "all_features.joblib"),
):
    joblib.dump(artifact, target)

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED - NO ERRORS)
# ==================================================
# Class index -> text shown in the UI.
labels = dict(enumerate(
    ("Short-term Injury", "Medium-term Injury", "Long-term Injury")
))

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Relies on the module-level ``all_features``, ``final_features``,
    ``prep``, ``model`` and ``labels`` objects.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.

    Returns:
        The text from ``labels`` for the predicted class.
    """
    # Start from an all-missing row over every training column so the fitted
    # preprocessor receives the column layout it was fitted on; columns the
    # UI does not collect stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(zip(final_features, inputs))

    # Pin the column order explicitly and cast to float so an empty input
    # (which may arrive as None) becomes NaN and is imputed like any other
    # missing value.
    frame = pd.DataFrame([row], columns=all_features).astype(float)

    # Apply the fitted preprocessing, then keep only the model's features.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)
    selected = scaled[final_features]

    # predict() can return a nested array (one row per sample). Flatten it
    # before taking the first element so int() receives a scalar; calling
    # int() on a 1-element array is deprecated in recent NumPy releases.
    pred = np.asarray(model.predict(selected)).ravel()[0]
    return labels[int(pred)]

# One numeric field per selected feature, pre-filled with 0. Widget order
# follows final_features, which is the order predict_ui pairs values with.
inputs = [gr.Number(value=0, label=name) for name in final_features]

# Wire the prediction function to the inputs and a single text output.
ui = gr.Interface(
    predict_ui,
    inputs,
    gr.Textbox(label="Prediction"),
    title=" Duration Prediction",
)

ui.launch()

Upload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP (1).zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detec



In [1]:
# ==================================================
# INSTALL & IMPORT PACKAGES (BULLETPROOF)
# ==================================================
try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    !pip install -q catboost
    from catboost import CatBoostClassifier

try:
    import gradio as gr
except ModuleNotFoundError:
    !pip install -q gradio joblib openpyxl
    import gradio as gr

import pandas as pd
import numpy as np
import zipfile, os, joblib, warnings
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")

# ==================================================
# FILE UPLOAD
# ==================================================
print("Upload ZIP or Excel file")
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when the upload result
# is empty (e.g. the dialog was dismissed without choosing a file).
if not uploaded:
    raise ValueError("No file uploaded")

file_name = next(iter(uploaded))
base_dir = "/content/data"
os.makedirs(base_dir, exist_ok=True)

# Write the uploaded bytes under base_dir so they can be opened by path.
file_path = os.path.join(base_dir, file_name)
with open(file_path, "wb") as f:
    f.write(uploaded[file_name])

# ==================================================
# EXTRACT ZIP & FIND DATASET
# ==================================================
excel_files = []

if file_name.lower().endswith(".zip"):
    # Unpack into a fresh directory so the search below only sees this
    # archive's contents, not files left in base_dir by earlier uploads.
    import tempfile

    extract_dir = tempfile.mkdtemp(prefix="extracted_", dir=base_dir)
    with zipfile.ZipFile(file_path, "r") as z:
        z.extractall(extract_dir)
    for root, _, files_ in os.walk(extract_dir):
        excel_files.extend(
            os.path.join(root, name)
            for name in files_
            if name.lower().endswith((".xlsx", ".xls"))
        )
else:
    excel_files.append(file_path)

if not excel_files:
    raise ValueError("No Excel files found")

# Pick the first workbook whose columns include the target; only a few rows
# are read for this probe.
selected_file = None
for candidate in excel_files:
    try:
        preview = pd.read_excel(candidate, nrows=5, engine="openpyxl")
    except Exception as exc:
        # Best-effort probe: an unreadable workbook is skipped, but the
        # reason is reported instead of being silently swallowed.
        print(f"Skipping {os.path.basename(candidate)}: {exc}")
        continue
    if "days_missed" in preview.columns:
        selected_file = candidate
        break

if selected_file is None:
    raise ValueError("No file with 'days_missed' column found")

print("Using dataset:", os.path.basename(selected_file))
df = pd.read_excel(selected_file, engine="openpyxl")

# ==================================================
# TARGET CREATION
# ==================================================
def categorize_days(v):
    """Map a days-missed count to a class index.

    Returns 0 when ``v <= 5``, 1 when ``v <= 15``, and 2 otherwise. A value
    for which both comparisons are false (such as NaN) therefore yields 2.
    """
    # Single expression; the comparisons are evaluated in the same order as
    # the thresholds, lowest first.
    return 0 if v <= 5 else (1 if v <= 15 else 2)

# Bucket the raw day counts into the three target classes.
y = df["days_missed"].apply(categorize_days)
# Predictors: everything except the target and the player identifier
# (errors="ignore" tolerates either column being absent).
X = df.drop(["days_missed", "player_id"], axis=1, errors="ignore")

# ==================================================
# FEATURE ENGINEERING
# ==================================================
# Every column whose name contains "date" is parsed as a datetime
# (unparseable entries become NaT) and replaced by the number of whole days
# since that column's earliest value.
for date_col in [name for name in X.columns if "date" in name.lower()]:
    parsed = pd.to_datetime(X[date_col], errors="coerce")
    X[date_col] = (parsed - parsed.min()).dt.days

# One-hot encode, dropping the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# SAVE ALL FEATURES (IMPORTANT FOR GRADIO)
all_features = list(X.columns)

# ==================================================
# SPLIT & PREPROCESS
# ==================================================
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42,
)

# Median imputation followed by standardisation. The pipeline is fitted on
# the training split only and then reused unchanged for the test split.
prep = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
prep.fit(X_train)

# Transform both splits and re-wrap the resulting arrays as DataFrames
# labelled with the full training column list.
X_train, X_test = (
    pd.DataFrame(prep.transform(split), columns=all_features)
    for split in (X_train, X_test)
)

# ==================================================
# FEATURE SELECTION
# ==================================================
# Stage 1: rank features with a random forest and keep those whose importance
# exceeds 0.01; if none do, fall back to the five most important.
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

rf_imp = pd.Series(rf.feature_importances_, index=all_features)
rf_features = rf_imp.loc[rf_imp.gt(0.01)].index.tolist()
if not rf_features:
    rf_features = rf_imp.nlargest(5).index.tolist()

X_train, X_test = X_train[rf_features], X_test[rf_features]

# Stage 2: fit a cross-validated Lasso on the survivors and keep only the
# features with a non-zero coefficient; if all are zero, keep the stage-1 set.
lasso = LassoCV(cv=5).fit(X_train, y_train)

final_features = [
    name
    for name, coef in zip(X_train.columns, lasso.coef_)
    if np.abs(coef) > 0
]
if not final_features:
    final_features = rf_features

X_train, X_test = X_train[final_features], X_test[final_features]

print("Final features:", final_features)

# ==================================================
# CATBOOST MODEL
# ==================================================
# Multi-class CatBoost classifier trained on the selected features.
# fit() hands back the estimator, so construction and training are chained.
model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    verbose=0,
).fit(X_train, y_train)

# ==================================================
# EVALUATION
# ==================================================
# Score the held-out split.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# ==================================================
# SAVE ARTIFACTS
# ==================================================
# Persist the model, the fitted preprocessor and both feature lists to the
# current working directory.
for artifact, target in (
    (model, "catboost_model.joblib"),
    (prep, "preprocessor.joblib"),
    (final_features, "features.joblib"),
    (all_features, "all_features.joblib"),
):
    joblib.dump(artifact, target)

print(" Model & artifacts saved")

# ==================================================
# GRADIO UI (FIXED - NO ERRORS)
# ==================================================
# Class index -> text shown in the UI.
labels = dict(enumerate(
    ("Short-term Injury", "Medium-term Injury", "Long-term Injury")
))

def predict_ui(*inputs):
    """Predict the injury-duration label for one set of UI inputs.

    Relies on the module-level ``all_features``, ``final_features``,
    ``prep``, ``model`` and ``labels`` objects.

    Args:
        *inputs: One value per entry of ``final_features``, in that order.

    Returns:
        The text from ``labels`` for the predicted class.
    """
    # Start from an all-missing row over every training column so the fitted
    # preprocessor receives the column layout it was fitted on; columns the
    # UI does not collect stay NaN and are filled by the imputer.
    row = dict.fromkeys(all_features, np.nan)
    row.update(zip(final_features, inputs))

    # Pin the column order explicitly and cast to float so an empty input
    # (which may arrive as None) becomes NaN and is imputed like any other
    # missing value.
    frame = pd.DataFrame([row], columns=all_features).astype(float)

    # Apply the fitted preprocessing, then keep only the model's features.
    scaled = pd.DataFrame(prep.transform(frame), columns=all_features)
    selected = scaled[final_features]

    # predict() can return a nested array (one row per sample). Flatten it
    # before taking the first element so int() receives a scalar; calling
    # int() on a 1-element array is deprecated in recent NumPy releases.
    pred = np.asarray(model.predict(selected)).ravel()[0]
    return labels[int(pred)]

# One numeric field per selected feature, pre-filled with 0. Widget order
# follows final_features, which is the order predict_ui pairs values with.
inputs = [gr.Number(value=0, label=name) for name in final_features]

# Wire the prediction function to the inputs and a single text output.
ui = gr.Interface(
    predict_ui,
    inputs,
    gr.Textbox(label="Prediction"),
    title=" Duration Prediction",
)

ui.launch()

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hUpload ZIP or Excel file


Saving U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip to U=10(main)-Save the model,Build backend,use joblead and built,f-API-OP.zip
Using dataset: player_injuries_cleaned_final (2) (1) (1) (2) (3) (1) (1) (1) (1).xlsx
Final features: ['end_date', 'games_missed', 'injury_reason_Ill', 'injury_reason_Rest']
Accuracy: 0.7641866330390921
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      2716
           1       0.63      0.78      0.69      8020
           2       0.88      0.86      0.87     13054

    accuracy                           0.76     23790
   macro avg       0.71      0.64      0.65     23790
weighted avg       0.77      0.76      0.76     23790

 Model & artifacts saved
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected.

