In [None]:

# 1. IMPORTS
import pandas as pd
import numpy as np
import zipfile, os, shutil
from google.colab import files
import ipywidgets as widgets
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# 2. FILE UPLOAD
# Ask for an upload, stage the uploaded workbook(s) in a clean scratch
# directory, and show a dropdown listing every Excel file that was found.
print("Upload ZIP or Excel file…")
uploaded = files.upload()
if not uploaded:
    # An empty result would otherwise fail below with an opaque IndexError.
    raise ValueError("No file was uploaded.")
file_name = next(iter(uploaded))

extract_dir = "/content/extracted_excel"
# Start from an empty directory so files from earlier runs never leak in.
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

excel_extensions = (".xlsx", ".xls")
excel_files = []
# Compare extensions case-insensitively so "DATA.ZIP" / "Book.XLSX" are accepted.
if file_name.lower().endswith(".zip"):
    with zipfile.ZipFile(file_name, 'r') as z:
        z.extractall(extract_dir)
    # The loop variable is `filenames`, not `files`, so the google.colab
    # `files` module imported above is not rebound to a plain list.
    for root, _, filenames in os.walk(extract_dir):
        excel_files.extend(
            os.path.join(root, f)
            for f in filenames
            if f.lower().endswith(excel_extensions)
        )
else:
    # A non-zip upload is assumed to be the workbook itself and is moved as-is.
    new_path = os.path.join(extract_dir, file_name)
    shutil.move(file_name, new_path)
    excel_files.append(new_path)

if not excel_files:
    # Fail here with a clear message instead of building an empty dropdown.
    raise ValueError(f"No Excel files (.xlsx/.xls) found in '{file_name}'.")

dropdown = widgets.Dropdown(options=excel_files, description="Choose File:")
display(dropdown)
print("Select file and RUN AGAIN")

# 3. LOAD DATA
# Read the workbook currently selected in the dropdown.
if dropdown.value is None:
    raise ValueError("No Excel file is selected in the dropdown.")
df = pd.read_excel(dropdown.value)
print("Columns:", list(df.columns))

# 4. SELECT TARGET (NUMERIC OR CATEGORICAL)
# Recommended targets: days_missed or games_missed
# Identifier and date columns are excluded from the suggestions.
non_target_columns = {'player_id', 'from_date', 'end_date'}
target_candidates = [c for c in df.columns if c not in non_target_columns]
print("Recommended target columns:", target_candidates)

# For this example, we choose 'days_missed'
target_column = 'days_missed'
if target_column not in df.columns:
    # Without this check the failure surfaces later as a bare KeyError.
    raise ValueError(
        f"Target column '{target_column}' not found. "
        f"Available columns: {list(df.columns)}"
    )
print("Selected target:", target_column)

# 5. TARGET CLASSIFICATION (3 classes)
def categorize_days(v):
    """Map a number of days to an ordinal severity class.

    Returns 0 (short-term) for values up to 5, 1 (medium-term) for values
    up to 15, and 2 (long-term) for everything else. A NaN input fails both
    comparisons and therefore also lands in class 2.
    """
    if v <= 5:
        return 0  # short-term
    if v <= 15:
        return 1  # medium-term
    return 2  # long-term

# Rows whose target is missing or non-numeric cannot be labelled. Passing NaN
# to categorize_days would silently put them in class 2 (long-term), so they
# are removed from df here; the features built from df later stay aligned.
target_values = pd.to_numeric(df[target_column], errors='coerce')
missing_target = target_values.isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} rows with missing '{target_column}'.")
    df = df.loc[~missing_target].copy()
    target_values = target_values.loc[~missing_target]

# Ordinal class label (0/1/2) for every remaining row.
y = target_values.apply(categorize_days)

if y.nunique() < 2:
    raise ValueError(f"Target '{target_column}' has less than 2 classes.")

# 6. FEATURE CLEANING
# Build the feature matrix: everything except the target and the player id.
# NOTE(review): with 'days_missed' as target, 'games_missed' and the
# from_date/end_date pair remain in X and may leak the target — confirm that
# keeping them is intended.
X = df.drop(columns=[target_column, 'player_id'], errors='ignore')

# Convert date columns to "days since the earliest value in that column".
for c in X.columns:
    # str() guards against non-string headers (e.g. a numeric Excel header).
    if 'date' not in str(c).lower():
        continue
    col = X[c]
    # The name test is a substring match ("candidate_id", "updated_count", ...);
    # numeric columns are left alone because to_datetime would reinterpret
    # their values as epoch offsets.
    if pd.api.types.is_numeric_dtype(col):
        continue
    parsed = pd.to_datetime(col, errors='coerce')
    if parsed.isna().all() and col.notna().any():
        # Nothing parsed although values exist: a text column whose name merely
        # contains "date". Keep it so it is one-hot encoded below.
        continue
    X[c] = (parsed - parsed.min()).dt.days

# Encode categorical columns
X = pd.get_dummies(X, drop_first=True)

if X.shape[1] == 0:
    raise ValueError("No usable features found after preprocessing.")

# 7. SPLIT + SCALE
# Stratified 80/20 split, then median imputation and standardisation fitted on
# the training split only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# SimpleImputer discards columns that have no observed value in the data it is
# fitted on, which would leave fewer output columns than names and make the
# DataFrame construction below fail. Remove such columns explicitly first.
empty_cols = X_train.columns[X_train.isna().all()]
if len(empty_cols) > 0:
    print("Dropping features with no observed values:", list(empty_cols))
    X_train = X_train.drop(columns=empty_cols)
    X_test = X_test.drop(columns=empty_cols)
if X_train.shape[1] == 0:
    raise ValueError("No usable features found after preprocessing.")

prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# The frames are rebuilt with a fresh 0..n-1 index on purpose: the export step
# at the end of the script pairs these rows with y_train by position.
feature_names = X_train.columns
X_train = pd.DataFrame(prep.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(prep.transform(X_test), columns=feature_names)

# 8. RANDOM FOREST FEATURE SELECTION
# Rank features by random-forest importance and keep those above 0.01.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

rf_imp = pd.Series(rf.feature_importances_, index=X_train.columns)
above_threshold = rf_imp[rf_imp > 0.01]
rf_features = above_threshold.sort_values(ascending=False).index.tolist()
if not rf_features:
    # Nothing cleared the threshold: fall back to the three highest-ranked features.
    ranked_all = rf_imp.sort_values(ascending=False)
    rf_features = ranked_all.head(3).index.tolist()

X_train_rf, X_test_rf = X_train[rf_features], X_test[rf_features]
print("Features after RF:", len(rf_features))

# 9. LASSO FEATURE SELECTION
# Fit a cross-validated Lasso on the class labels and keep the features whose
# coefficient is non-zero.
lasso = LassoCV(cv=5, random_state=42, max_iter=5000)
lasso.fit(X_train_rf, y_train)

lasso_imp = pd.Series(np.abs(lasso.coef_), index=X_train_rf.columns)
nonzero_coefs = lasso_imp[lasso_imp > 0]
lasso_features = nonzero_coefs.sort_values(ascending=False).index.tolist()
if not lasso_features:
    # Every coefficient was shrunk to zero: keep the RF-selected set unchanged.
    lasso_features = X_train_rf.columns.tolist()

X_train_final, X_test_final = X_train_rf[lasso_features], X_test_rf[lasso_features]
print("Features after Lasso:", len(lasso_features))

# 10. FINAL MODEL (CATBOOST)
# Train CatBoost on the selected features and predict the held-out split.
catboost_params = {
    "iterations": 600,
    "depth": 8,
    "learning_rate": 0.05,
    "verbose": 0,
}
final_model = CatBoostClassifier(**catboost_params)
final_model.fit(X_train_final, y_train)
preds = final_model.predict(X_test_final)

# 11. RESULTS
# Print overall accuracy and the per-class precision/recall report.
class_labels = [0, 1, 2]
class_names = ['Short-term', 'Medium-term', 'Long-term']

acc = accuracy_score(y_test, preds)
print("\n==============================")
print("FINAL ACCURACY:", round(acc*100,2), "%")
print("==============================")
# Passing `labels` explicitly keeps the three names aligned with classes 0/1/2
# even when one class is absent from the data; without it the report raises
# because the number of names would not match the number of observed classes.
# zero_division=0 reports 0 for a class with no samples or no predictions.
print(classification_report(
    y_test,
    preds,
    labels=class_labels,
    target_names=class_names,
    zero_division=0,
))


# 12. CREATE REDUCED DATASET
# Write the selected training features together with their class labels to CSV.
# The label index is reset so it lines up with the positional feature index.
train_labels = y_train.reset_index(drop=True)
reduced_df = pd.concat([X_train_final, train_labels], axis=1)
reduced_df.to_csv("/content/reduced_dataset.csv", index=False)
print("Reduced dataset saved as /content/reduced_dataset.csv")


Upload ZIP or Excel file…


Saving U-7=extract imp feature,built new data set,XGboost,catboost,drop the features not required-OP.zip to U-7=extract imp feature,built new data set,XGboost,catboost,drop the features not required-OP (4).zip


Dropdown(description='Choose File:', options=('/content/extracted_excel/player_injuries_cleaned_final (2) (1) …

Select file and RUN AGAIN
Columns: ['player_id', 'season_name', 'injury_reason', 'from_date', 'end_date', 'days_missed', 'games_missed']
Recommended target columns: ['season_name', 'injury_reason', 'days_missed', 'games_missed']
Selected target: days_missed
Features after RF: 5
Features after Lasso: 4

FINAL ACCURACY: 76.56 %
              precision    recall  f1-score   support

  Short-term       0.65      0.28      0.39      2716
 Medium-term       0.63      0.78      0.70      8020
   Long-term       0.89      0.86      0.87     13054

    accuracy                           0.77     23790
   macro avg       0.72      0.64      0.65     23790
weighted avg       0.77      0.77      0.76     23790

Reduced dataset saved as /content/reduced_dataset.csv
