In [None]:
import os
import numpy as np
import pandas as pd
import time
import optuna
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.calibration import LabelEncoder

# Seed configuration to guarantee reproducible results
SEED = 42
np.random.seed(SEED)

# Only the Optuna log level is changed here; the seed itself is passed
# explicitly wherever a sampler/estimator is constructed further down.
optuna.logging.set_verbosity(optuna.logging.WARNING)  # Avoid excessive Optuna log output

# Read the data
teams = pd.read_csv("data_prepared/teams.csv")
temp_teams = teams[teams['confID'] == "EA"]  # NOTE(review): not referenced again in the visible code
# Shared encoder used by encode_categorical_columns() and later to decode 'tmID'
label_encoder = LabelEncoder()
# Standardize every float/int column except 'year'. This runs BEFORE 'playoff'
# and 'confID' are mapped to integers below, so - presumably being text columns
# at this point - they are not picked up by select_dtypes and stay unscaled.
numerical_features = teams.select_dtypes(include=['float', 'int']).columns
numerical_features = numerical_features.drop('year')
scaler = StandardScaler()
teams[numerical_features] = scaler.fit_transform(teams[numerical_features])
# Map the target and the conference to integer codes
# (values outside these dictionaries become NaN).
teams['playoff'] = teams['playoff'].map({'Y': 1, 'N': 0})
teams['confID'] = teams['confID'].map({'EA': 0, 'WE': 1})

def encode_categorical_columns(df):
    """Label-encode the object-dtype columns of *df* in place.

    'playoff' and 'confID' are skipped. Every other object column is replaced
    by the integer codes produced by the module-level ``label_encoder``.

    The shared encoder is refit for each column, so once this function
    returns it only remembers the mapping of the LAST column it processed.
    ``run_model`` later calls ``label_encoder.inverse_transform`` on 'tmID',
    so 'tmID' is deliberately encoded last; otherwise the decoded team IDs
    would depend on the column order of the input frame.

    Args:
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    skipped = ('playoff', 'confID')
    columns = [
        col for col in df.select_dtypes(include=['object']).columns
        if col not in skipped
    ]
    # Stable sort: keeps the original order but moves 'tmID' to the end.
    columns.sort(key=lambda col: col == 'tmID')
    for col in columns:
        df[col] = label_encoder.fit_transform(df[col])
    return df

# Label-encode the text columns (done in place on `teams`), then replace
# every remaining missing value with 0.
teams = encode_categorical_columns(teams).fillna(0)

# Normalize predictions
def normalize_predictions(predictions):
    """Min-max scale *predictions* so the smallest value maps to 0 and the largest to 1.

    When every value is identical the range is zero; instead of dividing by
    zero (which yields NaN), the result is all zeros.

    Args:
        predictions: Non-empty array-like of numeric scores.

    Returns:
        The rescaled values, produced by the same element-wise arithmetic as
        before (so array / Series inputs keep their container type).
    """
    lowest = np.min(predictions)
    span = np.max(predictions) - lowest
    if span == 0:
        # Constant input: divide by 1 so the result is 0 everywhere, not NaN.
        span = 1.0
    return (predictions - lowest) / span

# Calculate error
def get_error(pred_proba, label_playoff):
    """Return the sum of absolute differences between predictions and labels.

    The two iterables are walked in lockstep; iteration stops at the end of
    the shorter one.
    """
    total_error = 0
    for predicted, actual in zip(pred_proba, label_playoff):
        total_error = total_error + abs(predicted - actual)
    return total_error

from imblearn.over_sampling import SMOTE

def get_train_and_test_data(data, year):
    """Split *data* into an oversampled training set and one held-out season.

    Args:
        data: DataFrame containing at least the 'year' and 'playoff' columns.
        year: Season used as the test set.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)``. The training part holds every
        season other than *year*, always excluding season 11, and has been
        resampled with SMOTE; the test part is left untouched. The 'year'
        column is removed from both feature sets.
    """
    season = data['year']
    other_seasons = (season < year) | (season > year)
    not_season_11 = season != 11

    train = data[other_seasons & not_season_11].drop(columns="year")
    test = data[season == year].drop(columns="year")

    X_train = train.drop(columns="playoff")
    Y_train = train["playoff"]
    X_test = test.drop(columns="playoff")
    Y_test = test["playoff"]

    # Oversampling is applied to the training data only.
    oversampler = SMOTE(random_state=SEED)
    X_train, Y_train = oversampler.fit_resample(X_train, Y_train)
    return X_train, Y_train, X_test, Y_test


# Select best features using RFECV
def select_best_features(X_train, Y_train, feature_list, n_features):
    """Pick features by recursive feature elimination with cross-validation.

    A linear-kernel SVC is used as the ranking estimator, with 3-fold CV and
    accuracy as the selection score.

    Args:
        X_train: Training features; must contain every name in *feature_list*.
        Y_train: Training labels.
        feature_list: Candidate feature names.
        n_features: Minimum number of features RFECV has to keep.

    Returns:
        The selected feature names followed by 'tmID'. 'tmID' is always
        appended (callers need it to identify the teams) and appears exactly
        once, even when it is also part of *feature_list*.
    """
    svc = SVC(kernel="linear", random_state=SEED)
    selector = RFECV(estimator=svc, step=1, min_features_to_select=n_features, cv=3, scoring='accuracy')
    selector.fit(X_train[feature_list], Y_train)
    # support_ is a boolean mask aligned with feature_list.
    selected_features = [
        feature
        for feature, keep in zip(feature_list, selector.support_)
        if keep and feature != 'tmID'
    ]
    return selected_features + ['tmID']

# Run model pipeline
def run_model(model, data, year, number, only_df=False, feature_list=None, n_features=10):
    """Select features, fit *model* and predict the playoff probability for one season.

    Args:
        model: Classifier exposing ``fit`` and ``predict_proba``.
        data: Prepared DataFrame (needs 'year', 'playoff' and 'tmID').
        year: Season held out as the test set.
        number: How many of the highest-probability test rows are labelled
            as positive (1) in the binary prediction. A value <= 0 labels
            no row as positive.
        only_df: When true, return only the prediction DataFrame.
        feature_list: Candidate features for RFECV; a built-in default list
            is used when omitted (or empty).
        n_features: Minimum number of features RFECV has to keep.

    Returns:
        ``prediction_df`` when *only_df* is true, otherwise
        ``(prediction_df, metrics)`` where *metrics* also contains the elapsed
        "time" and the summed absolute "error".

    Note:
        Team IDs are decoded with the module-level ``label_encoder``, which
        therefore has to hold the 'tmID' mapping when this function runs.
    """
    feature_list = feature_list or [
        'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm', 'o_3pa', 'o_oreb', 'o_dreb', 'o_asts',
        'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm',
        'd_3pa', 'd_oreb', 'd_dreb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_pts', 'won',
        'lost', 'homeW', 'homeL', 'awayW', 'awayL', 'confW', 'confL', 'min', 'wonPost', 'lostPost',
        'wonPointsPost', 'lostPointsPost', 'awards_players', 'awards_coaches', 'offensive_efficiency',
        'defensive_efficiency', 'play_percent', 'factors4', 'possession', 'opponent_possession',
        'avg_pie', 'avg_per'
    ]
    X_train, Y_train, X_test, Y_test = get_train_and_test_data(data, year)
    selected_features = select_best_features(X_train, Y_train, feature_list, n_features)
    X_train, X_test = X_train[selected_features], X_test[selected_features]

    start_timer = time.time()
    model.fit(X_train, Y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    error = get_error(y_pred_proba, Y_test)

    y_pred = np.zeros_like(y_pred_proba)
    # Guard against number <= 0: a slice such as [-0:] would select EVERY row.
    if number > 0:
        top_ranked = np.argsort(y_pred_proba)[-number:]
        y_pred[top_ranked] = 1
    stop_timer = time.time()

    # Build the frame directly with the decoded team IDs, keeping the
    # original row index of the test set.
    prediction_df = pd.DataFrame(
        {
            'tmID': label_encoder.inverse_transform(X_test['tmID']),
            'Playoff': y_pred_proba,
        },
        index=X_test.index,
    )

    if only_df:
        return prediction_df
    prediction_df['Playoff_Binary'] = y_pred
    prediction_df['Playoff_Labeled'] = Y_test.values

    time_elapsed = stop_timer - start_timer
    metrics = calculate_metrics(Y_test, y_pred, y_pred_proba)
    metrics.update({"time": time_elapsed, "error": error})
    return prediction_df, metrics

# Metrics calculation
def calculate_metrics(y_true, y_pred, y_proba):
    """Return a dict with precision, recall, f1, accuracy and ROC AUC.

    The first four scores compare *y_true* with the hard labels *y_pred*;
    "auc" compares *y_true* with the predicted probabilities *y_proba*.
    """
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
        ("accuracy", accuracy_score),
    )
    metrics = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}
    metrics["auc"] = roc_auc_score(y_true, y_proba)
    return metrics

# Run model by conferences
def run_model_by_conferences(model, year, data):
    """Run the pipeline separately for conference 0 and conference 1.

    For season 11 the function works in prediction mode: it returns a
    DataFrame of team IDs and playoff probabilities, rescaled so that the
    probabilities add up to 8 and rounded to two decimals (the rounded sum
    is printed). For any other season it evaluates the model, prints and
    returns the per-metric mean over both conferences.

    In both modes the 4 highest-ranked teams of each conference are the
    ones labelled as playoff teams by ``run_model``.
    """
    if year == 11:
        conference_frames = [
            run_model(model, data[data['confID'] == conf_id], year, 4, True)
            for conf_id in (0, 1)
        ]
        pred_df = pd.concat(conference_frames)
        # replace team that has different 'tmID' and 'franchID'
        pred_df['tmID'] = pred_df['tmID'].replace("DET", "TUL")
        pred_df = pred_df.sort_values('tmID')
        # Rescale so the probabilities of all teams sum to 8.
        scale = 8 / pred_df['Playoff'].sum()
        pred_df['Playoff'] = (pred_df['Playoff'] * scale).round(2)
        print(pred_df['Playoff'].sum())
        return pred_df

    conference_metrics = []
    for conf_id in (0, 1):
        _, conf_stats = run_model(model, data[data['confID'] == conf_id], year, 4)
        conference_metrics.append(conf_stats)

    combined_stats = {}
    for metric_name in conference_metrics[0]:
        combined_stats[metric_name] = np.mean(
            [conf_stats[metric_name] for conf_stats in conference_metrics]
        )
    print(f"Metrics: {combined_stats}")
    return combined_stats

# Hyperparameter tuning
def svm_objective(trial):
    """Optuna objective: mean cross-validated error of an SVC configuration.

    Samples C, kernel, gamma, degree, coef0 and class_weight from *trial*
    (gamma, degree and coef0 only for the kernels that use them). For each
    held-out season 2..10 the remaining training data is scored with
    3-fold stratified cross-validation on accuracy.

    Args:
        trial: The Optuna trial supplying the hyperparameter suggestions.

    Returns:
        ``1 - mean accuracy`` over all seasons, i.e. a value to minimize.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    C = trial.suggest_float('C', 1e-5, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_float('gamma', 1e-5, 1e1, log=True) if kernel in ['rbf', 'poly', 'sigmoid'] else 'scale'
    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    coef0 = trial.suggest_float('coef0', -1, 1) if kernel in ['poly', 'sigmoid'] else 0
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    # Neither the estimator nor the splitter depends on the season, so build
    # them once; cross_val_score clones the estimator for every fold.
    model = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree, coef0=coef0, class_weight=class_weight, random_state=SEED)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    scores = []
    for held_out_year in range(2, 11):
        # NOTE(review): the training data returned here is already
        # oversampled, i.e. resampling happens before the CV split.
        X_train, Y_train, _, _ = get_train_and_test_data(teams, held_out_year)
        fold_scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='accuracy')
        scores.append(fold_scores.mean())

    return 1 - np.mean(scores)

# Main Execution
# Search the SVC hyperparameters with a seeded TPE sampler.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED))
study.optimize(svm_objective, n_trials=50)
best_params = study.best_params

# probability=True is needed because run_model() calls predict_proba().
final_model = SVC(**best_params, random_state=SEED, probability=True)

# Run the season-11 prediction exactly once: the pipeline (feature selection,
# fit, predict) is expensive and previously ran twice with the first result
# thrown away.
BEST_MODELS = {"svm": run_model_by_conferences(final_model, 11, teams)}

# Write one CSV per model into data_prediction/.
os.makedirs('data_prediction', exist_ok=True)
for model_name, predictions in BEST_MODELS.items():
    predictions.to_csv(os.path.join('data_prediction', f'{model_name}.csv'), index=False)