In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, classification_report
import time

# Load the team-level table: drop the season-outcome columns listed below,
# then turn the playoff flag and the conference id into integer codes.
teams = (
    pd.read_csv("data_prepared/teams.csv")
    .drop(columns=['teams_firstRound', 'teams_semis', 'teams_finals', 'teams_rank', 'teams_post_W', 'teams_post_L'])
    .assign(
        teams_playoff=lambda frame: frame['teams_playoff'].map({'Y': 1, 'N': 0}),
        teams_confID=lambda frame: frame['teams_confID'].map({'EA': 0, 'WE': 1}),
    )
)


# Load the player/team table and apply exactly the same clean-up as above.
players = (
    pd.read_csv("data_prepared/players_teams.csv")
    .drop(columns=['teams_firstRound', 'teams_semis', 'teams_finals', 'teams_rank', 'teams_post_W', 'teams_post_L'])
    .assign(
        teams_playoff=lambda frame: frame['teams_playoff'].map({'Y': 1, 'N': 0}),
        teams_confID=lambda frame: frame['teams_confID'].map({'EA': 0, 'WE': 1}),
    )
)

def encode_categorical_columns(df):
    """Label-encode the object-dtype columns of ``df`` in place.

    Every column reported by ``select_dtypes(include=['object'])`` is replaced
    by the integer codes produced by ``LabelEncoder.fit_transform``, except
    ``teams_playoff`` and ``teams_confID``, which are always left untouched.

    The frame passed in is modified directly; the same object is returned
    for convenience.
    """
    untouched = ('teams_playoff', 'teams_confID')
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        if name not in untouched:
            # fit_transform refits the shared encoder, so each column gets
            # its own independent code table.
            df[name] = encoder.fit_transform(df[name])
    return df

# Label-encode the remaining object-dtype columns of both tables.
# encode_categorical_columns writes into the frame it receives, so the
# returned frames are not assigned anywhere.
encode_categorical_columns(teams)
encode_categorical_columns(players)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
import time

def second(data, year, n_playoff_teams=4):
    """Train a decision tree on earlier seasons and evaluate it on ``year``.

    Rows with ``players_teams_year < year`` form the training set and rows
    with ``players_teams_year == year`` form the test set. The per-row
    positive-class probabilities are averaged per
    (``players_teams_tmID``, ``players_teams_year``) group, the
    ``n_playoff_teams`` groups with the highest mean are labelled 1 and all
    others 0, and that group-level label is broadcast back to every test row
    before the metrics are computed.

    Args:
        data: DataFrame containing at least ``players_teams_year``,
            ``players_teams_tmID`` and the target column ``teams_playoff``.
            All remaining columns are used as features, so they must already
            be numeric.
        year: Value of ``players_teams_year`` to hold out as the test set.
        n_playoff_teams: How many groups to label as playoff teams
            (defaults to 4, the value that used to be hard-coded).

    Returns:
        dict with the keys ``time`` (training time in seconds),
        ``precision``, ``recall``, ``f1``, ``accuracy`` and ``auc``.
        The same values are also printed.

    Raises:
        ValueError: if there are no rows before ``year`` or no rows for
            ``year``.
    """
    team_keys = ['players_teams_tmID', 'players_teams_year']

    train = data[data['players_teams_year'] < year]
    # Explicit copy: a 'probability' column is added below and must not be
    # written into a filtered slice of the caller's frame.
    test = data[data['players_teams_year'] == year].copy()
    if train.empty or test.empty:
        raise ValueError(
            f"Cannot evaluate year {year}: {len(train)} training rows and "
            f"{len(test)} test rows found."
        )

    X_train = train.drop("teams_playoff", axis=1)
    Y_train = train["teams_playoff"]

    X_test = test.drop("teams_playoff", axis=1)
    Y_test = test["teams_playoff"]

    # Only the fit is timed, as before; perf_counter is the clock meant for
    # measuring elapsed intervals.
    start_time = time.perf_counter()
    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, Y_train)
    end_time = time.perf_counter()

    # Look the positive class up instead of assuming it sits in column 1:
    # if the training rows contain a single class, predict_proba has only
    # one column.
    classes = list(decision_tree.classes_)
    if 1 in classes:
        y_pred_proba = decision_tree.predict_proba(X_test)[:, classes.index(1)]
    else:
        y_pred_proba = [0.0] * len(X_test)
    test['probability'] = y_pred_proba

    team_probs = test.groupby(team_keys)['probability'].mean().reset_index()
    team_probs['predicted_playoff'] = 0
    top_teams = team_probs['probability'].nlargest(n_playoff_teams).index
    team_probs.loc[top_teams, 'predicted_playoff'] = 1

    # Broadcast the group-level results back to the test rows. Only the key
    # columns are taken from `test` so that the merged 'probability' is the
    # group mean; a left merge keeps the row order of `test`, which keeps
    # the result positionally aligned with Y_test.
    per_row = test[team_keys].merge(team_probs, on=team_keys, how='left')
    y_pred = per_row['predicted_playoff']
    y_score = per_row['probability']

    metrics = {
        'time': end_time - start_time,
        'precision': precision_score(Y_test, y_pred),
        'recall': recall_score(Y_test, y_pred),
        'f1': f1_score(Y_test, y_pred),
        'accuracy': accuracy_score(Y_test, y_pred),
        # AUC is a ranking metric, so it is computed from the continuous
        # group probability rather than from the thresholded 0/1 labels.
        'auc': roc_auc_score(Y_test, y_score),
    }

    print(f"Time: {metrics['time']:.2f} segundos")
    print(f"Precision: {metrics['precision']:.2f}")
    print(f"Recall: {metrics['recall']:.2f}")
    print(f"F1: {metrics['f1']:.2f}")
    print(f"Accuracy: {metrics['accuracy']:.2f}")
    print(f"AUC: {metrics['auc']:.2f}")

    # Optional diagnostics, currently disabled:
    # feature_importances_df = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
    # feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
    # feature_importances_df.plot(kind='bar', figsize=(10, 4))
    #
    # plt.figure(figsize=(8, 8))
    # plot_tree(decision_tree, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
    # plt.show()

    return metrics
    
def first(year):
    """Evaluate the model separately for each conference.

    The module-level ``players`` frame is split on ``teams_confID``
    (0, then 1) and each subset is passed to ``second`` together with
    ``year``; a banner is printed before each run.
    """
    banners = ("--- First Conference ---\n", "\n--- Second Conference ---\n")
    # Build both subsets before any evaluation starts.
    subsets = [players[players['teams_confID'] == code] for code in (0, 1)]
    for banner, conference_rows in zip(banners, subsets):
        print(banner)
        second(conference_rows, year)

# Run the per-conference evaluation with year 10 held out as the test set;
# rows from earlier years are used for training.
first(10)