# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR <3 algorithms, also understanding the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, classification_report
import time

# Load the prepared team table and discard the three playoff-round columns.
teams = pd.read_csv("data_prepared/teams.csv").drop(
    columns=['teams_firstRound', 'teams_semis', 'teams_finals']
)

# Recode the target ('Y'/'N') and the conference ID ('EA'/'WE') as integers.
# Series.map turns any value that is not a key of the mapping into NaN.
playoff_codes = {'Y': 1, 'N': 0}
conference_codes = {'EA': 0, 'WE': 1}
teams['teams_playoff'] = teams['teams_playoff'].map(playoff_codes)
teams['teams_confID'] = teams['teams_confID'].map(conference_codes)

def encode_categorical_columns(df):
    """Label-encode the object-dtype columns of ``df`` in place.

    Columns named ``teams_playoff`` and ``teams_confID`` are left untouched
    even when they have object dtype. Every other object column is replaced
    by the integer codes returned by ``LabelEncoder.fit_transform``.

    Args:
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    skipped = ('teams_playoff', 'teams_confID')
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        if name not in skipped:
            # fit_transform refits the shared encoder for each column.
            df[name] = encoder.fit_transform(df[name])
    return df

# Encode the remaining text columns; the function mutates ``teams`` in place,
# so its return value is not needed here.
encode_categorical_columns(teams)

# Replace every missing value with 0.
teams = teams.fillna(0)

# Report rows that still contain NaN. Because this runs after fillna(0),
# the selection is empty and only the column headers are printed.
linhas_com_nan = teams.loc[teams.isna().any(axis=1)]
print(linhas_com_nan[['teams_year','teams_tmID','teams_confID','teams_playoff']])

## 1 - Algorithms

### 1.1 - Decision Tree

In [None]:
def algorithm(data, year, number):
    """Train a decision tree on earlier seasons and evaluate it on one season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. The positive-class probabilities
    of the test rows are ranked and the ``number`` highest-scoring rows are
    labelled 1 (all others 0). Elapsed time and classification metrics are
    printed, then the feature importances and the fitted tree are plotted.

    Args:
        data: DataFrame holding ``teams_year``, the binary target
            ``teams_playoff`` and feature columns the tree can consume.
        year: Value of ``teams_year`` used as the held-out test season.
        number: How many test rows to label as positive.

    Returns:
        None. All results are printed or plotted.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    # NOTE(review): unlike the other model functions in this file,
    # ``teams_year`` is kept as a feature here — confirm that is intended.
    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]

    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    # perf_counter is a monotonic clock meant for measuring intervals.
    start_time = time.perf_counter()

    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, Y_train)

    y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)
    if number > 0:
        # Guard: with number == 0 the slice [-0:] would select every row.
        top_indices = np.argsort(y_pred_proba)[-number:]
        y_pred[top_indices] = 1

    end_time = time.perf_counter()

    print(f"Time: {(end_time - start_time):.2f}")
    print(f"Precision: {precision_score(Y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(Y_test, y_pred):.2f}")
    print(f"F1: {f1_score(Y_test, y_pred):.2f}")
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")
    # AUC is a ranking metric, so it is computed from the probability scores
    # rather than from the thresholded 0/1 labels.
    print(f"AUC: {roc_auc_score(Y_test, y_pred_proba):.2f}")

    feature_importances_df = pd.DataFrame(
        decision_tree.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
    feature_importances_df.plot(kind='bar', figsize=(10, 4))

    plt.figure(figsize=(8, 8))
    plot_tree(decision_tree, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
    plt.show()

# Decision-tree runs: one per conference (teams_confID 0 and 1), then one on
# the whole table.
print("--- First Conference ---\n")
algorithm(data=teams[teams['teams_confID'].eq(0)], year=10, number=4)

print("\n--- Second Conference ---\n")
algorithm(data=teams[teams['teams_confID'].eq(1)], year=10, number=4)

# NOTE(review): this run tests on year 9 while the per-conference runs above
# use year 10 — confirm the difference is intentional.
print("\n--- All Conferences ---\n")
algorithm(data=teams, year=9, number=8)

### 1.2 - Support Vector Machine

In [None]:
def algorithm_svm(data, year, numberSeclected):
    """Train a support vector classifier on earlier seasons and report results.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. ``teams_year`` and the target
    ``teams_playoff`` are removed from the features, the features are
    standardised, and the classification report and accuracy of the SVC's
    predictions on the test season are printed.

    Args:
        data: DataFrame holding ``teams_year``, the binary target
            ``teams_playoff`` and numeric feature columns.
        year: Value of ``teams_year`` used as the held-out test season.
        numberSeclected: Currently unused; labels come directly from
            ``SVC.predict``.

    Returns:
        None. Results are printed.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    # Build the feature matrices with a non-mutating drop instead of dropping
    # in place on the filtered frames.
    non_features = ["teams_year", "teams_playoff"]
    X_train = train.drop(columns=non_features)
    Y_train = train["teams_playoff"]

    X_test = test.drop(columns=non_features)
    Y_test = test["teams_playoff"]

    # SVMs are not scale invariant, so standardise the features. The scaler is
    # fitted on the training rows only to keep test statistics out of the fit.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm = SVC(probability=True, random_state=42)
    svm.fit(X_train_scaled, Y_train)
    y_pred = svm.predict(X_test_scaled)

    print(f"\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

# Run the Support Vector Machine experiment: once per conference
# (teams_confID 0 and 1), then once on the whole table.
algorithm_svm(data=teams[teams['teams_confID'].eq(0)], year=10, numberSeclected=4)
algorithm_svm(data=teams[teams['teams_confID'].eq(1)], year=10, numberSeclected=4)
algorithm_svm(data=teams, year=10, numberSeclected=8)


### 1.3 - Gradient Boosting

In [None]:
def algorithm_gradient_boosting(data, year, numberSeclected):
    """Train a gradient-boosting classifier on earlier seasons and report results.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. ``teams_year`` and the target
    ``teams_playoff`` are removed from the features. The classification
    report and accuracy on the test season are printed and the feature
    importances are drawn as a bar chart.

    Args:
        data: DataFrame holding ``teams_year``, the binary target
            ``teams_playoff`` and numeric feature columns.
        year: Value of ``teams_year`` used as the held-out test season.
        numberSeclected: Currently unused; labels come directly from
            ``GradientBoostingClassifier.predict``.

    Returns:
        None. Results are printed and plotted.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    # Build the feature matrices with a non-mutating drop instead of dropping
    # in place on the filtered frames.
    non_features = ["teams_year", "teams_playoff"]
    X_train = train.drop(columns=non_features)
    Y_train = train["teams_playoff"]

    X_test = test.drop(columns=non_features)
    Y_test = test["teams_playoff"]

    gradient_boosting = GradientBoostingClassifier(random_state=42)
    gradient_boosting.fit(X_train, Y_train)
    y_pred = gradient_boosting.predict(X_test)

    print(f"\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

    feature_importances_df = pd.DataFrame(
        gradient_boosting.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
    feature_importances_df.plot(kind='bar', figsize=(12, 12))

# Run the Gradient Boosting experiment: once per conference
# (teams_confID 0 and 1), then once on the whole table.
algorithm_gradient_boosting(data=teams[teams['teams_confID'].eq(0)], year=10, numberSeclected=4)
algorithm_gradient_boosting(data=teams[teams['teams_confID'].eq(1)], year=10, numberSeclected=4)
algorithm_gradient_boosting(data=teams, year=10, numberSeclected=8)


### 1.4 - K Nearest Neighbors

In [None]:
def algorithm_knn(data, year, numberSeclected):
    """Train a k-nearest-neighbours classifier on earlier seasons and report results.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. ``teams_year`` and the target
    ``teams_playoff`` are removed from the features, the features are
    standardised, and the classification report and accuracy of the
    predictions on the test season are printed.

    Args:
        data: DataFrame holding ``teams_year``, the binary target
            ``teams_playoff`` and numeric feature columns.
        year: Value of ``teams_year`` used as the held-out test season.
        numberSeclected: Currently unused; labels come directly from
            ``KNeighborsClassifier.predict``.

    Returns:
        None. Results are printed.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    # Build the feature matrices with a non-mutating drop instead of dropping
    # in place on the filtered frames.
    non_features = ["teams_year", "teams_playoff"]
    X_train = train.drop(columns=non_features)
    Y_train = train["teams_playoff"]

    X_test = test.drop(columns=non_features)
    Y_test = test["teams_playoff"]

    # KNN compares rows by distance, so features with large ranges would
    # dominate without standardisation. The scaler is fitted on the training
    # rows only to keep test statistics out of the fit.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    knn = KNeighborsClassifier()
    knn.fit(X_train_scaled, Y_train)

    y_pred = knn.predict(X_test_scaled)
    print(f"\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

# Run the K-Nearest Neighbors experiment: once per conference
# (teams_confID 0 and 1), then once on the whole table.
algorithm_knn(data=teams[teams['teams_confID'].eq(0)], year=10, numberSeclected=4)
algorithm_knn(data=teams[teams['teams_confID'].eq(1)], year=10, numberSeclected=4)
algorithm_knn(data=teams, year=10, numberSeclected=8)
