# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR <3 algorithms, also understanding the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.discriminant_analysis import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import time

# Load the prepared team table. `label_encoder` is created here because it is
# shared module-wide: it is fitted while encoding text columns and reused later
# to turn encoded team ids back into their original labels.
teams = pd.read_csv("data_prepared/teams.csv")
label_encoder = LabelEncoder()

# Every float/int column except the season identifier is standardised.
numerical_features = (
    teams.select_dtypes(include=['float', 'int']).columns.drop('year')
)

# NOTE(review): the scaler is fitted on all seasons at once, i.e. before the
# per-year train/test split performed by the run_model_* helpers, so test-year
# statistics influence the scaling - confirm this is acceptable.
scaler = StandardScaler()
teams[numerical_features] = scaler.fit_transform(teams[numerical_features])

# Hand-mapped binary columns: playoff Y/N -> 1/0, conference EA/WE -> 0/1.
# Values outside these mappings become NaN (pandas `map` semantics).
teams = teams.assign(
    playoff=teams['playoff'].map({'Y': 1, 'N': 0}),
    confID=teams['confID'].map({'EA': 0, 'WE': 1}),
)

def encode_categorical_columns(df):
    """Label-encode every object-dtype column of ``df`` in place.

    The columns ``playoff`` and ``confID`` are skipped because they are mapped
    to integers by hand elsewhere in this file.

    The module-level ``label_encoder`` is used *only* for ``tmID``: the
    run_model_* helpers call ``label_encoder.inverse_transform`` on ``tmID``
    to recover team labels, so that encoder must stay fitted on team ids.
    Re-fitting the same encoder on every column (the previous behaviour) left
    it fitted on whichever object column happened to come last.

    Args:
        df: DataFrame whose object columns are replaced by integer codes.

    Returns:
        The same ``df`` object, modified in place.
    """
    for col in df.select_dtypes(include=['object']).columns:
        if col in ('playoff', 'confID'):
            continue
        # Any other text column gets a throw-away encoder so it cannot
        # overwrite the classes learned for `tmID`.
        encoder = label_encoder if col == 'tmID' else LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
    return df

# Encode the remaining text columns (the helper returns the frame it was
# given), then replace every missing value with 0.
teams = encode_categorical_columns(teams).fillna(0)

# Quick visual check of the conference codes.
print(teams.loc[:, ['confID']])

In [None]:
def run_model_classification(model, data, year, number, run_by_conference=False):
    """Train ``model`` on seasons before ``year`` and pick playoff teams for ``year``.

    The model produces one score per test-season team; the ``number`` teams
    with the highest scores are labelled 1 (playoff) and all others 0.

    Args:
        model: scikit-learn style estimator. Scores are taken from
            ``predict_proba`` (positive-class column) when available,
            otherwise from ``decision_function``, otherwise from ``predict``
            (e.g. ``LinearRegression``).
        data: DataFrame containing at least ``year``, ``playoff`` and ``tmID``
            columns; all other columns are used as features.
        year: Season used as the test set; earlier seasons form the train set.
        number: How many teams to label as playoff teams.
        run_by_conference: When True nothing is printed or plotted and the
            results are returned instead.

    Returns:
        ``(prediction_df, metrics_dict)`` when ``run_by_conference`` is True,
        otherwise ``None`` (results are printed and, for tree-based models,
        plotted).
    """
    # Drop without `inplace` so we never mutate a filtered slice of `data`.
    train = data[data['year'] < year].drop(columns="year")
    test = data[data['year'] == year].drop(columns="year")

    X_train = train.drop(columns="playoff")
    Y_train = train["playoff"]

    X_test = test.drop(columns="playoff")
    Y_test = test["playoff"]

    start_timer = time.time()

    model.fit(X_train, Y_train)

    # Continuous ranking score per team, from the richest API the model has.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = model.predict(X_test)
    y_score = np.asarray(y_score, dtype=float)

    y_pred = np.zeros_like(y_score)
    if number > 0:
        # Guard required: with number == 0 the slice [-0:] would select
        # every team instead of none.
        top_indices = np.argsort(y_score)[-number:]
        y_pred[top_indices] = 1

    stop_timer = time.time()

    # Build the prediction table (team label, true outcome, predicted outcome).
    prediction_df = pd.DataFrame({
        'teams': label_encoder.inverse_transform(X_test['tmID']),
        'playoff': Y_test.values,
        'prediction': y_pred,
    })

    # Statistics
    time_elapsed = stop_timer - start_timer
    precision = precision_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    f1 = f1_score(Y_test, y_pred)
    accuracy = accuracy_score(Y_test, y_pred)
    # AUC measures ranking quality, so it must be fed the continuous scores;
    # feeding the thresholded 0/1 labels collapses the ROC curve to one point.
    auc = roc_auc_score(Y_test, y_score)

    if not run_by_conference:
        print(prediction_df)
        print(f"Time: {time_elapsed:.3f}    Accuracy: {accuracy:.2f}    Precision: {precision:.2f}    Recall: {recall:.2f}    F1: {f1:.2f}    AUC: {auc:.2f}")
        print()

    # Feature importances (and the tree itself) for the tree-based models.
    if not run_by_conference and isinstance(model, (DecisionTreeClassifier, GradientBoostingClassifier)):
        feature_importances_df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["Importance"])
        feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
        feature_importances_df.plot(kind='bar', figsize=(10, 4))

        if isinstance(model, DecisionTreeClassifier):
            plt.figure(figsize=(8, 8))
            plot_tree(model, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
            plt.show()

    if run_by_conference:
        return prediction_df, {
            "time": time_elapsed,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "accuracy": accuracy,
            "auc": auc
        }

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

def run_model_regression(model, data, year, number, run_by_conference=False):
    """Train a regressor on seasons before ``year`` and evaluate it on ``year``.

    Args:
        model: scikit-learn style regressor exposing ``fit`` and ``predict``.
        data: DataFrame containing at least ``year``, ``playoff`` and ``tmID``
            columns; all other columns are used as features.
        year: Season used as the test set; earlier seasons form the train set.
        number: Unused here; kept so the signature mirrors
            ``run_model_classification``.
        run_by_conference: When True nothing is printed or plotted and the
            results are returned instead.

    Returns:
        ``(prediction_df, metrics_dict)`` when ``run_by_conference`` is True,
        otherwise ``None`` (results are printed and, for tree-based models,
        plotted).
    """
    # Split into train and test seasons. Drop without `inplace` so we never
    # mutate a filtered slice of `data`.
    train = data[data['year'] < year].drop(columns="year")
    test = data[data['year'] == year].drop(columns="year")

    X_train = train.drop(columns="playoff")
    Y_train = train["playoff"]

    X_test = test.drop(columns="playoff")
    Y_test = test["playoff"]

    start_timer = time.time()

    # Fit the model and predict the test season.
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    # Stop the clock before computing metrics so the reported time covers
    # only fit + predict, as in the classification runner.
    stop_timer = time.time()

    # Regression metrics
    mae = mean_absolute_error(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    # Build the prediction table (team label, true value, predicted value).
    prediction_df = pd.DataFrame({
        'teams': label_encoder.inverse_transform(X_test['tmID']),
        'actual': Y_test.values,
        'prediction': y_pred,
    })

    # Statistics
    time_elapsed = stop_timer - start_timer
    if not run_by_conference:
        print(prediction_df)
        print(f"Time: {time_elapsed:.3f}    MAE: {mae:.2f}    MSE: {mse:.2f}    RMSE: {rmse:.2f}    R²: {r2:.2f}")
        print()

    # Feature importances (and the tree itself) for the tree-based models.
    if not run_by_conference and isinstance(model, (DecisionTreeRegressor, GradientBoostingRegressor)):
        feature_importances_df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["Importance"])
        feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
        feature_importances_df.plot(kind='bar', figsize=(10, 4))

        if isinstance(model, DecisionTreeRegressor):
            plt.figure(figsize=(8, 8))
            plot_tree(model, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
            plt.show()

    if run_by_conference:
        return prediction_df, {
            "time": time_elapsed,
            "mae": mae,
            "mse": mse,
            "rmse": rmse,
            "r2": r2
        }


In [None]:
def run_model_classification_by_conferences(model, year):
    """Run the classification pipeline once per conference and report the mean metrics.

    The module-level ``teams`` table is split on ``confID`` (0 first, then 1);
    in each half the 4 highest-scoring teams are predicted as playoff teams.
    The per-conference prediction tables are printed stacked, followed by the
    arithmetic mean of every metric over the two conferences.

    Args:
        model: Estimator forwarded to ``run_model_classification``.
        year: Season used as the test set.
    """
    prediction_frames = []
    stats_per_conference = []
    for conference_code in (0, 1):
        conference_teams = teams[teams['confID'] == conference_code]
        pred_df, stats = run_model_classification(model, conference_teams, year, 4, True)
        prediction_frames.append(pred_df)
        stats_per_conference.append(stats)

    first_stats, second_stats = stats_per_conference
    statistics = {
        metric: (first_stats[metric] + second_stats[metric]) / 2
        for metric in first_stats
    }

    print(pd.concat(prediction_frames))
    print(f"Time: {statistics['time']:.3f}    Accuracy: {statistics['accuracy']:.2f}    Precision: {statistics['precision']:.2f}    Recall: {statistics['recall']:.2f}    F1: {statistics['f1']:.2f}    AUC: {statistics['auc']:.2f}")
    print()

In [None]:
def run_model_regression_by_conferences(model, year):
    """Run the regression pipeline once per conference and report the mean metrics.

    The module-level ``teams`` table is split on ``confID`` (0 first, then 1)
    and ``run_model_regression`` is called on each half. The per-conference
    prediction tables are printed stacked, followed by the arithmetic mean of
    every metric over the two conferences.

    Args:
        model: Regressor forwarded to ``run_model_regression``.
        year: Season used as the test set.
    """
    prediction_frames = []
    stats_per_conference = []
    for conference_code in (0, 1):
        conference_teams = teams[teams['confID'] == conference_code]
        pred_df, stats = run_model_regression(model, conference_teams, year, 4, True)
        prediction_frames.append(pred_df)
        stats_per_conference.append(stats)

    # Average each metric over the two conferences.
    first_stats, second_stats = stats_per_conference
    statistics = {
        metric: (first_stats[metric] + second_stats[metric]) / 2
        for metric in first_stats
    }

    # Combined report for both conferences.
    print(pd.concat(prediction_frames))
    print(f"Time: {statistics['time']:.3f}    MAE: {statistics['mae']:.2f}    MSE: {statistics['mse']:.2f}    RMSE: {statistics['rmse']:.2f}    R²: {statistics['r2']:.2f}")
    print()


## 1 - Algorithms

### 1.1 - Decision Tree

In [None]:
# Seed the tree: scikit-learn permutes features randomly at each split, so an
# unseeded tree can pick different (equally good) splits on every run and the
# reported metrics would not be reproducible.
run_model_classification_by_conferences(DecisionTreeClassifier(random_state=40), 10)
#run_model_classification(DecisionTreeClassifier(random_state=40), teams, 10, 8)

In [None]:
# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation; seed it so the scores are reproducible.
run_model_classification_by_conferences(SVC(kernel='linear', probability=True, random_state=42), 10)

### 1.2 - KNeighborsClassifier

In [None]:
# k-nearest neighbours (k=5), evaluated per conference with year 10 as test set.
run_model_classification_by_conferences(model=KNeighborsClassifier(n_neighbors=5), year=10)
# Whole-table alternative (8 teams picked overall, k=1):
#run_model_classification(KNeighborsClassifier(n_neighbors=1), teams, 10, 8)

In [None]:
# Gaussian naive Bayes, evaluated per conference with year 10 as test set.
run_model_classification_by_conferences(model=GaussianNB(), year=10)
# Whole-table alternative (8 teams picked overall):
#run_model_classification(GaussianNB(), teams, 10, 8)

In [None]:
# Random-forest regression on the playoff column, per conference, year 10 as test set.
run_model_regression_by_conferences(
    model=RandomForestRegressor(n_estimators=100, random_state=42),
    year=10,
)
# Whole-table alternative:
#run_model_regression(RandomForestRegressor(n_estimators=100, random_state=42), teams, 10, 8)

### 1.3 - Gradient Boosting

In [None]:
# Seeded gradient boosting, evaluated per conference with year 10 as test set.
run_model_classification_by_conferences(
    model=GradientBoostingClassifier(random_state=42),
    year=10,
)
# Whole-table alternative (8 teams picked overall):
#run_model_classification(GradientBoostingClassifier(random_state=42), teams, 10, 8)

### 1.4 - Linear Regression

In [None]:
# Linear regression run through the classification pipeline: the runner uses
# its continuous `predict` output as the ranking score for choosing teams.
run_model_classification_by_conferences(model=LinearRegression(), year=10)
# Whole-table alternative (8 teams picked overall):
#run_model_classification(LinearRegression(), teams, 10, 8)