# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested, with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR of fewer than 3 algorithms, as well as an understanding of the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score, classification_report
import time

In [None]:
# Load the prepared team table, discard the three playoff-round columns, and
# turn the two string-coded columns into integers:
#   teams_playoff: 'Y' -> 1, 'N' -> 0   (the prediction target used below)
#   teams_confID:  'EA' -> 0, 'WE' -> 1
# Any value outside these mappings becomes NaN (pandas Series.map semantics).
teams = (
    pd.read_csv("data_prepared/teams.csv")
    .drop(columns=['teams_firstRound', 'teams_semis', 'teams_finals'])
    .assign(
        teams_playoff=lambda frame: frame['teams_playoff'].map({'Y': 1, 'N': 0}),
        teams_confID=lambda frame: frame['teams_confID'].map({'EA': 0, 'WE': 1}),
    )
)

# Shared encoder: fitted while encoding the categorical columns and reused
# later to turn encoded team ids back into their original labels.
label_encoder = LabelEncoder()

def encode_categorical_columns(df):
    """Label-encode every object-dtype column of ``df`` in place and return ``df``.

    The columns ``teams_playoff`` and ``teams_confID`` are skipped. Every
    other object column is replaced by the integer codes produced by the
    module-level ``label_encoder``.

    ``label_encoder`` is refit on each column, so after this function returns
    it only remembers the mapping of the *last* column it encoded. Because
    that encoder is later used to decode ``teams_tmID``, the column
    ``teams_tmID`` (when present among the object columns) is always encoded
    last, regardless of its position in ``df``. The encoded values themselves
    do not depend on the encoding order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its object columns encoded.
    """
    skipped = ('teams_playoff', 'teams_confID')
    object_cols = [
        col
        for col in df.select_dtypes(include=['object']).columns
        if col not in skipped
    ]

    # Stable sort on a boolean key: every column keeps its relative order
    # except teams_tmID, which moves to the end so that the shared encoder
    # stays fitted on the team ids.
    object_cols.sort(key=lambda col: col == 'teams_tmID')

    for col in object_cols:
        df[col] = label_encoder.fit_transform(df[col])
    return df

# Encode the categorical columns (the frame is modified in place and handed
# back), then replace every remaining missing value with 0.
teams = encode_categorical_columns(teams).fillna(0)

# Sanity check: list the rows that still contain a NaN. Since the fill above
# has already run, this is expected to print an empty frame.
nan_row_mask = teams.isna().any(axis=1)
linhas_com_nan = teams.loc[nan_row_mask]
print(linhas_com_nan[['teams_year','teams_tmID','teams_confID','teams_playoff']])

In [None]:
def run_model(model, data, year, number):
    """Train ``model`` on the years before ``year`` and evaluate it on ``year``.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set; ``teams_year`` itself is not
    used as a feature. The ``number`` test rows with the highest predicted
    probability are labelled 1 and all the others 0.

    The function prints the per-team predictions (first 20 rows), the elapsed
    fit + predict time and the accuracy, precision, recall, F1 and AUC. For
    ``DecisionTreeClassifier`` and ``GradientBoostingClassifier`` it also
    plots the feature importances, and for a decision tree the tree itself.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict_proba``.
    data : pandas.DataFrame
        Must contain ``teams_year``, ``teams_playoff`` (target) and
        ``teams_tmID``; every other column is used as a feature.
        Not modified.
    year : int
        Value of ``teams_year`` to hold out as the test set.
    number : int
        How many test rows to label as 1. Values larger than the test set
        label every row; values <= 0 label none.

    Returns
    -------
    None

    Notes
    -----
    Relies on the module-level ``label_encoder`` still being fitted on
    ``teams_tmID`` in order to print the original team labels.
    """
    # Non-inplace drops: the filtered frames are derived from ``data`` and
    # must not be mutated in place.
    train = data[data['teams_year'] < year].drop(columns="teams_year")
    test = data[data['teams_year'] == year].drop(columns="teams_year")

    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]

    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    # perf_counter is the clock intended for measuring elapsed intervals.
    start_timer = time.perf_counter()

    model.fit(X_train, Y_train)

    # Column 1 of predict_proba is the second class in ``model.classes_``,
    # i.e. label 1 when both 0 and 1 occur in the training target.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)

    # Clamp the count: a plain ``[-number:]`` slice with number == 0 would
    # select every row instead of none.
    n_selected = max(0, min(int(number), len(y_pred_proba)))
    if n_selected:
        top_indices = np.argsort(y_pred_proba)[-n_selected:]
        y_pred[top_indices] = 1

    stop_timer = time.perf_counter()

    # build prediction df
    prediction_df = pd.DataFrame({
        'teams': label_encoder.inverse_transform(X_test['teams_tmID']),
        'playoff': Y_test.values,
        'prediction': y_pred,
    })

    print(prediction_df.head(20))

    # statistics
    time_elapsed = stop_timer - start_timer
    precision = precision_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    f1 = f1_score(Y_test, y_pred)
    accuracy = accuracy_score(Y_test, y_pred)
    # AUC is a ranking metric: score it on the predicted probabilities, not on
    # the thresholded 0/1 labels, which would discard the ranking information.
    auc = roc_auc_score(Y_test, y_pred_proba)

    print(f"Time: {time_elapsed:.3f}    Accuracy: {accuracy:.2f}    Precision: {precision:.2f}    Recall: {recall:.2f}    F1: {f1:.2f}    AUC: {auc:.2f}")

    if isinstance(model, (DecisionTreeClassifier, GradientBoostingClassifier)):
        # Both tree-based models expose impurity-based feature importances.
        feature_importances_df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["Importance"])
        feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
        feature_importances_df.plot(kind='bar', figsize=(10, 4))

        if isinstance(model, DecisionTreeClassifier):
            # A single tree is a "white box" model: draw its structure too.
            plt.figure(figsize=(8, 8))
            plot_tree(model, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
            plt.show()

    print()

## 1 - Algorithms

### 1.1 - Decision Tree

In [None]:
# Decision tree, tested on year 10 and trained on the earlier years:
# each conference on its own with the top 4 teams flagged, then all teams
# together with the top 8 flagged. A fresh model is built for every run.
for subset, top_n in (
    (teams[teams['teams_confID'] == 0], 4),  # conference encoded as 0 ('EA')
    (teams[teams['teams_confID'] == 1], 4),  # conference encoded as 1 ('WE')
    (teams, 8),
):
    run_model(DecisionTreeClassifier(random_state=42), subset, 10, top_n)

### 1.2 - Support Vector Machine

In [None]:
# Support vector machine (probability estimates enabled, as run_model needs
# predict_proba), tested on year 10 and trained on the earlier years:
# each conference on its own with the top 4 teams flagged, then all teams
# together with the top 8 flagged. A fresh model is built for every run.
for subset, top_n in (
    (teams[teams['teams_confID'] == 0], 4),  # conference encoded as 0 ('EA')
    (teams[teams['teams_confID'] == 1], 4),  # conference encoded as 1 ('WE')
    (teams, 8),
):
    run_model(SVC(probability=True, random_state=42), subset, 10, top_n)


### 1.3 - Gradient Boosting

In [None]:
# Gradient boosting, tested on year 10 and trained on the earlier years:
# each conference on its own with the top 4 teams flagged, then all teams
# together with the top 8 flagged. A fresh model is built for every run.
for subset, top_n in (
    (teams[teams['teams_confID'] == 0], 4),  # conference encoded as 0 ('EA')
    (teams[teams['teams_confID'] == 1], 4),  # conference encoded as 1 ('WE')
    (teams, 8),
):
    run_model(GradientBoostingClassifier(random_state=42), subset, 10, top_n)


### 1.4 - K Nearest Neighbors

In [None]:
# K nearest neighbours (default settings), tested on year 10 and trained on
# the earlier years: each conference on its own with the top 4 teams flagged,
# then all teams together with the top 8 flagged. A fresh model is built for
# every run.
for subset, top_n in (
    (teams[teams['teams_confID'] == 0], 4),  # conference encoded as 0 ('EA')
    (teams[teams['teams_confID'] == 1], 4),  # conference encoded as 1 ('WE')
    (teams, 8),
):
    run_model(KNeighborsClassifier(), subset, 10, top_n)