# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested, with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR <3 algorithms, also understanding the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.discriminant_analysis import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import time

# Shared encoder: the categorical-encoding step below fits it and run_model
# later uses it to turn encoded team IDs back into their original values.
label_encoder = LabelEncoder()
teams = pd.read_csv("data_prepared/teams.csv")

# Standardise every float/int column except `year`, which run_model uses to
# split the data into training and test seasons.
numerical_features = teams.select_dtypes(include=['float', 'int']).columns.drop('year')

scaler = StandardScaler()
teams[numerical_features] = scaler.fit_transform(teams[numerical_features])

# Replace the text codes of the target and the conference with 1/0 and 0/1.
teams = teams.assign(
    playoff=teams['playoff'].map({'Y': 1, 'N': 0}),
    confID=teams['confID'].map({'EA': 0, 'WE': 1}),
)

def encode_categorical_columns(df):
    """Label-encode every object-typed column of ``df`` in place.

    ``playoff`` and ``confID`` are skipped if they are still object-typed.
    The module-level ``label_encoder`` is fitted on ``tmID`` only, so it can
    reliably map encoded team IDs back to their original values afterwards;
    every other column gets its own encoder. (Refitting the one shared encoder
    on each column left it holding the classes of whichever column came last.)

    Args:
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same ``df`` object.
    """
    skipped = {'playoff', 'confID'}
    for col in df.select_dtypes(include=['object']).columns:
        if col in skipped:
            continue
        # Only the team-ID mapping has to survive this loop for later decoding.
        encoder = label_encoder if col == 'tmID' else LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
    return df

# Encode the remaining text columns (the helper returns the frame it was
# given), then replace every missing value with 0.
teams = encode_categorical_columns(teams).fillna(0)

In [None]:
def normalize_predictions(predictions):
    """Min-max scale ``predictions`` into the range [0, 1].

    Args:
        predictions: Array-like of numeric scores.

    Returns:
        A float ``numpy.ndarray`` with the same shape as the input. Empty input
        yields an empty array, and input whose values are all equal yields all
        zeros instead of the NaNs that dividing by a zero range would produce.
    """
    scores = np.asarray(predictions, dtype=float)
    if scores.size == 0:
        return scores

    lowest = scores.min()
    span = scores.max() - lowest
    if span == 0:
        # Every score is identical, so there is no spread to rescale.
        return np.zeros_like(scores)
    return (scores - lowest) / span

def get_error(pred_proba, label_results):
    """Return the summed absolute difference between scores and true labels.

    Values are paired with ``zip``, so surplus items of the longer input are
    ignored.
    """
    return sum(abs(score - truth) for score, truth in zip(pred_proba, label_results))

def run_model(model, data, year, number, run_by_conference=False):
    """Train ``model`` on seasons before ``year`` and evaluate it on ``year``.

    The ``number`` highest-scoring test rows are predicted as playoff teams
    (label 1) and every other test row as 0.

    Args:
        model: Unfitted estimator. ``LinearRegression`` and
            ``RandomForestRegressor`` are scored with ``predict`` (min-max
            normalised); any other model must implement ``predict_proba``.
        data: DataFrame with ``year``, ``playoff`` and ``tmID`` columns plus
            the feature columns. It is not modified.
        year: Season used as the test set; earlier seasons are the training
            set.
        number: How many test rows to label as playoff teams.
        run_by_conference: When false and the model is a decision tree or a
            gradient boosting classifier, the feature importances (and, for a
            decision tree, the tree itself) are plotted.

    Returns:
        ``(prediction_df, statistics)``. ``prediction_df`` has the columns
        ``teams``, ``playoff`` and ``prediction``; ``statistics`` maps
        ``time``, ``precision``, ``recall``, ``f1``, ``accuracy``, ``auc`` and
        ``error`` to numbers. The pair is returned regardless of
        ``run_by_conference`` so the metrics are never discarded.

    Raises:
        ValueError: If ``data`` has no rows before ``year`` or none in ``year``.
    """
    # Build independent frames rather than dropping columns in place on
    # slices of ``data``.
    train = data[data['year'] < year].drop(columns="year")
    test = data[data['year'] == year].drop(columns="year")
    if train.empty or test.empty:
        raise ValueError(f"No training or test rows available for year {year}")

    X_train = train.drop(columns="playoff")
    Y_train = train["playoff"]

    X_test = test.drop(columns="playoff")
    Y_test = test["playoff"]

    start_timer = time.time()

    model.fit(X_train, Y_train)

    if isinstance(model, (LinearRegression, RandomForestRegressor)):
        # Regressors have no predict_proba; rescale their output to [0, 1].
        y_pred_proba = normalize_predictions(model.predict(X_test))
    else:
        y_pred_proba = model.predict_proba(X_test)[:, 1]

    errors = get_error(y_pred_proba, Y_test)

    y_pred = np.zeros_like(y_pred_proba)
    if number > 0:
        # Guarded because a slice of [-0:] would select every row.
        top_indices = np.argsort(y_pred_proba)[-number:]
        y_pred[top_indices] = 1

    stop_timer = time.time()

    # build prediction df
    prediction_df = pd.DataFrame({
        'teams': label_encoder.inverse_transform(X_test.loc[:, 'tmID']),
        'playoff': Y_test.values,
        'prediction': y_pred,
    })

    # statistics
    statistics = {
        "time": stop_timer - start_timer,
        "precision": precision_score(Y_test, y_pred),
        "recall": recall_score(Y_test, y_pred),
        "f1": f1_score(Y_test, y_pred),
        "accuracy": accuracy_score(Y_test, y_pred),
        # AUC measures ranking quality, so it is computed from the continuous
        # scores rather than from the thresholded 0/1 predictions.
        "auc": roc_auc_score(Y_test, y_pred_proba),
        "error": errors,
    }

    if not run_by_conference and isinstance(model, (DecisionTreeClassifier, GradientBoostingClassifier)):
        feature_importances_df = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["Importance"])
        feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
        feature_importances_df.plot(kind='bar', figsize=(10, 4))

        if isinstance(model, DecisionTreeClassifier):
            plt.figure(figsize=(8, 8))
            plot_tree(model, filled=True, feature_names=X_test.columns.to_list(), rounded=True)
            plt.show()

    return prediction_df, statistics

In [None]:
# Averaged accuracy / precision per model, keyed by the estimator's class name.
# Both are filled in by run_model_by_conferences.
results_accuracy = {}
results_precision = {}


def run_model_by_conferences(model, year):
    """Evaluate ``model`` on each conference separately and report the averages.

    ``run_model`` is called once for the rows with ``confID == 0`` and once for
    those with ``confID == 1``, each time asking for 4 predicted playoff teams.
    The per-conference statistics are averaged, the combined predictions and
    the averaged statistics are printed, and the averaged accuracy and
    precision are stored in ``results_accuracy`` / ``results_precision`` under
    the model's class name.

    Args:
        model: Estimator passed on to ``run_model`` (fitted once per conference).
        year: Season used as the test set.
    """
    conference_runs = [
        run_model(model, teams[teams['confID'] == conference_code], year, 4, True)
        for conference_code in (0, 1)
    ]
    (east_pred_df, east_statistics), (west_pred_df, west_statistics) = conference_runs

    # Plain mean of the two conferences for every reported statistic.
    statistics = {
        metric: (east_value + west_statistics[metric]) / 2
        for metric, east_value in east_statistics.items()
    }

    print(pd.concat([east_pred_df, west_pred_df]))
    print(f"Time: {statistics['time']:.3f}    Error: {statistics['error']:.2f}    Accuracy: {statistics['accuracy']:.2f}    Precision: {statistics['precision']:.2f}    Recall: {statistics['recall']:.2f}    F1: {statistics['f1']:.2f}    AUC: {statistics['auc']:.2f}")
    print()

    model_name = str(model.__class__.__name__)
    results_accuracy[model_name] = statistics['accuracy']
    results_precision[model_name] = statistics['precision']

## 1 - Algorithms

### 1.1 - Decision Tree

In [None]:
# Decision tree with a fixed seed, tested on year 10.
run_model_by_conferences(model=DecisionTreeClassifier(random_state=42), year=10)

### 1.2 - SVC

In [None]:
# Linear-kernel SVC, tested on year 10; probability=True because run_model
# scores classifiers through predict_proba.
run_model_by_conferences(model=SVC(kernel='linear', probability=True, random_state=42), year=10)

### 1.3 - KNeighborsClassifier

In [None]:
# run_model_by_conferences(KNeighborsClassifier(n_neighbors=5), 10)

### 1.4 - Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, tested on year 10.
run_model_by_conferences(model=GaussianNB(), year=10)

### 1.5 - Random Forest

In [None]:
# Random forest regressor (100 trees, fixed seed), tested on year 10; run_model
# min-max normalises its predictions before using them as scores.
run_model_by_conferences(model=RandomForestRegressor(n_estimators=100, random_state=42), year=10)

### 1.6 - Gradient Boosting

In [None]:
# Gradient boosting classifier with a fixed seed, tested on year 10.
run_model_by_conferences(model=GradientBoostingClassifier(random_state=42), year=10)

### 1.7 - Linear Regression

In [None]:
# Linear regression, tested on year 10; run_model min-max normalises its
# predictions before using them as scores.
run_model_by_conferences(model=LinearRegression(), year=10)

In [None]:
# Re-order the collected accuracies from lowest to highest before plotting.
results_accuracy = dict(sorted(results_accuracy.items(), key=lambda entry: entry[1]))
labels = list(results_accuracy)
values = [results_accuracy[label] for label in labels]

# Horizontal bar chart with one bar per model.
plt.figure(figsize=(6, 4))
plt.barh(y=labels, width=values, color='skyblue', edgecolor='black')
plt.title("Accuracy by Models", fontsize=16)
plt.xlabel("Accuracy", fontsize=14)
plt.ylabel("Models", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()