# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR <3 algorithms, also understanding the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense

# Load the prepared team table, remove the columns listed below (by their
# names they look like post-season results -- presumably dropped so they do
# not give away the playoff label; verify), and recode the target and the
# conference as integers.
teams = (
    pd.read_csv("data_prepared/teams.csv")
    .drop(columns=[
        'teams_firstRound', 'teams_semis', 'teams_finals',
        'teams_rank', 'teams_post_W', 'teams_post_L',
    ])
    .assign(
        teams_playoff=lambda frame: frame['teams_playoff'].map({'Y': 1, 'N': 0}),
        teams_confID=lambda frame: frame['teams_confID'].map({'EA': 0, 'WE': 1}),
    )
)

def encode_categorical_columns(df):
    """Label-encode every object-dtype column of ``df`` in place.

    The columns ``teams_playoff`` and ``teams_confID`` are left untouched even
    if they happen to have object dtype.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Import the encoder from its documented module. The file-level import
    # takes it from ``sklearn.calibration``, which only exposes the name as a
    # side effect of its own internal imports.
    from sklearn.preprocessing import LabelEncoder

    skipped = {'teams_playoff', 'teams_confID'}
    label_encoder = LabelEncoder()
    for col in df.select_dtypes(include=['object']).columns:
        if col in skipped:
            continue
        # fit_transform refits the encoder, so reusing one instance is safe.
        df[col] = label_encoder.fit_transform(df[col])
    return df

# Encode the remaining object-dtype columns of `teams` in place. The return
# value is not rebound; in a notebook, being the last expression of the cell,
# it is also what the cell displays.
encode_categorical_columns(teams)

## 1 - Algorithms

### 1.1 - Decision Tree

In [None]:
def algorithm(data, year, number):
    """Fit a decision tree on seasons before ``year`` and evaluate it on ``year``.

    The ``number`` test rows with the highest predicted probability for class 1
    are labelled 1 and every other test row 0. The function prints the
    classification report and accuracy, then plots the feature importances and
    the fitted tree.

    Args:
        data: DataFrame with a ``teams_year`` column, a ``teams_playoff``
            target column and the feature columns.
        year: Season used as the test set; earlier seasons form the train set.
        number: How many test rows to label as positive.

    Raises:
        ValueError: If there are no rows before ``year`` or none for ``year``.
    """
    # drop() returns a new frame, so neither the caller's data nor a filtered
    # slice of it is modified in place.
    train = data[data['teams_year'] < year].drop(columns="teams_year")
    test = data[data['teams_year'] == year].drop(columns="teams_year")
    if train.empty or test.empty:
        raise ValueError(
            f"No training rows before year {year} or no test rows for year {year}"
        )

    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]
    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, Y_train)

    y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)
    if number > 0:
        # Without this guard, number == 0 would slice [-0:] and mark every row.
        top_indices = np.argsort(y_pred_proba)[-number:]
        y_pred[top_indices] = 1

    print("\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

    feature_importances_df = pd.DataFrame(
        decision_tree.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
    feature_importances_df.plot(kind='bar', figsize=(12, 12))

    plt.figure(figsize=(10, 7))
    # Name the nodes after the columns the tree was actually fitted on.
    plot_tree(decision_tree, filled=True, feature_names=X_train.columns.to_list(), rounded=True)
    plt.show()

# Decision tree per conference (teams_confID 0 = 'EA', 1 = 'WE'): test on
# year 10 and mark the 4 highest-scored rows as positive.
for conference in (0, 1):
    algorithm(teams[teams['teams_confID'] == conference], 10, 4)
# All teams together, 8 positives.
# NOTE(review): this call tests on year 9 while the per-conference calls use
# year 10 -- confirm this is intended.
algorithm(teams, 9, 8)

### 1.2 - Support Vector Machine

In [None]:
def algorithm_svm(data, year, numberSeclected):
    """Fit an SVM on seasons before ``year`` and evaluate it on ``year``.

    The ``numberSeclected`` test rows with the highest predicted probability
    for class 1 are labelled 1 and every other test row 0. The function prints
    the classification report and accuracy.

    Args:
        data: DataFrame with a ``teams_year`` column, a ``teams_playoff``
            target column and the feature columns.
        year: Season used as the test set; earlier seasons form the train set.
        numberSeclected: How many test rows to label as positive.

    Raises:
        ValueError: If there are no rows before ``year`` or none for ``year``.
    """
    # drop() returns a new frame, so neither the caller's data nor a filtered
    # slice of it is modified in place.
    train = data[data['teams_year'] < year].drop(columns="teams_year")
    test = data[data['teams_year'] == year].drop(columns="teams_year")
    if train.empty or test.empty:
        raise ValueError(
            f"No training rows before year {year} or no test rows for year {year}"
        )

    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]
    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    # NOTE(review): the features reach SVC unscaled; consider standardizing
    # them first -- confirm whether the raw scale is intended.
    svm = SVC(probability=True, random_state=42)
    svm.fit(X_train, Y_train)

    y_pred_proba = svm.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        # Without this guard, 0 would slice [-0:] and mark every row.
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    print("\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

# SVM per conference (teams_confID 0 = 'EA', 1 = 'WE'): test on year 10 and
# mark the 4 highest-scored rows as positive.
for conference in (0, 1):
    algorithm_svm(teams[teams['teams_confID'] == conference], 10, 4)
# All teams together, 8 positives.
algorithm_svm(teams, 10, 8)


### 1.3 - Gradient Boosting

In [None]:
def algorithm_gradient_boosting(data, year, numberSeclected):
    """Fit gradient boosting on seasons before ``year`` and evaluate it on ``year``.

    The ``numberSeclected`` test rows with the highest predicted probability
    for class 1 are labelled 1 and every other test row 0. The function prints
    the classification report and accuracy, then plots the feature importances.

    Args:
        data: DataFrame with a ``teams_year`` column, a ``teams_playoff``
            target column and the feature columns.
        year: Season used as the test set; earlier seasons form the train set.
        numberSeclected: How many test rows to label as positive.

    Raises:
        ValueError: If there are no rows before ``year`` or none for ``year``.
    """
    # drop() returns a new frame, so neither the caller's data nor a filtered
    # slice of it is modified in place.
    train = data[data['teams_year'] < year].drop(columns="teams_year")
    test = data[data['teams_year'] == year].drop(columns="teams_year")
    if train.empty or test.empty:
        raise ValueError(
            f"No training rows before year {year} or no test rows for year {year}"
        )

    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]
    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    gradient_boosting = GradientBoostingClassifier(random_state=42)
    gradient_boosting.fit(X_train, Y_train)

    y_pred_proba = gradient_boosting.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        # Without this guard, 0 would slice [-0:] and mark every row.
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    print("\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

    feature_importances_df = pd.DataFrame(
        gradient_boosting.feature_importances_,
        index=X_train.columns,
        columns=["Importance"]
    )
    feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)
    feature_importances_df.plot(kind='bar', figsize=(12, 12))

# Gradient boosting per conference (teams_confID 0 = 'EA', 1 = 'WE'): test on
# year 10 and mark the 4 highest-scored rows as positive.
for conference in (0, 1):
    algorithm_gradient_boosting(teams[teams['teams_confID'] == conference], 10, 4)
# All teams together, 8 positives.
algorithm_gradient_boosting(teams, 10, 8)


### 1.4 - K Nearest Neighbors

In [None]:
def algorithm_knn(data, year, numberSeclected):
    """Fit k-nearest neighbours on seasons before ``year`` and evaluate on ``year``.

    The ``numberSeclected`` test rows with the highest predicted probability
    for class 1 are labelled 1 and every other test row 0. The function prints
    the raw probabilities, the resulting labels, the classification report and
    the accuracy.

    Args:
        data: DataFrame with a ``teams_year`` column, a ``teams_playoff``
            target column and the feature columns.
        year: Season used as the test set; earlier seasons form the train set.
        numberSeclected: How many test rows to label as positive.

    Raises:
        ValueError: If there are no rows before ``year`` or none for ``year``.
    """
    # drop() returns a new frame, so neither the caller's data nor a filtered
    # slice of it is modified in place.
    train = data[data['teams_year'] < year].drop(columns="teams_year")
    test = data[data['teams_year'] == year].drop(columns="teams_year")
    if train.empty or test.empty:
        raise ValueError(
            f"No training rows before year {year} or no test rows for year {year}"
        )

    X_train = train.drop(columns="teams_playoff")
    Y_train = train["teams_playoff"]
    X_test = test.drop(columns="teams_playoff")
    Y_test = test["teams_playoff"]

    # NOTE(review): the features reach the distance-based classifier unscaled;
    # consider standardizing them first -- confirm whether this is intended.
    knn = KNeighborsClassifier()
    knn.fit(X_train, Y_train)

    y_pred_proba = knn.predict_proba(X_test)[:, 1]
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        # Without this guard, 0 would slice [-0:] and mark every row.
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    print(y_pred_proba)
    print(y_pred)

    print("\nClassification Report:\n", classification_report(Y_test, y_pred))
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.2f}")

# KNN per conference (teams_confID 0 = 'EA', 1 = 'WE'): test on year 10 and
# mark the 4 highest-scored rows as positive.
for conference in (0, 1):
    algorithm_knn(teams[teams['teams_confID'] == conference], 10, 4)
# All teams together, 8 positives.
algorithm_knn(teams, 10, 8)


In [None]:
def encode_categorical_columns(df):
    """Label-encode the object-dtype columns of ``df`` in place.

    ``teams_confID`` is skipped if it shows up among the object columns.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        if name != 'teams_confID':
            # fit_transform refits the shared encoder for each column.
            df[name] = encoder.fit_transform(df[name])
    return df

def prepare_data(data, year):
    """Split ``data`` by season into train/test features and targets.

    Rows with ``teams_year`` below ``year`` form the training set and rows with
    ``teams_year`` equal to ``year`` form the test set. Categorical feature
    columns are one-hot encoded, and the test matrix is aligned to the training
    columns.

    Args:
        data: DataFrame with ``teams_year``, ``teams_playoff`` and features.
        year: Season used as the test set.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    x_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']

    # Keep every level on the test side. Dropping "the first" level separately
    # per split can remove a different category than in training, which would
    # leave test rows with an all-zero (wrong) encoding after alignment.
    x_test = pd.get_dummies(test.drop(columns=['teams_playoff']), drop_first=False)

    # The left join keeps exactly the training columns: the training baseline
    # level and levels unseen in training are discarded from the test matrix,
    # and training columns missing from it are filled with 0.
    x_train, x_test = x_train.align(x_test, join='left', axis=1, fill_value=0)

    return x_train, y_train, x_test, test['teams_playoff']

def train_and_evaluate_model(x_train, y_train, x_test, y_test):
    """Train a small dense network on standardized features and evaluate it.

    The scaler is fitted on the training features only and then applied to the
    test features. The network has two ReLU hidden layers (64 and 32 units) and
    a sigmoid output; it is trained for 50 epochs with 20% of the training data
    held out for validation. Loss, accuracy and a classification report for the
    test set are printed.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_test: Test features.
        y_test: Test targets.

    Returns:
        Tuple ``(history, y_test, y_pred)`` where ``y_pred`` holds the
        predictions thresholded at 0.5 as integers.
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(x_train)
    test_scaled = standardizer.transform(x_test)

    network = Sequential([
        Dense(units=64, activation='relu', input_shape=(train_scaled.shape[1],)),
        Dense(units=32, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    history = network.fit(
        train_scaled,
        y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
    )
    loss, accuracy = network.evaluate(test_scaled, y_test)
    print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

    probabilities = network.predict(test_scaled)
    y_pred = (probabilities > 0.5).astype(int)
    print(classification_report(y_test, y_pred))

    return history, y_test, y_pred

def print_predictions(y_test, y_pred, test_data):
    """Print actual and predicted labels side by side with the test features.

    Args:
        y_test: True labels for the test rows (Series or array-like).
        y_pred: Predicted labels; any shape that flattens to one value per row.
        test_data: DataFrame with the test features, one row per label.
    """
    # Use plain arrays so the result gets a fresh 0..n-1 index. Keeping the
    # original index of y_test would not match the reset index of test_data,
    # and the column-wise concat would pad both sides with NaN rows.
    results = pd.DataFrame({
        'Actual': np.asarray(y_test).ravel(),
        'Predicted': np.asarray(y_pred).ravel(),
    })
    results = pd.concat([results, test_data.reset_index(drop=True)], axis=1)
    print(results)

# Re-encode any remaining object columns, then train and report one network
# per conference (teams_confID 0 = 'EA', 1 = 'WE'), testing on year 10.
teams = encode_categorical_columns(teams)

east_teams = teams[teams['teams_confID'] == 0]
split_EA = prepare_data(east_teams, 10)
x_train_EA, y_train_EA, x_test_EA, y_test_EA = split_EA
history_EA, y_test_EA, y_pred_EA = train_and_evaluate_model(*split_EA)
print_predictions(y_test_EA, y_pred_EA, x_test_EA)

west_teams = teams[teams['teams_confID'] == 1]
split_WE = prepare_data(west_teams, 10)
x_train_WE, y_train_WE, x_test_WE, y_test_WE = split_WE
history_WE, y_test_WE, y_pred_WE = train_and_evaluate_model(*split_WE)
print_predictions(y_test_WE, y_pred_WE, x_test_WE)
