# Predictive Model

The metrics that will be used to evaluate this stage are:

- **Diversity** of tasks (use of classification and regression) and of algorithms, that is, more than 4 algorithms tested, with significantly different language bias OR with a significant number of variants.

- **Parameter Tuning**, with a systematic approach.

- **Understanding Algorithm Behavior**, solid (even if not deep) understanding of the behavior of most algorithms used OR <3 algorithms, also understanding the effect of parameters.

- Training and testing on properly separated data, with multiple splits.

- **Performance Estimation**, additional factors correctly taken into account (e.g. time), focus on performance measures aligned with DM goals and data characteristics, advanced performance measures (e.g. AUC), adequate baseline, correct analysis of values for comparison, including tests of statistical significance, correct estimate of overfitting.

- **Model Improvement**, development guided by performance improvement goals, even if pedagogical goals have not been ignored.

- **Feature Importance**, correctly interpreted, related to the application domain.

- Analysis of "white box" models, correctly interpreted, related to the application domain

In [156]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense

# Load the prepared team table, discard six columns that are not used as
# features, and recode the 'Y'/'N' playoff flag as 1/0.
# NOTE(review): the discarded columns look like end-of-season outcomes that
# would leak the target into the features — confirm against the data source.
teams = (
    pd.read_csv("data_prepared/teams.csv")
    .drop(
        columns=[
            'teams_firstRound',
            'teams_semis',
            'teams_finals',
            'teams_rank',
            'teams_post_W',
            'teams_post_L',
        ]
    )
    .assign(teams_playoff=lambda frame: frame['teams_playoff'].map({'Y': 1, 'N': 0}))
)

# def encode_categorical_columns(df):
#     label_encoder = LabelEncoder()
#     for col in df.select_dtypes(include=['object']).columns:
#         if col == 'teams_playoff':
#             df['teams_playoff'] = df['teams_playoff'].map({'Y': 1, 'N': 0})
#         elif (col == 'tmID') or (col == 'teams_confID'):
#             continue
#         else:
#             df[col] = label_encoder.fit_transform(df[col])
#     return df
# 
# teams = encode_categorical_columns(teams)

## 1 - Algorithms

### 1.1 - Decision Tree

In [157]:
def algorithm(data, year, number):
    """Train a decision tree on earlier seasons and score it on one season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. The ``number`` test rows with
    the highest predicted probability of ``teams_playoff == 1`` are labelled
    1, every other row is labelled 0, and the accuracy of that labelling is
    printed.

    Args:
        data: DataFrame holding ``teams_year``, the 0/1 ``teams_playoff``
            target and the feature columns.
        year: Season used for testing; all earlier seasons are used for
            training.
        number: How many test rows to label as playoff teams.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    X_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']
    # Encode the test rows with every level kept and project them onto the
    # training columns. Calling get_dummies(drop_first=True) on each split
    # separately drops a different baseline level whenever the splits do not
    # contain the same categories, which silently mis-encodes test rows.
    X_test = pd.get_dummies(test.drop(columns=['teams_playoff'])).reindex(
        columns=X_train.columns, fill_value=0
    )

    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, y_train)
    y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]

    # Label the `number` most probable rows as positives. The guard matters
    # because a slice of [-0:] would select every row instead of none.
    y_pred = np.zeros_like(y_pred_proba)
    if number > 0:
        top_indices = np.argsort(y_pred_proba)[-number:]
        y_pred[top_indices] = 1

    accuracy = accuracy_score(test['teams_playoff'], y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Optional inspection code, left disabled:
    # feature_importances = decision_tree.feature_importances_
    # feature_importances_df = pd.DataFrame({
    #     'Feature': X_train.columns,
    #     'Importance': feature_importances
    # })
    # feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
    # print("Top 10 Most Important Features:")
    # print(y_pred)

# Decision tree on season 10: pick the top 4 teams inside each conference,
# then the top 8 over all teams together.
for conference in ('EA', 'WE'):
    algorithm(teams[teams['teams_confID'] == conference], 10, 4)
algorithm(teams, 10, 8)

Accuracy: 0.56
Accuracy: 0.41
Accuracy: 0.49


### 1.2 - Support Vector Machine

In [158]:
def algorithm_svm(data, year, numberSeclected):
    """Train a support vector classifier on earlier seasons and score one season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. The ``numberSeclected`` test
    rows with the highest predicted probability of ``teams_playoff == 1`` are
    labelled 1, every other row is labelled 0, and the accuracy of that
    labelling is printed.

    Args:
        data: DataFrame holding ``teams_year``, the 0/1 ``teams_playoff``
            target and the feature columns.
        year: Season used for testing; all earlier seasons are used for
            training.
        numberSeclected: How many test rows to label as playoff teams.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    x_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']
    # Encode the test rows with every level kept and project them onto the
    # training columns. Calling get_dummies(drop_first=True) on each split
    # separately drops a different baseline level whenever the splits do not
    # contain the same categories, which silently mis-encodes test rows.
    x_test = pd.get_dummies(test.drop(columns=['teams_playoff'])).reindex(
        columns=x_train.columns, fill_value=0
    )

    # probability=True is required for predict_proba on an SVC.
    svm = SVC(probability=True, random_state=42)
    svm.fit(x_train, y_train)
    y_pred_proba = svm.predict_proba(x_test)[:, 1]

    # Label the most probable rows as positives. The guard matters because a
    # slice of [-0:] would select every row instead of none.
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    accuracy = accuracy_score(test['teams_playoff'], y_pred)
    print(f"SVM Accuracy: {accuracy:.2f}")
    # print(y_pred)

# Run the support vector machine on season 10: top 4 teams inside each
# conference, then the top 8 over all teams together.
for conference in ('EA', 'WE'):
    algorithm_svm(teams[teams['teams_confID'] == conference], 10, 4)
algorithm_svm(teams, 10, 8)


SVM Accuracy: 0.32
SVM Accuracy: 0.41
SVM Accuracy: 0.28


### 1.3 - Gradient Boosting

In [159]:
def algorithm_gradient_boosting(data, year, numberSeclected):
    """Train a gradient boosting classifier on earlier seasons and score one season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. The ``numberSeclected`` test
    rows with the highest predicted probability of ``teams_playoff == 1`` are
    labelled 1, every other row is labelled 0, and the accuracy of that
    labelling is printed.

    Args:
        data: DataFrame holding ``teams_year``, the 0/1 ``teams_playoff``
            target and the feature columns.
        year: Season used for testing; all earlier seasons are used for
            training.
        numberSeclected: How many test rows to label as playoff teams.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    x_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']
    # Encode the test rows with every level kept and project them onto the
    # training columns. Calling get_dummies(drop_first=True) on each split
    # separately drops a different baseline level whenever the splits do not
    # contain the same categories, which silently mis-encodes test rows.
    x_test = pd.get_dummies(test.drop(columns=['teams_playoff'])).reindex(
        columns=x_train.columns, fill_value=0
    )

    gradient_boosting = GradientBoostingClassifier(random_state=42)
    gradient_boosting.fit(x_train, y_train)
    y_pred_proba = gradient_boosting.predict_proba(x_test)[:, 1]

    # Label the most probable rows as positives. The guard matters because a
    # slice of [-0:] would select every row instead of none.
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    accuracy = accuracy_score(test['teams_playoff'], y_pred)
    print(f"Gradient Boosting Accuracy: {accuracy:.2f}")

    # Feature importances ranked from most to least important; enable the
    # print below to inspect them.
    feature_importances = gradient_boosting.feature_importances_
    feature_importances_df = pd.DataFrame({
        'Feature': x_train.columns,
        'Importance': feature_importances
    })
    feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
    # print(feature_importances_df.head(10))
    # print(y_pred)

# Run gradient boosting on season 10: top 4 teams inside each conference,
# then the top 8 over all teams together.
for conference in ('EA', 'WE'):
    algorithm_gradient_boosting(teams[teams['teams_confID'] == conference], 10, 4)
algorithm_gradient_boosting(teams, 10, 8)


Gradient Boosting Accuracy: 0.56
Gradient Boosting Accuracy: 0.41
Gradient Boosting Accuracy: 0.49


### 1.4 - K Nearest Neighbors

In [160]:
def algorithm_knn(data, year, numberSeclected):
    """Train a k-nearest-neighbours classifier on earlier seasons and score one season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. The ``numberSeclected`` test
    rows with the highest predicted probability of ``teams_playoff == 1`` are
    labelled 1, every other row is labelled 0, and the accuracy of that
    labelling is printed.

    Args:
        data: DataFrame holding ``teams_year``, the 0/1 ``teams_playoff``
            target and the feature columns.
        year: Season used for testing; all earlier seasons are used for
            training.
        numberSeclected: How many test rows to label as playoff teams.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    x_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']
    # Encode the test rows with every level kept and project them onto the
    # training columns. Calling get_dummies(drop_first=True) on each split
    # separately drops a different baseline level whenever the splits do not
    # contain the same categories, which silently mis-encodes test rows.
    x_test = pd.get_dummies(test.drop(columns=['teams_playoff'])).reindex(
        columns=x_train.columns, fill_value=0
    )

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    y_pred_proba = knn.predict_proba(x_test)[:, 1]

    # Label the most probable rows as positives. The guard matters because a
    # slice of [-0:] would select every row instead of none.
    y_pred = np.zeros_like(y_pred_proba)
    if numberSeclected > 0:
        top_indices = np.argsort(y_pred_proba)[-numberSeclected:]
        y_pred[top_indices] = 1

    accuracy = accuracy_score(test['teams_playoff'], y_pred)
    print(f"KNN Accuracy: {accuracy:.2f}")
    # print(y_pred)

# Run k-nearest neighbours on season 10: top 4 teams inside each conference,
# then the top 8 over all teams together.
for conference in ('EA', 'WE'):
    algorithm_knn(teams[teams['teams_confID'] == conference], 10, 4)
algorithm_knn(teams, 10, 8)


KNN Accuracy: 0.44
KNN Accuracy: 0.41
KNN Accuracy: 0.49


In [161]:
def encode_categorical_columns(df):
    """Label-encode every object-dtype column except ``teams_confID``.

    The columns are overwritten on ``df`` itself, so the caller's frame is
    modified; the same frame is also returned.

    Args:
        df: DataFrame whose object-dtype columns should become integer codes.

    Returns:
        The same DataFrame object, with the encoded columns replaced.
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        # teams_confID stays as text because later cells filter on 'EA'/'WE'.
        if name != 'teams_confID':
            df[name] = encoder.fit_transform(df[name])
    return df

def prepare_data(data, year):
    """Split ``data`` into one-hot encoded train and test sets by season.

    Rows with ``teams_year < year`` form the training set and rows with
    ``teams_year == year`` form the test set. ``teams_playoff`` is the target
    and is removed from the feature frames.

    Args:
        data: DataFrame holding ``teams_year``, ``teams_playoff`` and the
            feature columns.
        year: Season used for testing; all earlier seasons are used for
            training.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``. ``x_test`` has exactly
        the columns of ``x_train``, in the same order.
    """
    train = data[data['teams_year'] < year]
    test = data[data['teams_year'] == year]

    x_train = pd.get_dummies(train.drop(columns=['teams_playoff']), drop_first=True)
    y_train = train['teams_playoff']

    # Encode the test rows with every level kept and project them onto the
    # training columns. Calling get_dummies(drop_first=True) on each split
    # separately drops a different baseline level whenever the splits do not
    # contain the same categories, which silently mis-encodes test rows.
    # Levels never seen in training are discarded; training columns absent
    # from the test rows are filled with 0.
    x_test = pd.get_dummies(test.drop(columns=['teams_playoff'])).reindex(
        columns=x_train.columns, fill_value=0
    )

    return x_train, y_train, x_test, test['teams_playoff']

def train_and_evaluate_model(x_train, y_train, x_test, y_test):
    """Fit a small feed-forward network and report its test performance.

    Features are standardised with a scaler fitted on the training data only.
    The network has two hidden ReLU layers (64 and 32 units) and one sigmoid
    output unit. It is trained for 50 epochs with 20% of the training rows
    held out for validation. Test loss, test accuracy and a classification
    report are printed.

    Args:
        x_train: Training feature matrix.
        y_train: Training binary labels.
        x_test: Test feature matrix with the same columns as ``x_train``.
        y_test: Test binary labels.

    Returns:
        A tuple ``(history, y_test, y_pred)``: the Keras training history,
        the unchanged test labels, and the 0/1 predictions obtained by
        thresholding the predicted probabilities at 0.5.
    """
    # Imported here so the block stays self-contained; the file's top-level
    # imports only bring in Sequential and Dense.
    from keras.layers import Input

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # An explicit Input layer replaces the deprecated `input_shape=` argument
    # on the first Dense layer, which makes Keras emit a warning.
    model = Sequential([
        Input(shape=(x_train.shape[1],)),
        Dense(units=64, activation='relu'),
        Dense(units=32, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
    loss, accuracy = model.evaluate(x_test, y_test)
    print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

    y_pred_proba = model.predict(x_test)
    y_pred = (y_pred_proba > 0.5).astype(int)
    print(classification_report(y_test, y_pred))

    return history, y_test, y_pred

def print_predictions(y_test, y_pred, test_data):
    """Print actual and predicted labels next to the test features.

    Args:
        y_test: True labels, one per row of ``test_data``.
        y_pred: Predicted labels; any shape that flattens to one value per
            row (for example an ``(n, 1)`` array).
        test_data: DataFrame of test features, in the same row order as
            ``y_test`` and ``y_pred``.
    """
    features = test_data.reset_index(drop=True)
    # Build the label columns on the same 0..n-1 index as the features.
    # Keeping the original index of y_test made pd.concat align on
    # mismatching labels and produce extra rows padded with NaN.
    results = pd.DataFrame(
        {
            'Actual': np.asarray(y_test).ravel(),
            'Predicted': np.asarray(y_pred).ravel(),
        },
        index=features.index,
    )
    print(pd.concat([results, features], axis=1))

# Label-encode the remaining text columns (teams_confID is left as text so
# the conference filters below still work).
teams = encode_categorical_columns(teams)

# Eastern conference: train on seasons before 10, evaluate on season 10.
teams_EA = teams[teams['teams_confID'] == 'EA']
x_train_EA, y_train_EA, x_test_EA, y_test_EA = prepare_data(teams_EA, 10)
history_EA, y_test_EA, y_pred_EA = train_and_evaluate_model(
    x_train_EA, y_train_EA, x_test_EA, y_test_EA
)
print_predictions(y_test_EA, y_pred_EA, x_test_EA)

# Western conference: same procedure.
teams_WE = teams[teams['teams_confID'] == 'WE']
x_train_WE, y_train_WE, x_test_WE, y_test_WE = prepare_data(teams_WE, 10)
history_WE, y_test_WE, y_pred_WE = train_and_evaluate_model(
    x_train_WE, y_train_WE, x_test_WE, y_test_WE
)
print_predictions(y_test_WE, y_pred_WE, x_test_WE)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.6589 - loss: 0.5803 - val_accuracy: 0.5152 - val_loss: 0.8548
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7998 - loss: 0.4325 - val_accuracy: 0.5152 - val_loss: 0.8340
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8493 - loss: 0.3477 - val_accuracy: 0.5152 - val_loss: 0.7643
Epoch 4/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9584 - loss: 0.2663 - val_accuracy: 0.5152 - val_loss: 0.7171
Epoch 5/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9549 - loss: 0.2184 - val_accuracy: 0.5909 - val_loss: 0.6521
Epoch 6/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9570 - loss: 0.1884 - val_accuracy: 0.6667 - val_loss: 0.6077
Epoch 7/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.6920 - loss: 0.5843 - val_accuracy: 0.6438 - val_loss: 0.5859
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8519 - loss: 0.4103 - val_accuracy: 0.6712 - val_loss: 0.5599
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9467 - loss: 0.2929 - val_accuracy: 0.6712 - val_loss: 0.5398
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9613 - loss: 0.2080 - val_accuracy: 0.6027 - val_loss: 0.5268
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9908 - loss: 0.1496 - val_accuracy: 0.6027 - val_loss: 0.5361
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9942 - loss: 0.1008 - val_accuracy: 0.6986 - val_loss: 0.5685
Epoch 7/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━