In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

1 - Upload the datasets

In [12]:
awards_players = pd.read_csv("data/awards_players.csv")
coaches = pd.read_csv("data/coaches.csv")
players_teams = pd.read_csv("data/players_teams.csv")
players = pd.read_csv("data/players.csv")
series_post = pd.read_csv("data/series_post.csv")
teams_post = pd.read_csv("data/teams_post.csv")
teams = pd.read_csv("data/teams.csv")

2 - Remove the columns that have the same value in every entry

In [13]:
import numpy as np
import pandas as pd

def count_outliers_iqr(df):
    outlier_counts = {}
    for col in df.select_dtypes(include=[np.number]).columns:  # Considerar apenas colunas numéricas
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1  # Intervalo interquartil
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filtrar os outliers
        outliers_in_col = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_counts[col] = outliers_in_col.shape[0]  # Armazena a contagem de outliers

    return outlier_counts

# Exemplo com DataFrame coaches (ou substitua pelo seu DataFrame)
outlier_counts = count_outliers_iqr(teams)

# Converter o dicionário em DataFrame para facilitar a ordenação
outlier_counts_df = pd.DataFrame(list(outlier_counts.items()), columns=['Column', 'Outlier Count'])

# Ordenar a contagem de outliers em ordem decrescente
outlier_counts_df = outlier_counts_df.sort_values(by='Outlier Count', ascending=False)

# Imprimir a contagem de outliers por coluna
if not outlier_counts_df.empty:
    print("Contagem de outliers por coluna (ordenada):")
    for index, row in outlier_counts_df.iterrows():
        print(f"{row['Column']}: {row['Outlier Count']} outliers")
else:
    print("Nenhum outlier encontrado.")


Contagem de outliers por coluna (ordenada):
attend: 14 outliers
awayL: 5 outliers
d_asts: 4 outliers
awayW: 4 outliers
homeL: 3 outliers
d_oreb: 3 outliers
lost: 3 outliers
o_pf: 2 outliers
d_stl: 2 outliers
d_ftm: 2 outliers
d_3pa: 2 outliers
d_3pm: 2 outliers
o_stl: 2 outliers
o_3pm: 2 outliers
o_3pa: 2 outliers
d_fta: 2 outliers
o_pts: 1 outliers
o_fgm: 1 outliers
o_blk: 1 outliers
homeW: 1 outliers
d_blk: 1 outliers
o_oreb: 1 outliers
o_asts: 1 outliers
d_pts: 1 outliers
d_fgm: 1 outliers
d_reb: 1 outliers
o_dreb: 0 outliers
divID: 0 outliers
year: 0 outliers
o_reb: 0 outliers
o_fta: 0 outliers
o_ftm: 0 outliers
o_fga: 0 outliers
seeded: 0 outliers
rank: 0 outliers
o_to: 0 outliers
tmORB: 0 outliers
d_pf: 0 outliers
d_dreb: 0 outliers
d_fga: 0 outliers
d_to: 0 outliers
opptmORB: 0 outliers
tmDRB: 0 outliers
GP: 0 outliers
won: 0 outliers
opptmTRB: 0 outliers
opptmDRB: 0 outliers
tmTRB: 0 outliers
confW: 0 outliers
confL: 0 outliers
min: 0 outliers


3 - Se if there are columns with null values

# players_teams eliminar rebounds

In [14]:
print("awards_players - " + str(list(awards_players.columns[awards_players.isna().any()])))
print("coaches - " + str(list(coaches.columns[coaches.isna().any()])))
print("players_teams - " + str(list(players_teams.columns[players_teams.isna().any()])))
print("players - " + str(list(players.columns[players.isna().any()])))
print("series_post - " + str(list(series_post.columns[series_post.isna().any()])))
print("teams_post - " + str(list(teams_post.columns[teams_post.isna().any()])))
print("teams - " + str(list(teams.columns[teams.isna().any()])))


awards_players - []
coaches - []
players_teams - []
players - ['pos', 'college', 'collegeOther']
series_post - []
teams_post - []
teams - ['divID', 'firstRound', 'semis', 'finals']


4 - Change the team ID because of the teams that changed their name

In [15]:
mapTeam = {}
for index, row in teams[['tmID', 'franchID']].iterrows():
    mapTeam [row['tmID']] = row['franchID']

coaches['tmID'] = coaches['tmID'].replace(mapTeam) 
players_teams['tmID'] = players_teams['tmID'].replace(mapTeam) 
series_post['tmIDWinner'] = series_post['tmIDWinner'].replace(mapTeam) 
series_post['tmIDLoser'] = series_post['tmIDLoser'].replace(mapTeam) 
teams_post['tmID'] = teams_post['tmID'].replace(mapTeam)
teams['tmID'] = teams['tmID'].replace(mapTeam).drop(columns=['franchID'])

5 - Rename the columns and merge the datasets teams and coaches

In [16]:
coaches = coaches.rename(columns=lambda x: x + '_coaches' if x not in ['year', 'tmID'] else x)
teams_coaches_merge = pd.merge(teams, coaches, on=['year', 'tmID'], how='left')

7 - Drop unnecessary columns (the ones that are directly related with the playoffs)

In [17]:
teams_coaches_merge_without_playoffs = teams_coaches_merge.drop(columns=['firstRound', 'semis', 'finals', 'rank', 'post_wins_coaches', 'post_losses_coaches'])

8 - Convert 'playoff' column to numeric

In [18]:
teams_coaches_merge_without_playoffs['playoff'] = teams_coaches_merge_without_playoffs['playoff'].map({'Y': 1, 'N': 0})

10 - Predictive model

In [19]:
def algorithm(data, year, numberSeclected):
    # Split training and testing data based on the year
    training_data = data[data['year'] < year]
    testing_data = data[data['year'] == year]

    # Drop the target column and apply one-hot encoding to the features
    X_train = pd.get_dummies(training_data.drop(columns=['playoff']), drop_first=True)
    y_train = training_data['playoff']

    X_test = pd.get_dummies(testing_data.drop(columns=['playoff']), drop_first=True)

    # Align the train and test sets
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Create and train the Decision Tree model
    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_proba = decision_tree.predict_proba(X_test)[:, 1]

    # Forçar exatamente 8 previsões de "1"
    y_pred = np.zeros_like(y_pred_proba) 
    top_8_indices = np.argsort(y_pred_proba)[-numberSeclected:]
    y_pred[top_8_indices] = 1

    # Evaluate the model's accuracy
    accuracy = accuracy_score(testing_data['playoff'], y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Extrair as importâncias das features
    feature_importances = decision_tree.feature_importances_

    # Criar um DataFrame para organizar as importâncias
    feature_importances_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importances
    })

    # Ordenar as features pela importância
    feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

    # Exibir as 10 features mais importantes
    # print("Top 10 Most Important Features:")
    print(feature_importances_df.head(10))

    # print(y_pred)


In [20]:
algorithm(teams_coaches_merge_without_playoffs[teams_coaches_merge_without_playoffs['confID']=='EA'],10,4)
algorithm(teams_coaches_merge_without_playoffs[teams_coaches_merge_without_playoffs['confID']=='WE'],10,4)
algorithm(teams_coaches_merge_without_playoffs,10,8)

Accuracy: 0.44


   Feature  Importance
39     won    0.705548
47   confL    0.188956
7    o_3pm    0.056684
11   o_reb    0.048812
4    o_fga    0.000000
3    o_fgm    0.000000
0     year    0.000000
6    o_fta    0.000000
5    o_ftm    0.000000
9   o_oreb    0.000000
Accuracy: 0.71
                       Feature  Importance
47                       confL    0.765957
52                lost_coaches    0.141004
49                      attend    0.051134
93  coachID_coaches_donovan99w    0.041905
1                        divID    0.000000
5                        o_ftm    0.000000
2                       seeded    0.000000
3                        o_fgm    0.000000
4                        o_fga    0.000000
9                       o_oreb    0.000000
Accuracy: 0.56
   Feature  Importance
39     won    0.719441
47   confL    0.114230
49  attend    0.048264
21   d_fta    0.031877
7    o_3pm    0.027198
11   o_reb    0.023043
31   d_blk    0.022121
48     min    0.013826
3    o_fgm    0.000000
4    o_fga    