In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training set and discard its "Game" column right away.
train = pd.read_csv("/kaggle/input/task-03/train_full.csv", sep = ",").drop(columns = "Game")

In [None]:
# Load the unlabeled test set; its "Game" column is moved out of the frame into `game`.
test = pd.read_csv("/kaggle/input/task-03/test_without_label.csv")
game = test.pop("Game")

# Pré-processamento

In [None]:
# Strip the stray whitespace around the column names of both frames.
train.columns = train.columns.map(lambda nome: nome.strip())
test.columns = test.columns.map(lambda nome: nome.strip())

In [None]:
def getData(df):
    """
    Extract the weekday and the month of each game from the "Data" column.

    "DiaSemana" receives the first space-separated token of "Data" with
    commas stripped; "Mes" receives the second token, with English month
    names replaced by their number. Months 8, 9 and 10 are then remapped
    to 4, 5 and 6. The "Data" column is dropped afterwards.

    The frame is modified in place and also returned.
    """
    month_numbers = {"January":1, "February":2, "March":3, "April":4, "May":5, "June":6, "July":7,
                     "August":8, "September":9, "October":10, "November":11, "December":12}
    # The 2020 playoffs were delayed, so their months are folded onto the usual ones.
    delayed_playoffs = {8:4, 9:5, 10:6}

    pieces = df["Data"].map(lambda text: text.split(" "))

    df["DiaSemana"] = pieces.map(lambda parts: parts[0].strip(","))
    df["Mes"] = pieces.map(lambda parts: parts[1]).replace(month_numbers).replace(delayed_playoffs)

    df.drop(columns = ["Data"], inplace = True)
    return df

In [None]:
def arrumaTime(df):
    """
    Replace former franchise names with the current ones.

    Applies the same renaming to the "H_Team" and "A_Team" columns.
    The frame is modified in place and also returned.
    """
    current_names = {"New Jersey Nets": "Brooklyn Nets",
                     "New Orleans Hornets": "New Orleans Pelicans",
                     "Charlotte Bobcats": "Charlotte Hornets"}

    for side in ("H_Team", "A_Team"):
        df[side] = df[side].replace(current_names)

    return df

In [None]:
def getName(x, divisions):
    """
    Return the key of the first entry of `divisions` whose value contains `x`.

    `divisions` maps a name to a collection of members. Returns None when
    no collection contains `x`.
    """
    return next((nome for nome in divisions if x in divisions[nome]), None)

def getDivisao(df):
    """
    Add the division each team plays in.

    Creates "H_Division" and "A_Division" by looking up "H_Team" and
    "A_Team" (full team names) in the division table below. A team that is
    not listed gets None. The frame is modified in place and also returned.
    """
    divisions = {
        # Eastern conference
        "Atlantic": ["Toronto Raptors", "Boston Celtics", "New York Knicks","Brooklyn Nets", "Philadelphia 76ers"],
        "Central": ["Cleveland Cavaliers", "Indiana Pacers", "Detroit Pistons", "Chicago Bulls", "Milwaukee Bucks"],
        "Southeast": ["Miami Heat", "Atlanta Hawks", "Charlotte Hornets", "Washington Wizards", "Orlando Magic"],
        # Western conference
        "Northwest": ["Oklahoma City Thunder", "Portland Trail Blazers", "Utah Jazz", "Denver Nuggets", "Minnesota Timberwolves"],
        "Pacific": ["Golden State Warriors", "Los Angeles Clippers", "Sacramento Kings", "Phoenix Suns", "Los Angeles Lakers"],
        "Southwest": ["San Antonio Spurs", "Dallas Mavericks", "Memphis Grizzlies", "Houston Rockets", "New Orleans Pelicans"],
    }

    for team_col, division_col in (("H_Team", "H_Division"), ("A_Team", "A_Division")):
        df[division_col] = df[team_col].apply(getName, args = (divisions, ))

    return df

In [None]:
def colocaDados(linha, base, nome_base):
    """
    Attach one per-team, per-year statistic to a game row.

    `linha` is a row holding "H_Team", "A_Team" and "Year". `base` is a
    table with a "Time" column (team key) and one column per year, named by
    the year as a string. The looked-up values are stored in
    "H_" + nome_base and "A_" + nome_base.

    Returns the same row object with the two entries added; `base` itself
    is not modified.
    """
    casa, fora, ano = linha["H_Team"], linha["A_Team"], linha["Year"]

    por_time = base.set_index("Time")
    coluna_ano = str(ano)

    linha["H_"+nome_base] = por_time.loc[casa, coluna_ano]
    linha["A_"+nome_base] = por_time.loc[fora, coluna_ano]

    return linha

In [None]:
def getMoreData(df):
    """
    Enrich each game with four external per-team, per-year tables.

    According to the original notes, the tables hold: the distance each team
    travelled during the regular season, the team's regular-season rank, the
    team's spending on player salaries, and the team's draft grade.

    "H_Team" and "A_Team" are first reduced to their last word (team
    nickname without the city), in place on `df`. Each table then adds an
    "H_<name>" and an "A_<name>" column through `colocaDados`.

    Returns the enriched frame produced by the row-wise applies.
    """
    # Draft grades are coded as A - 10; B - 8; C - 6; D - 4; E - 2; F - 0;
    # I (did not draft) - 5. For example, Zion (who played at Duke in
    # 2018/2019), picked by NOP in the 2019 draft, was graded A.
    fontes = (
        ("DistTravelled", "/kaggle/input/dadosnba/distance.csv"),
        ("Rank", "/kaggle/input/dadosnba/rank.csv"),
        ("Salary", "/kaggle/input/dadosnba/salario.csv"),
        ("Draft", "/kaggle/input/dadosnba/draft.csv"),
    )
    tabelas = [(nome, pd.read_csv(caminho, sep = ";")) for nome, caminho in fontes]

    # Keep only the team nickname, not the city.
    for lado in ("H_Team", "A_Team"):
        df[lado] = df[lado].apply(lambda completo: completo.split()[-1])

    for nome, tabela in tabelas:
        df = df.apply(colocaDados, axis = 1, args = (tabela, nome,))

    return df

In [None]:
def _contaTDeDD(df, colunas):
    """
    Count "triple-double" and "double-double" rounds per row for one stat group.

    The values of `colunas` are divided by 10. While at least three of them
    exceed 1, the triple-double count grows by one and 1 is subtracted from
    every value of that row. Starting from the values left over, the same is
    repeated with a threshold of two for the double-double count.

    Returns a pair of integer arrays (td, dd), one entry per row of `df`.
    """
    valores = df[colunas].to_numpy(dtype = float) / 10

    def _rodadas(minimo):
        # Rows stay active while at least `minimo` of their values are above 1.
        contagem = np.zeros(len(valores), dtype = int)
        ativos = (valores > 1).sum(axis = 1) >= minimo
        while ativos.any():
            contagem[ativos] += 1
            valores[ativos] -= 1
            ativos = (valores > 1).sum(axis = 1) >= minimo
        return contagem

    td = _rodadas(3)
    dd = _rodadas(2)  # continues from the values already reduced by the TD rounds
    return td, dd


def triploDuplo(df):
    """
    Add triple-double / double-double difference features.

    For four stat groups (home team, home team's opponents, away team, away
    team's opponents) the points, rebounds, assists, blocks and turnovers
    columns are turned into a triple-double (TD) and a double-double (DD)
    count by `_contaTDeDD`. The new columns are:

    - "TDDif", "DDDif": home count minus away count;
    - "TDOppDif", "DDOppDif": home-opponent count minus away-opponent count.

    The input frame is not modified. The returned copy keeps the index and
    the dtypes of the existing columns; the four new columns are integers
    and are appended in the order listed above.
    """
    h_td, h_dd = _contaTDeDD(df, ["H_AvgPointsPerGame", "H_TRB", "H_AST", "H_BLK", "H_TOV"])
    h_otd, h_odd = _contaTDeDD(df, ["H_AvgPointsPerGameOpp", "H_OTRB", "H_OAST", "H_OBLK", "H_OTOV"])
    a_td, a_dd = _contaTDeDD(df, ["A_AvgPointsPerGame", "A_TRB", "A_AST", "A_BLK", "A_TOV"])
    a_otd, a_odd = _contaTDeDD(df, ["A_AvgPointsPerGameOpp", "A_OTRB", "A_OAST", "A_OBLK", "A_OTOV"])

    df_aux = df.copy()

    df_aux["TDDif"] = h_td - a_td
    df_aux["DDDif"] = h_dd - a_dd

    df_aux["TDOppDif"] = h_otd - a_otd
    df_aux["DDOppDif"] = h_odd - a_odd

    return df_aux

In [None]:
def dificuldadeVitoria(df):
    """
    Build a "quality of victory" (QOV) score for the home and the away team.

    "H_MOV", "A_MOV", "H_SOS" and "A_SOS" are each min-max scaled to [0, 1]
    using that column's own minimum and maximum. QOV is then the weighted
    harmonic mean 3 / (2/SOS + 1/MOV), stored in "H_QOV" and "A_QOV".

    Note from the original author: the mean is weighted because a narrow win
    over a strong team should count more than a large win over a weak team,
    and the harmonic mean dampens the effect of large values.

    The four source columns are dropped. The frame is modified in place and
    also returned.
    """
    fontes = ["H_MOV", "A_MOV", "H_SOS", "A_SOS"]

    def _escala01(serie):
        # Min-max scaling with the column's own extremes.
        menor = serie.min()
        return (serie - menor) / (serie.max() - menor)

    for coluna in fontes:
        df[coluna] = _escala01(df[coluna])

    # SOS carries weight 2 and MOV weight 1 in the harmonic mean.
    for qov, sos, mov in (("H_QOV", "H_SOS", "H_MOV"), ("A_QOV", "A_SOS", "A_MOV")):
        df[qov] = 3 / (2/df[sos]  +  1/df[mov])

    df.drop(columns = fontes, inplace = True)

    return df

In [None]:
def comparaRivais(df):
    """
    Replace paired home/away statistics by home-vs-away difference columns.

    Covers ratings, pace, win/loss balance, number of games, average points
    and shooting accuracy, both for the teams and for their opponents.
    The source columns (plus the total-points columns) are dropped.
    The frame is modified in place and also returned.
    """
    # Ratings go through a `31 - value` transform on both sides before the
    # subtraction, which is algebraically away minus home.
    # NOTE(review): the 31 looks like it was meant for 1-30 rankings; confirm.
    for nova, casa, fora in (("SRSDif", "H_SRS", "A_SRS"),
                             ("OrtgDif", "H_Ortg", "A_Ortg"),
                             ("DrtgDif", "H_Drtg", "A_Drtg")):
        df[nova] = (31 - df[casa]) - (31 - df[fora])

    # Pace of play
    df["PaceDif"] = df["H_Pace"] - df["A_Pace"]

    # Win/loss balance and number of games
    saldo_casa = df["H_Wins"] - df["H_Loss"]
    saldo_fora = df["A_Wins"] - df["A_Loss"]
    df["SaldoDif"] = saldo_casa - saldo_fora
    df["GamesDif"] = (df["H_Games"] - df["A_Games"])

    # Average points and shooting accuracy: plain home minus away.
    diferencas_diretas = (
        ("AvgPointsMadeDif", "H_AvgPointsPerGame", "A_AvgPointsPerGame"),
        ("AvgPointsOppMadeDif", "H_AvgPointsPerGameOpp", "A_AvgPointsPerGameOpp"),
        ("FG%Dif", "H_FG%", "A_FG%"),
        ("FG%OppDif", "H_OFG%", "A_OFG%"),
        ("FT%Dif", "H_FT%", "A_FT%"),
        ("FT%OppDif", "H_OFT%", "A_OFT%"),
        ("3P%Dif", "H_3P%", "A_3P%"),
        ("3P%OppDif", "H_O3P%", "A_O3P%"),
        ("2P%Dif", "H_2P%", "A_2P%"),
        ("2P%OppDif", "H_O2P%", "A_O2P%"),
    )
    for nova, casa, fora in diferencas_diretas:
        df[nova] = df[casa] - df[fora]

    descartadas = ["H_SRS", "A_SRS", "H_Ortg", "A_Ortg", "H_Drtg", "A_Drtg", "H_Pace", "A_Pace",
                   "H_Wins", "H_Loss", "A_Wins", "A_Loss", "H_Games", "A_Games",
                   "H_TotalPoints", "A_TotalPoints", "H_PointsOpp", "A_PointsOpp",
                   "H_AvgPointsPerGame", "A_AvgPointsPerGame", "H_AvgPointsPerGameOpp", "A_AvgPointsPerGameOpp",
                   "H_FG%","A_FG%","H_OFG%","A_OFG%", "H_FT%", "A_FT%", "H_OFT%", "A_OFT%",
                   "H_3P%", "A_3P%", "H_O3P%", "A_O3P%", "H_2P%", "A_2P%", "H_O2P%", "A_O2P%"]
    df.drop(columns = descartadas, inplace = True)

    return df

In [None]:
def getRank(df, ranks=None):
    """
    Add "H_Rank" and "A_Rank" by looking up each team in a rank mapping.

    Parameters
    ----------
    df : DataFrame with "H_Team" and "A_Team" columns.
    ranks : optional mapping team -> rank. When omitted, a module-level
        mapping named `dic` is used; a NameError is raised if no such
        global exists.

    Teams missing from the mapping get a missing value. The frame is
    modified in place and also returned.
    """
    if ranks is None:
        # Legacy behaviour: rely on a global `dic` defined by the caller.
        ranks = dic

    for team_col, rank_col in (("H_Team", "H_Rank"), ("A_Team", "A_Rank")):
        df[rank_col] = df[team_col].apply(lambda nome: ranks.get(nome))

    return df

In [None]:
def converteTipo(df):
    """
    Convert the engineered and looked-up columns to numeric dtypes.

    Each listed column is passed through `pd.to_numeric`, one at a time.
    The frame is modified in place and also returned.
    """
    diferencas = ("TDDif", "DDDif", "TDOppDif", "DDOppDif", "SaldoDif", "GamesDif")
    originais = ("H_PW", "H_PL", "A_PW", "A_PL", "Year")
    externas = ("H_DistTravelled", "A_DistTravelled", "A_Rank", "H_Rank",
                "H_Salary", "A_Salary", "H_Draft", "A_Draft")

    for nome in diferencas + originais + externas:
        df[nome] = pd.to_numeric(df[nome])

    return df

In [None]:
def getAno(train, test):
    """
    Attach a "Year" column to both frames from hard-coded games-per-year counts.

    Years are assigned in consecutive blocks: 2006-2017 for `train` and
    2018-2019 for `test`, each block as long as the count listed below.
    Assumes the rows are already ordered by year (TODO confirm). The frames
    must have exactly 1006 and 165 rows respectively, otherwise the column
    assignment fails with a length mismatch.

    Both frames are modified in place and returned as a pair.
    """
    jogos_train = {2006: 89, 2007: 79, 2008: 86, 2009: 85, 2010: 82, 2011: 81,
                   2012: 84, 2013: 85, 2014: 89, 2015: 81, 2016: 86, 2017: 79}
    jogos_test = {2018: 82, 2019: 83}

    train["Year"] = np.repeat(list(jogos_train.keys()), list(jogos_train.values()))
    test["Year"] = np.repeat(list(jogos_test.keys()), list(jogos_test.values()))

    return train, test

In [None]:
train, test = getAno(train, test)

# The same feature-engineering steps, in the same order, for both frames.
# The training frame is processed completely before the test frame.
etapas = (getData, arrumaTime, getDivisao, getMoreData,
          triploDuplo, dificuldadeVitoria, comparaRivais, converteTipo)

for etapa in etapas:
    train = etapa(train)

for etapa in etapas:
    test = etapa(test)

# Análise Exploratória

In [None]:
# Binary label: 'W' becomes 1 and 'L' becomes 0.
target = train['WinOrLose'].replace({'W': 1, 'L': 0}).astype(int)

In [None]:
# Two side-by-side panels: home-team counts (left) and away-team counts (right).
fig, ax = plt.subplots(ncols=2, figsize=(14,6))

# Red bars: all games per home team; green bars drawn on top: games with target == 1.
sns.countplot(x="H_Team", data=train, color="red", order=train["H_Team"].unique(), ax=ax[0])
hteam_win = train.loc[target == 1, "H_Team"]
sns.countplot(x=hteam_win, data=train, color="green", order=train["H_Team"].unique(), ax=ax[0])

# Same overlay for away teams, using games with target == 0.
# The bar order reuses the order of the home-team column.
sns.countplot(x="A_Team", data=train, color="red", order=train["H_Team"].unique(), ax=ax[1])
ateam_win = train.loc[target == 0, 'A_Team']
sns.countplot(x=ateam_win, data=train, color="green", order=train["H_Team"].unique(), ax=ax[1])

ax[0].set_title("Distribuição de Times da Casa")
ax[1].set_title("Distribuição de Times Visitantes")
# Rotate the team names so they do not overlap.
plt.setp(ax[0].xaxis.get_majorticklabels(), rotation=90)
plt.setp(ax[1].xaxis.get_majorticklabels(), rotation=90)
plt.show()

In [None]:
import datetime

In [None]:
# Re-read the raw date column; its header in the file is 'Data ' (with a trailing space).
data = pd.read_csv("/kaggle/input/task-03/train_full.csv", sep = ",")['Data ']
# Abbreviate month names so they match the '%b' directive used below.
# NOTE(review): 'June' is replaced twice; the second call is a no-op. Confirm whether another month was intended.
data = data.replace('June', 'Jun', regex=True).replace('April', 'Apr', regex=True).replace('June', 'Jun', regex=True)
# Keep only the second field of each value split on ', '.
data = np.array(list(data.str.split(', ')))[:,1]
# Parse as "month day" and re-format as 'dd/mm' strings.
data = pd.to_datetime(data, format='%b %d').strftime('%d/%m')

In [None]:
fig = plt.figure(figsize=(8,5))
# Red bars: games per weekday; green bars drawn on top: games with target == 1.
sns.countplot(x="DiaSemana", data=train, color="red", order=['Sun','Mon','Tue','Wed','Thu','Fri','Sat'])
hteam_win = train.loc[target == 1, "DiaSemana"]
sns.countplot(x=hteam_win, data=train, color="green", order=['Sun','Mon','Tue','Wed','Thu','Fri','Sat'])

plt.title("Dias da semana e Taxas de Vitórias pela Casa")

plt.show()

In [None]:
teams = train['H_Team'].unique()

# Grid of 15 x 2 axes; each team gets one panel.
fig, ax = plt.subplots(ncols = 2, nrows=15, figsize=(20,30))

i = 0  # current grid row; advances after every two teams
for j in range(len(teams)):
    if(j%2 == 0 and j > 0):
        i += 1
    
    # Boolean masks of the games this team played at home / away.
    posh = train['H_Team'] == teams[j]
    posa = train['A_Team'] == teams[j]
    xh = data[posh]
    yh = target.loc[posh]
    
    xa = data[posa]
    ya = 1-target.loc[posa]  # flip the label for the team's away games
    
    # One marker per game: date on x, 0/1 outcome on y; home and away as separate series.
    ax[i,j%2].plot(xh, yh, 'o')
    ax[i,j%2].plot(xa, ya, 'o')
    plt.setp(ax[i,j%2].xaxis.get_majorticklabels(), rotation=90)
    ax[i,j%2].xaxis.set_tick_params(labelsize=10)
    ax[i,j%2].set_ylim(-0.2, 1.2)
    ax[i,j%2].set_title(teams[j])
  
# Remove the last axis of the grid.
# NOTE(review): assumes fewer than 30 teams (e.g. 29), so that this axis was never drawn on. Confirm.
fig.delaxes(ax[14,-1])
plt.subplots_adjust(hspace=2)

In [None]:
# Correlation matrix of train, drawn as a heatmap.
corr = train.corr()
# Hide the upper triangle (diagonal included), since the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Colour scale fixed to [-1, 1] and centred at 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1,vmin = -1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Modelagem (SEM FEATURE SELECTION)

In [None]:
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
# Label-encode every column of `test` whose dtype in `train` is object.
# The encoder is fitted on the training values only and then applied to both frames.
for col in test.columns.values:
    if train.loc[:,col].dtype == "object":
        lbl.fit(train.loc[:,col].astype(str))
        train.loc[:,col] = lbl.transform(train.loc[:,col].astype(str))
        
        # A "WinOrLose" column would be encoded on train only.
        if col == "WinOrLose":
            pass
        else:
            # NOTE(review): LabelEncoder.transform raises on values not seen during fit.
            # Confirm the test set has no category missing from train.
            test.loc[:,col] = lbl.transform(test.loc[:,col].astype(str))

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

In [None]:
# Work on a copy of train: the features are all columns except the label.
df = train.copy()

columns = df.columns.values
target = "WinOrLose"  # from here on `target` is the label column NAME (a string)
y_columns = [target]
x_columns = [x for x in columns if x != target]

X = df[x_columns]
y = df[y_columns]

# Standardize the features, then project them onto 30 principal components.
# NOTE(review): scaler and PCA are fitted before the split below, so the hold-out rows also influence them.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)
#test = scaler.transform(test)

from sklearn.decomposition import PCA
pca = PCA(n_components = 30)
X = pca.fit_transform(X)
#test = pca.transform(test)

# Hold out 5% of the rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 2)

## XGBoost

In [None]:
%%time

# Grid search for XGBoost over max_depth 2..9 (5 trees, learning rate 0.01),
# scored with balanced accuracy on 2-fold cross-validation; the best model is refit.
parameters = {'max_depth':np.arange(2, 10, 1), "n_estimators":[5], "learning_rate":[0.01]}

xgb_model = GridSearchCV(XGBClassifier(), parameters,
                    cv = 2, scoring = "balanced_accuracy", n_jobs = -1, verbose = 3,
                    refit = True)

#xgb_model.fit(X, y.to_numpy().ravel())
xgb_model.fit(X_train, y_train.to_numpy().ravel())

print("Melhor modelo: {}".format(xgb_model.best_estimator_))
print("Melhor score: {}".format(xgb_model.best_score_))

## CatBoost

In [None]:
%%time

# Single-point "grid" for CatBoost (early_stopping_rounds 10, learning rate 0.01),
# scored with balanced accuracy on 2-fold cross-validation; the best model is refit.
parameters = {'early_stopping_rounds':[10], "learning_rate":[0.01]}

cat_model = GridSearchCV(CatBoostClassifier(silent=True), parameters,
                    cv = 2, scoring = "balanced_accuracy", n_jobs = -1, verbose=3,
                    refit = True)

cat_model.fit(X_train, y_train.to_numpy().ravel())

print("Melhor modelo: {}".format(cat_model.best_estimator_))
print("Melhor score: {}".format(cat_model.best_score_))

# Feature Selection

In [None]:
from sklearn.model_selection import KFold

from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [None]:
# Rebuild the binary label as a Series: 'W' becomes 1 and 'L' becomes 0.
target = train['WinOrLose'].replace({'W': 1, 'L': 0}).astype(int)

In [None]:
# Candidate features for the correlation ranking: every column except the ones listed here.
excluidas = ['DiaSemana', 'H_Team', 'A_Team', 'H_Division', 'A_Division', 'WinOrLose']
x_columns = df.columns[~df.columns.isin(excluidas)]

df = train.loc[:,x_columns].copy()
y = target

In [None]:
names = df.columns
pearson = []
spearman = []

# Absolute Pearson and Spearman correlation of each candidate feature with the label.
for col in names:
    X = df.loc[:, col].to_numpy()
    pearson.append(abs(pearsonr(X,y)[0]))
    spearman.append(abs(spearmanr(X,y)[0]))
    
pearson = np.array(pearson)
spearman = np.array(spearman)

feature_scores = pd.DataFrame({"Column":names, "Pearson":pearson, "Spearman":spearman})

# Rank the features by |Spearman|, strongest first, and keep the top 50.
feature_scores.sort_values('Spearman', ascending=False, inplace=True)

best_features = list(feature_scores['Column'][:50])
feature_scores = feature_scores.iloc[:50].reset_index(drop=True).copy()

print("15 Features com o melhor score total")
feature_scores.head(15)

In [None]:
names = list(feature_scores['Column'])

votos = np.zeros(len(names))  # votes against each feature, in ranking order
tau = 0.85  # |Pearson| above which two features are treated as redundant

# For every strongly correlated pair, vote against the member with the lower
# Spearman score (on a tie, against the one that comes later in the ranking).
for i in range(0, len(names)-1):
    for j in range(i+1, len(names)):
        p = pearsonr(train.loc[:,names[i]], train.loc[:,names[j]])[0] 
        if(abs(p) > tau):
            if(feature_scores.loc[i,'Spearman'] >= feature_scores.loc[j,'Spearman']):
                votos[j] += 1
            else:
                votos[i] += 1

In [None]:
# Display the number of votes received by each feature.
votos

In [None]:
# Drop every column that received at least one vote.
best_features = list(feature_scores[votos == 0]['Column'])
print("Total de {} features".format(len(best_features)))
print(best_features)

In [None]:
df = train.copy()

# Final feature set: the five columns left out of the correlation ranking plus the selected features.
x_columns = ['DiaSemana','H_Team','A_Team','H_Division','A_Division'] + best_features

X = df[x_columns]

# Hold out 10% of the rows, with a fixed seed; the copies detach the splits from df.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 2)
X_train = X_train.copy()
X_test = X_test.copy()

In [None]:
# Sizes of the training and hold-out splits.
print(X_train.shape)
print(X_test.shape)

# Modelagem (com feature selection)

## Decision Tree

In [None]:
# Grid search for the decision tree: max_depth 5..29 and both split criteria,
# scored with accuracy on 10-fold cross-validation; the best model is refit.
params = {'max_depth':np.arange(5, 30, 1),
          'criterion': ['gini', 'entropy']}

dt_model = GridSearchCV(DecisionTreeClassifier(), params,
                        scoring = "accuracy", n_jobs = -1, cv=10, verbose=4, refit=True)
dt_model.fit(X_train, y_train)

# Predictions on both splits, compared in the cells below.
y_pred_train = dt_model.predict(X_train)
y_pred_test = dt_model.predict(X_test)

In [None]:
# Hyper-parameters and mean cross-validation score of the best decision tree.
print("Melhor modelo: {}".format(dt_model.best_estimator_.get_params()))
print("Melhor score: {}".format(dt_model.best_score_))

In [None]:
# Accuracy of the latest predictions on the training split and on the hold-out split.
print("Acurácia Treino:", accuracy_score(y_train, y_pred_train))
print("Acurácia Teste:", accuracy_score(y_test, y_pred_test))

## Random Forest

In [None]:
# Grid that was searched (kept for reference, not run):
# params = {'n_estimators':[200, 500, 700, 800],
#           'max_depth':np.arange(5, 10, 1)}

# rf_model = GridSearchCV(RandomForestClassifier(), params,
#                         scoring = "accuracy", n_jobs = -1, cv=10, verbose=4, refit=True)

rf_model = RandomForestClassifier(n_estimators=800, max_depth=5) # Best model found in one GridSearchCV run over `params`
rf_model.fit(X_train, y_train)

# Predictions on both splits, compared in the next cell.
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

In [None]:
# Accuracy of the latest predictions on the training split and on the hold-out split.
print("Acurácia Treino:", accuracy_score(y_train, y_pred_train))
print("Acurácia Teste:", accuracy_score(y_test, y_pred_test))

## CatBoost

In [None]:
# Grid that was searched (kept for reference, not run):
# params = {"learning_rate":[0.01, 0.05, 0.1, 0.03],
#           "n_estimators":[500, 700, 800]}

# cat_model = GridSearchCV(CatBoostClassifier(verbose=False), params,
#                     cv = 10, scoring = "accuracy", n_jobs=-1, verbose = 2,
#                     refit = True)

cat_model = CatBoostClassifier(learning_rate=0.01, n_estimators = 500, silent=True) # Best model found in one GridSearchCV run over `params`
cat_model.fit(X_train, y_train)

# Predictions on both splits, compared in the next cell.
y_pred_train = cat_model.predict(X_train)
y_pred_test = cat_model.predict(X_test)

In [None]:
# Accuracy of the latest predictions on the training split and on the hold-out split.
print("Acurácia Treino:", accuracy_score(y_train, y_pred_train))
print("Acurácia Teste:", accuracy_score(y_test, y_pred_test))

## XGBoost

In [None]:
# Grid that was searched (kept for reference, not run):
# params = {'n_estimators': [200, 500], 'max_depth': [2, 3], 'learning_rate': [0.01]}

# xgb_model = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), params,
#                     cv = 10, scoring = "accuracy", n_jobs = -1, verbose = 2,
#                     refit = True)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=200, max_depth=2, learning_rate=0.01) # Best model found in one GridSearchCV run over `params`
xgb_model.fit(X_train, y_train)

# The XGBoost version available on Kaggle emits noisy warnings; silence UserWarning before predicting.
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

In [None]:
# Accuracy of the latest predictions on the training split and on the hold-out split.
print("Acurácia Treino:", accuracy_score(y_train, y_pred_train))
print("Acurácia Teste:", accuracy_score(y_test, y_pred_test))

# Bootstrap

In [None]:
import time

from sklearn.utils import resample
from sklearn.model_selection import KFold

In [None]:
# Rebuild the 0/1 label ('W' -> 1, 'L' -> 0) and append it as the last column of the feature frame X.
target = train['WinOrLose'].replace('W', 1).replace('L', 0).astype(int)

df = pd.concat([X, target], axis=1).copy()
df.head(5)

In [None]:
k = 10 # Number of bootstrap rounds; in practice values between 750 and 1250 were used.

models = []   # winning model of each round, refit on the whole bootstrap sample
weights = []  # mean 10-fold accuracy of that winner, used later as its voting weight

total_start_time = time.time()
for i in range(k):
    print("Sample ",i+1,"... ", sep="", end="")

    # Stratified bootstrap sample with as many rows as the data.
    sample = resample(df, replace=True, n_samples=df.shape[0], stratify=df['WinOrLose'])
    y = sample['WinOrLose'].to_numpy()
    X = sample.drop(columns = 'WinOrLose')

    kf = KFold(n_splits = 10, shuffle = True)

    # Candidates, in the order used to break ties (np.argmax keeps the first maximum).
    # To restrict the competition (e.g. to DecisionTree and CatBoost), trim this list.
    candidate_models = [
        DecisionTreeClassifier(criterion='gini', max_depth=19),
        RandomForestClassifier(n_estimators=800, max_depth=5),
        CatBoostClassifier(learning_rate=0.01, n_estimators = 500, silent=True),
        XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=200, max_depth=2, learning_rate=0.01),
    ]
    fold_scores = [[] for _ in candidate_models]

    start_time = time.time()
    # The folds are drawn over the bootstrap sample itself, so the positional
    # indices cover every row of X and y.
    # NOTE(review): rows duplicated by the bootstrap can fall in both the training
    # and the validation part of a fold, which may inflate these accuracies.
    for train_index, test_index in kf.split(X):
        for model, scores in zip(candidate_models, fold_scores):
            model.fit(X.iloc[train_index], y[train_index])
            scores.append(accuracy_score(y[test_index], model.predict(X.iloc[test_index])))

    candidate_weights = [np.mean(scores) for scores in fold_scores]

    i_max = np.argmax(candidate_weights)

    weights.append(candidate_weights[i_max])

    # Refit the winner on the complete bootstrap sample before storing it.
    candidate_models[i_max].fit(X, y)

    elapsed_time = time.time() - start_time

    print(candidate_models[i_max], end=" ")
    print(elapsed_time, 's')

    models.append(candidate_models[i_max])

total_elapsed_time = time.time() - total_start_time
print("Total time:", total_elapsed_time)
print("Average time per model:", total_elapsed_time/k)

Neste caso, o modelo que apresentou melhor score em todas as 10 iterações foi o modelo DecisionTree... Os mais escolhidos na prática foram DecisionTree e CatBoost...

In [None]:
# Keep only the feature columns of df (every column except the last one), in the same order.
test = test.loc[:, df.columns[:-1]].copy()

prob_final = np.zeros(test.shape[0])

# Weighted average, over all stored models, of the second column of predict_proba.
for i in range(len(models)):
    prob_final += models[i].predict_proba(test)[:,1] * weights[i] # Probability of 'W'
prob_final /= np.sum(weights)
prob_final

In [None]:
# Turn the averaged probabilities into labels: 'W' when >= 0.5, 'L' when < 0.5.
y_pred_final = np.zeros(len(prob_final)).astype(str)
y_pred_final[np.where(prob_final >= 0.5)] = 'W'
y_pred_final[np.where(prob_final < 0.5)] = 'L'
y_pred_final

In [None]:
print("Proporção de vitórias preditas:", (y_pred_final == 'W').sum() / len(y_pred_final)) # Close to 35%... looks good