In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# First step: read the CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer='df_all_rs.csv')
df

In [None]:
# Remove the 'Unnamed: 0' and 'index' columns in a single call (presumably
# leftover index columns from an earlier export -- confirm against the CSV).
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

In [None]:
# Pairwise correlation of the DataFrame's columns, drawn as an annotated
# heatmap on a wide figure with a diverging palette centred on zero.
df_corr = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(18, 6))
sns.heatmap(data=df_corr, annot=True, cmap='BrBG', center=0, ax=corr_ax)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve

In [None]:
# Column to predict, and the feature columns fed to every model below.
target = df['HOME_TEAM_WINS']
feature_columns = ['FG_PCT_home', 'FG_PCT_away', 'FG3_PCT_home', 'FG3_PCT_away']
data = df[feature_columns]

In [None]:
# A notebook cell only displays its last expression, so show both shapes as a
# single tuple instead of silently discarding `data.shape`.
data.shape, target.shape

In [None]:
def evaluate(estimator, data, target, random_state=42):
    """Fit a classifier on a train/test split and report hold-out metrics.

    The estimator is fitted on the training part of the split; accuracy,
    recall, precision, F1 and ROC AUC are printed for the test part and the
    ROC curve is plotted next to the chance diagonal.

    Parameters
    ----------
    estimator : object
        Must provide ``fit``, ``predict`` and ``predict_proba``.
    data : feature matrix accepted by ``train_test_split``.
    target : labels aligned with ``data``. Expected to be binary: the ROC and
        AUC are computed from the probability of the second class.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that repeated runs produce
        the same split (and therefore the same numbers and plot). Pass
        ``None`` for a different random split on every call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)
    # Second column of predict_proba: probability of the positive class.
    probs = estimator.predict_proba(X_test)[:, 1]

    # Plot the ROC curve together with the chance diagonal.
    fpr, tpr, _ = roc_curve(y_test, probs)
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], c='grey')
    ax.set(
        xlabel='False positive rate',
        ylabel='True positive rate',
        title='ROC curve',
    )

    # Adjacent f-strings reproduce the original one-line summary exactly,
    # without relying on backslash continuations inside a string literal.
    print(
        f'acc: {accuracy_score(y_test, y_hat):.3} '
        f'    recall: {recall_score(y_test, y_hat):.3} '
        f'    precision:{precision_score(y_test, y_hat):.3} '
        f'    f1:{f1_score(y_test, y_hat):.3} '
        f'    auc:{roc_auc_score(y_test, probs):.3}'
    )


logR = LogisticRegression()
evaluate(logR, data, target)

In [None]:
# One row per evaluated model, filled in by evaluar_metricas().
metrics_results = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

def evaluar_metricas(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``metrics_results`` table under the row label ``name``
    (an existing row with the same label is overwritten).

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``metrics_results``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. With a fixed seed every model
        is scored on the same split, so rows of ``metrics_results`` are
        comparable with each other; previously each call drew a fresh random
        split and differences between rows included split noise. Pass
        ``None`` to get the old behaviour.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    metrics_results.loc[name] = [mse, rmse, mae]

In [None]:
# One default-configured instance of each candidate model.
lr = LinearRegression()
kn = KNeighborsRegressor()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
mlpr = MLPRegressor()
logR = LogisticRegression()

# Score every model on the raw features; each call stores one row in
# metrics_results under the given label.
baseline_models = {'lr': lr, 'kn': kn, 'dt': dt, 'rf': rf, 'mlpr': mlpr, 'logR': logR}
for model_name, model in baseline_models.items():
    evaluar_metricas(model, data, target, model_name)

# Show the table with the smallest value of each metric column highlighted.
metrics_results.style.highlight_min(color='lightgreen', axis=0)

In [None]:
# A priori parece que los algoritmos que menos error dan son mlpr y logR.
# Esto es solo una primera prueba: ni siquiera tenemos los datos normalizados,
# así que lo primero que vamos a hacer es normalizarlos.

In [None]:
# Standardize each feature column: subtract its mean and divide by its
# standard deviation, both computed over the whole dataset.
data_copy = data.copy()
column_means = data_copy.mean()
column_stds = data_copy.std()
normalized_data = data_copy.sub(column_means).div(column_stds)

In [None]:
# Evaluate the same models again, this time on the normalized features; each
# result is stored under the model's label plus a '_normalized' suffix.
for model_name, model in (
    ('lr', lr),
    ('kn', kn),
    ('dt', dt),
    ('rf', rf),
    ('mlpr', mlpr),
    ('logR', logR),
):
    evaluar_metricas(model, normalized_data, target, model_name + '_normalized')

metrics_results

In [None]:
# Render the metrics table with the lowest value of each column highlighted.
styled_metrics = metrics_results.style.highlight_min(axis=0, color='lightgreen')
styled_metrics

In [None]:
# Con los datos normalizados vemos que el algoritmo mlpr sigue siendo el que menos error tiene.
# Antes de decidirnos del todo por un algoritmo, vamos a "toquetear" un poco cada uno:
# jugar con sus parámetros y demás.

In [None]:
# Vamos a empezar con KNeighbors.
# Para ello modificamos un poco la función que teníamos antes para evaluar, de modo que nos guarde los resultados
# en un dataframe distinto, uno solo para kn.

# kn

In [None]:
# One row per evaluated KNeighbors configuration.
kn_metrics = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

# Set to True to re-run the n_neighbors sweep below (49 model fits).
RUN_KN_SWEEP = False


def evaluar_metricas_kn(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``kn_metrics`` table under the row label ``name``.

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``kn_metrics``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every configuration is
        scored on the same split and the rows are comparable. Pass ``None``
        for a fresh random split on each call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    kn_metrics.loc[name] = [mse, rmse, mae]


# NOTE: the sweep used to be "disabled" by commenting out only its `for` line.
# The still-indented loop body was then parsed as part of evaluar_metricas_kn,
# so any call either hit an undefined `i` or recursed into itself. It is now a
# real top-level loop behind an explicit switch.
if RUN_KN_SWEEP:
    for i in range(1, 50):
        kn = KNeighborsRegressor(n_neighbors=i)
        evaluar_metricas_kn(kn, normalized_data, target, 'kn_normalized_' + str(i))

# Cell output: the sweep table with the best value per metric highlighted.
# Evaluates to None (no output) while the sweep is switched off.
kn_metrics.style.highlight_min(color='lightgreen', axis=0) if RUN_KN_SWEEP else None

### Conclusión kn:
#### Mejor n_neighbors según MSE y RMSE: 29
#### Mejor n_neighbors según MAE: 2

# dt

In [None]:
# One row per evaluated decision-tree configuration.
dt_metrics = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

# Set to True to re-run the max_depth x min_samples_leaf sweep below (441 fits).
RUN_DT_SWEEP = False


def evaluar_metricas_dt(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``dt_metrics`` table under the row label ``name``.

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``dt_metrics``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every configuration is
        scored on the same split and the rows are comparable. Pass ``None``
        for a fresh random split on each call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    dt_metrics.loc[name] = [mse, rmse, mae]


# NOTE: the sweep used to be "disabled" by commenting out only the outer `for`
# line. The still-indented inner loop was then parsed as part of
# evaluar_metricas_dt, so any call failed on the unbound loop variable `i`.
# It is now a real top-level loop behind an explicit switch. The manual
# `i += 1` / `j += 1` were dropped: `for` already advances both variables.
if RUN_DT_SWEEP:
    for i in range(1, 50):
        for j in range(1, 10):
            dt = DecisionTreeRegressor(max_depth=i, min_samples_leaf=j)
            # j is always a single digit, so the label's last character is
            # min_samples_leaf and the digits before it are max_depth.
            evaluar_metricas_dt(dt, normalized_data, target, 'dt_normalized_' + str(i) + str(j))

# Cell output: the sweep table with the best value per metric highlighted.
# Evaluates to None (no output) while the sweep is switched off.
dt_metrics.style.highlight_min(color='lightgreen', axis=0) if RUN_DT_SWEEP else None

### Conclusión dt:
#### Mejor según MSE y RMSE: max_depth=6 y min_samples_leaf=2
#### Mejor según MAE: max_depth=25 y min_samples_leaf=3

# rf

In [None]:
# One row per evaluated random-forest configuration.
rf_metrics = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

# Set to True to re-run the sweep below. It fits 29 * 49 * 9 = 12,789 forests,
# so expect it to take a long time.
RUN_RF_SWEEP = False


def evaluar_metricas_rf(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``rf_metrics`` table under the row label ``name``.

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``rf_metrics``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every configuration is
        scored on the same split and the rows are comparable. Pass ``None``
        for a fresh random split on each call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    rf_metrics.loc[name] = [mse, rmse, mae]


# NOTE: the sweep used to be "disabled" by commenting out only the outer `for`
# line. The still-indented inner loops were then parsed as part of
# evaluar_metricas_rf, so any call failed on the unbound loop variable `k`.
# It is now a real top-level loop behind an explicit switch. The manual
# `k += 1` / `i += 1` / `j += 1` were dropped: `for` already advances them.
if RUN_RF_SWEEP:
    for k in range(1, 30):
        for i in range(1, 50):
            for j in range(1, 10):
                # Seed the forest as well so a re-run reproduces the table.
                rf = RandomForestRegressor(
                    n_estimators=k, max_depth=i, min_samples_leaf=j, random_state=42
                )
                # Underscores keep the label unambiguous. The previous bare
                # concatenation str(k)+str(i)+str(j) gave the same label to,
                # e.g., (k=1, i=11) and (k=11, i=1), so one row silently
                # overwrote the other.
                evaluar_metricas_rf(
                    rf, normalized_data, target, f'rf_normalized_{k}_{i}_{j}'
                )

# Cell output: the sweep table with the best value per metric highlighted.
# Evaluates to None (no output) while the sweep is switched off.
rf_metrics.style.highlight_min(color='lightgreen', axis=0) if RUN_RF_SWEEP else None

### Conclusión rf:
#### Mínimo cuando: rf_normalized_16417 -> n_estimators=16, max_depth=41, min_samples_leaf=7

# logR

In [None]:
# One row per evaluated logistic-regression configuration.
logR_metrics = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

# Set to True to re-run the regularization sweep below (7 model fits).
RUN_LOGR_SWEEP = False


def evaluar_metricas_logR(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``logR_metrics`` table under the row label ``name``.

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``logR_metrics``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every configuration is
        scored on the same split and the rows are comparable. Pass ``None``
        for a fresh random split on each call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    logR_metrics.loc[name] = [mse, rmse, mae]


# NOTE: the sweep used to be "disabled" by commenting out only the outer `for`
# line. The still-indented inner loops were then parsed as part of
# evaluar_metricas_logR, so any call failed on the unbound loop variable `k`.
#
# That loop also never passed k, i or j to the estimator: it would have
# re-fitted an identical default LogisticRegression 12,789 times. It is
# replaced by a sweep over C (inverse regularization strength). The grid is a
# starting point -- adjust as needed.
if RUN_LOGR_SWEEP:
    for c_value in (0.001, 0.01, 0.1, 1, 10, 100, 1000):
        logR = LogisticRegression(C=c_value)
        evaluar_metricas_logR(
            logR, normalized_data, target, 'logR_normalized_C' + str(c_value)
        )

# Cell output: the sweep table with the best value per metric highlighted.
# Evaluates to None (no output) while the sweep is switched off.
logR_metrics.style.highlight_min(color='lightgreen', axis=0) if RUN_LOGR_SWEEP else None

### Conclusión logR:
#### minimo cuando:

# mlpr

In [None]:
# One row per evaluated MLP configuration.
mlpr_metrics = pd.DataFrame(columns=['MSE','RMSE', "MAE"])

# Set to True to re-run the architecture/regularization sweep below (12 fits).
RUN_MLPR_SWEEP = False


def evaluar_metricas_mlpr(estimator, data, target, name, random_state=42):
    """Fit ``estimator`` on a train/test split and record its hold-out errors.

    The MSE, RMSE and MAE of the test-set predictions are written to the
    module-level ``mlpr_metrics`` table under the row label ``name``.

    Parameters
    ----------
    estimator : object providing ``fit`` and ``predict``.
    data : feature matrix accepted by ``train_test_split``.
    target : values aligned with ``data``.
    name : row label used in ``mlpr_metrics``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every configuration is
        scored on the same split and the rows are comparable. Pass ``None``
        for a fresh random split on each call.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=random_state
    )

    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    errors = y_test - y_hat
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)

    mlpr_metrics.loc[name] = [mse, rmse, mae]


# NOTE: the sweep used to be "disabled" by commenting out only the outer `for`
# line. The still-indented inner loops were then parsed as part of
# evaluar_metricas_mlpr, so any call failed on the unbound loop variable `k`.
#
# That loop also never passed k, i or j to the estimator: it would have
# re-fitted an identical default MLPRegressor 12,789 times. It is replaced by
# a small sweep over the hidden-layer layout and the L2 penalty `alpha`. The
# grid is a starting point -- adjust as needed.
if RUN_MLPR_SWEEP:
    for hidden_layers in ((10,), (50,), (100,), (50, 50)):
        for alpha in (0.0001, 0.001, 0.01):
            # Seed the network so a re-run reproduces the table.
            mlpr = MLPRegressor(
                hidden_layer_sizes=hidden_layers, alpha=alpha, random_state=42
            )
            layout = 'x'.join(str(units) for units in hidden_layers)
            evaluar_metricas_mlpr(
                mlpr, normalized_data, target,
                'mlpr_normalized_' + layout + '_alpha' + str(alpha),
            )

# Cell output: the sweep table with the best value per metric highlighted.
# Evaluates to None (no output) while the sweep is switched off.
mlpr_metrics.style.highlight_min(color='lightgreen', axis=0) if RUN_MLPR_SWEEP else None



In [None]:
### Conclusión mlpr:
#### minimo cuando: