# In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor # for building the model

# Load the raw results table and its pre-normalized counterpart.
PL = pd.read_csv('results_final.csv')
PLnormalized = pd.read_csv('results_final_normalized.csv')

# Hand-picked column subset of the raw table; it keeps the target
# ("fs_result") and the two grouping keys ("season", "team").
extra_columns = [
    "total_red_card", "att_hd_goal", "att_pen_goal", "att_freekick_goal",
    "goal_fastbreak", "last_man_tackle", "own_goals", "pen_goals_conceded",
    "clearance_off_line", "penalty_save", "punches", "Income", "Balance",
    "fs_result", "season", "team",
]
extra = PL[extra_columns]

# Index every table by (season, team), summing the remaining columns
# within each group.
group_keys = ["season", "team"]
PL = PL.groupby(group_keys).sum()
PLnormalized = PLnormalized.groupby(group_keys).sum()
extra = extra.groupby(group_keys).sum()

# Choose either PL, PLnormalized, or extra here; features and target are
# both taken from this one table.
dataset = PL
y = dataset['fs_result']
X = dataset.drop(columns='fs_result')

# Fixed seed so every model below is evaluated on the same 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

#decision tree
from tqdm import tqdm

# Depths to sweep; the training loop and the plot x-axis share this range.
tree_depths = range(1, 20)

# Per-depth mean / std over the repeated fits. The names say "acc", but the
# stored values are mean absolute errors (lower is better).
train_acc_mean, train_acc_std = [], []
test_acc_mean, test_acc_std = [], []

for d in tqdm(tree_depths):
    train_maes, test_maes = [], []
    for seed in range(20):
        # A fresh tree per seed; the seed is the only thing that varies
        # between the 20 fits at a given depth.
        modelDT = DecisionTreeRegressor(max_depth=d, random_state=seed)
        modelDT = modelDT.fit(X_train, y_train)

        y_pred_train = modelDT.predict(X_train)
        train_maes.append(mae(y_train, y_pred_train))

        y_pred = modelDT.predict(X_test)
        test_maes.append(mae(y_test, y_pred))

    # Summarise the 20 runs for this depth.
    train_acc_mean.append(np.mean(train_maes))
    train_acc_std.append(np.std(train_maes))
    test_acc_mean.append(np.mean(test_maes))
    test_acc_std.append(np.std(test_maes))

# One error-bar curve per split: mean MAE with the std over seeds as the bar.
depth_axis = np.arange(tree_depths.start, tree_depths.stop)
curves = (
    (train_acc_mean, train_acc_std, "Train set"),
    (test_acc_mean, test_acc_std, "Test set"),
)
for curve_mean, curve_std, curve_label in curves:
    plt.errorbar(x=depth_axis, y=curve_mean, yerr=curve_std, label=curve_label)

plt.title("Decision tree model selection")
plt.xlabel("Max depth of tree")
plt.ylabel("MAE")
plt.legend()
plt.grid()
plt.show()



#random forest:
# Hyper-parameter grid: forest size (rows) x maximum tree depth (columns).
# The grid is defined first so the result arrays and the heatmap tick labels
# are derived from it rather than from separately hard-coded sizes.
n_trees = np.arange(10,101,10)
depths = np.arange(1,13,1)

# Mean / std of the MAE over the repeated fits, one cell per grid point.
# The names say "acc", but the stored values are mean absolute errors.
grid_shape = (len(n_trees), len(depths))
train_acc_mean, train_acc_std = np.zeros(grid_shape), np.zeros(grid_shape)
test_acc_mean, test_acc_std = np.zeros(grid_shape), np.zeros(grid_shape)

for tree_idx, n_trees1 in tqdm(enumerate(n_trees), total=len(n_trees)):
    for d_idx, d in enumerate(depths):
        train_perfs = []
        test_perfs = []
        for n in range(10):  # 10 differently seeded forests per grid point
            modelRFC = RandomForestRegressor(n_estimators=n_trees1, max_depth=d, random_state=n)
            modelRFC = modelRFC.fit(X_train, y_train) # train model
            y_pred_train = modelRFC.predict(X_train)
            y_pred = modelRFC.predict(X_test)
            train_perfs.append(mae(y_train, y_pred_train)) # store interim values
            test_perfs.append(mae(y_test, y_pred)) # store interim values
        train_acc_mean[tree_idx, d_idx] = np.mean(train_perfs)
        train_acc_std[tree_idx, d_idx] = np.std(train_perfs)
        test_acc_mean[tree_idx, d_idx] = np.mean(test_perfs)
        test_acc_std[tree_idx, d_idx] = np.std(test_perfs)

# A heatmap cell with index i spans [i, i + 1] on its axis, so tick labels
# must sit at i + 0.5 to be centred on the cell they describe.
row_centres = np.arange(len(n_trees)) + 0.5
col_centres = np.arange(len(depths)) + 0.5

# Figure 1
plt.figure()
sns.heatmap(train_acc_mean, annot=True) # create seaborn heatmap with annotations
plt.ylabel("Number of trees")
plt.xlabel("Max depth of trees")
plt.yticks(ticks=row_centres, labels=n_trees)
plt.xticks(col_centres, depths)
plt.title("Random Forest, Train mean absolute errors")
plt.show()

# Figure 2
plt.figure()
sns.heatmap(test_acc_mean, annot=True)
plt.ylabel("Number of trees")
plt.xlabel("Max depth of trees")
plt.yticks(row_centres, n_trees)
plt.xticks(col_centres, depths)
plt.title("Random Forest, Test mean absolute errors")
plt.show()


#svm
# The SVR is fit on the normalized table, but the MAE should be reported in
# points rather than in normalized units, so keep the mean and standard
# deviation of the un-normalized target to map values back afterwards.
# NOTE(review): this assumes the normalized fs_result was produced as
# (value - mean) / std of this same column, using the population std
# (numpy's default, ddof=0) -- confirm against the normalization step.
target_points = PL["fs_result"].values
meanV = target_points.mean()
stdevV = target_points.std()

#create a SVR model
# `degree` only affects the polynomial kernel and is ignored by 'rbf'; it is
# left over from testing several polynomial degrees.
svr = SVR(kernel='rbf', degree=7, C=0.85)


X = PLnormalized.drop('fs_result', axis=1)
y = PLnormalized['fs_result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# train the model on the data
svr.fit(X_train, y_train)

# Predict in normalized units, then map targets and predictions back to the
# original scale.
y_pred = svr.predict(X_train)
y_pred_t = svr.predict(X_test)
y_train = y_train * stdevV + meanV
y_test = y_test * stdevV + meanV
y_pred = y_pred * stdevV + meanV
y_pred_t = y_pred_t * stdevV + meanV

# Overall value range across true targets and predictions, used to give both
# axes identical limits so the diagonal marks a perfect prediction.
all_values = np.concatenate([y_train, y_test, y_pred, y_pred_t])
max_y = all_values.max()
min_y = all_values.min()

# Set the limits of both axes to the same range
plt.xlim(min_y, max_y)
plt.ylim(min_y, max_y)

# Scatter plots
plt.scatter(y_train, y_pred, color='darkorange', label='train')
plt.scatter(y_test, y_pred_t, color='blue', label='test')
plt.ylabel("y prediction")
plt.xlabel("real y")
# Add diagonal line
plt.plot([min_y, max_y], [min_y, max_y], color='grey', linestyle='--', label='diagonal')
plt.legend()
plt.show()
#print train and test MAE respectively
print(mae(y_train, y_pred))
print(mae(y_test, y_pred_t))