In [1]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

import pandas as pd
import numpy as np

import pickle
import json
import os
import warnings
import datetime

In [2]:
# Derive the model's feature columns from the header of the fold-0 normalized
# training set. nrows=0 makes pandas parse only the header row, so the whole
# dataset is not loaded (and then discarded) just to list its column names.
header_cols = pd.read_csv(
    '../../dataset_generators/datasets/0_train_normalized.csv', nrows=0
).columns

# Columns that must not be used as regressors: the target itself plus the
# non-feature columns listed below (identifiers / descriptive fields, and the
# 'Unnamed: 0' column -- presumably a saved index; confirm with the generator).
non_feature_cols = {
    'player_id',
    'player_name',
    'player_display_name',
    'position',
    'position_group',
    'headshot_url',
    'season',
    'week',
    'season_type',
    'team',
    'opponent_team',
    'fantasy_points',
    'Unnamed: 0',
}

# Target kept as a one-element list so that df[y] selects a single-column frame.
y = ['fantasy_points']
# Header order is preserved so the feature order matches the CSV layout.
X = [col for col in header_cols if col not in non_feature_cols]

In [3]:
# Derive the model's feature columns from the header of the fold-0 PCA
# training set. nrows=0 makes pandas parse only the header row, so the whole
# dataset is not loaded (and then discarded) just to list its column names.
header_cols_pca = pd.read_csv(
    '../../dataset_generators/datasets/0_train_pca.csv', nrows=0
).columns

# Columns that must not be used as regressors: the target itself plus the
# non-feature columns listed below (identifiers / descriptive fields, and the
# 'Unnamed: 0' column -- presumably a saved index; confirm with the generator).
non_feature_cols = {
    'player_id',
    'player_name',
    'player_display_name',
    'position',
    'position_group',
    'headshot_url',
    'season',
    'week',
    'season_type',
    'team',
    'opponent_team',
    'fantasy_points',
    'Unnamed: 0',
}

# Target kept as a one-element list so that df[y_pca] selects a single-column frame.
y_pca = ['fantasy_points']
# Header order is preserved so the feature order matches the CSV layout.
X_pca = [col for col in header_cols_pca if col not in non_feature_cols]

In [8]:
# Evaluate a plain LinearRegression on each of the 5 pre-generated normalized
# train/test folds, collecting MAE, RMSE and R2 per fold.
norm_lr_metrics = {'mae': [], 'rmse': [], 'r2': []}

# Metric key -> scoring function; every scorer is called as scorer(y_true, y_pred).
scorers = {
    'mae': mean_absolute_error,
    'rmse': root_mean_squared_error,
    'r2': r2_score,
}

for n in range(5):
    # Fold n is stored as a matching pair of train/test CSV files.
    train_dataset_path = f'../../dataset_generators/datasets/{n}_train_normalized.csv'
    test_dataset_path = f'../../dataset_generators/datasets/{n}_test_normalized.csv'
    train_df = pd.read_csv(train_dataset_path)
    test_df = pd.read_csv(test_dataset_path)

    # X / y are the column-name lists built earlier in the notebook.
    X_train, y_train = train_df[X], train_df[y]
    X_test, y_test = test_df[X], test_df[y]

    # A fresh model per fold so no fitted state carries over between folds.
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    for metric_name, scorer in scorers.items():
        norm_lr_metrics[metric_name].append(scorer(y_test, y_pred))

# Summarise across folds: '<metric>_avg' is the mean and '<metric>_std' is
# np.std with its default arguments. All averages are stored before any std.
for suffix, reducer in (('avg', np.mean), ('std', np.std)):
    for metric_name in ('mae', 'rmse', 'r2'):
        fold_scores = np.array(norm_lr_metrics[metric_name])
        norm_lr_metrics[f'{metric_name}_{suffix}'] = reducer(fold_scores)

In [9]:
# Evaluate a plain LinearRegression on each of the 5 pre-generated PCA
# train/test folds, collecting MAE, RMSE and R2 per fold.
pca_lr_metrics = {'mae': [], 'rmse': [], 'r2': []}

# Metric key -> scoring function; every scorer is called as scorer(y_true, y_pred).
scorers = {
    'mae': mean_absolute_error,
    'rmse': root_mean_squared_error,
    'r2': r2_score,
}

for n in range(5):
    # Fold n is stored as a matching pair of train/test CSV files.
    train_dataset_path = f'../../dataset_generators/datasets/{n}_train_pca.csv'
    test_dataset_path = f'../../dataset_generators/datasets/{n}_test_pca.csv'
    train_df = pd.read_csv(train_dataset_path)
    test_df = pd.read_csv(test_dataset_path)

    # X_pca / y_pca are the column-name lists built earlier in the notebook.
    X_train, y_train = train_df[X_pca], train_df[y_pca]
    X_test, y_test = test_df[X_pca], test_df[y_pca]

    # A fresh model per fold so no fitted state carries over between folds.
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    for metric_name, scorer in scorers.items():
        pca_lr_metrics[metric_name].append(scorer(y_test, y_pred))

# Summarise across folds: '<metric>_avg' is the mean and '<metric>_std' is
# np.std with its default arguments. All averages are stored before any std.
for suffix, reducer in (('avg', np.mean), ('std', np.std)):
    for metric_name in ('mae', 'rmse', 'r2'):
        fold_scores = np.array(pca_lr_metrics[metric_name])
        pca_lr_metrics[f'{metric_name}_{suffix}'] = reducer(fold_scores)

In [10]:
# Print the cross-fold averages followed by the cross-fold standard deviations,
# first for the normalized feature set and then for the PCA feature set.
for banner, metrics in (
    ('---NORMALIZED---', norm_lr_metrics),
    ('---PCA---', pca_lr_metrics),
):
    report_lines = [
        banner,
        f"MAE: {metrics['mae_avg']}",
        f"RMSE: {metrics['rmse_avg']}",
        f"R2: {metrics['r2_avg']}",
        f"MAE std: {metrics['mae_std']}",
        f"RMSE std: {metrics['rmse_std']}",
        f"R2 std: {metrics['r2_std']}",
    ]
    # One print per model; joining with '\n' yields the same text as separate prints.
    print('\n'.join(report_lines))


---NORMALIZED---
MAE: 3.873030699741642
RMSE: 5.380693337715541
R2: 0.40562443155245537
MAE std: 0.044284356922445714
RMSE std: 0.05843496151983345
R2 std: 0.00245723017181231
---PCA---
MAE: 3.905568402700351
RMSE: 5.413844157568112
R2: 0.39827150182828175
MAE std: 0.044321118957483994
RMSE std: 0.0569814476010699
R2 std: 0.002940295654117378


In [7]:
# It looks like the normalized features are slightly better than PCA, but overall the difference is negligible.