In [1]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

import pickle

import numpy as np
import pandas as pd

In [2]:
# Read the original (main) dataset from CSV into a DataFrame
orig_df = pd.read_csv(filepath_or_buffer='./datasets/final_1_lag_ffa_dataset.csv')

In [3]:
# Display every column name of orig_df so the names can be copied and pasted
# into the columns_not_to_use list defined in the next cell.
orig_df.columns

Index(['player_id', 'player_name', 'player_display_name', 'position',
       'position_group', 'headshot_url', 'season', 'week', 'season_type',
       'team', 'opponent_team', 'age', 'years_exp',
       'passing_yards_r_avg_1_lag_1', 'passing_yards_r_avg_3_lag_1',
       'passing_yards_r_avg_5_lag_1', 'passing_yards_r_avg_8_lag_1',
       'passing_tds_r_avg_1_lag_1', 'passing_tds_r_avg_3_lag_1',
       'passing_tds_r_avg_5_lag_1', 'passing_tds_r_avg_8_lag_1',
       'passing_interceptions_r_avg_1_lag_1',
       'passing_interceptions_r_avg_3_lag_1',
       'passing_interceptions_r_avg_5_lag_1',
       'passing_interceptions_r_avg_8_lag_1',
       'passing_2pt_conversions_r_avg_1_lag_1',
       'passing_2pt_conversions_r_avg_3_lag_1',
       'passing_2pt_conversions_r_avg_5_lag_1',
       'passing_2pt_conversions_r_avg_8_lag_1',
       'sack_fumbles_lost_r_avg_1_lag_1', 'sack_fumbles_lost_r_avg_3_lag_1',
       'sack_fumbles_lost_r_avg_5_lag_1', 'sack_fumbles_lost_r_avg_8_lag_1',
      

In [4]:
# Columns that are kept out of the X dataset (identifier/descriptive fields
# plus 'fantasy_points').
columns_not_to_use = [
    'player_id',
    'player_name',
    'player_display_name',
    'position',
    'position_group',
    'headshot_url',
    'season',
    'week',
    'season_type',
    'team',
    'opponent_team',
    'fantasy_points',
]

# Every remaining column, in its original order, goes into X.
is_excluded = orig_df.columns.isin(columns_not_to_use)
columns_to_use = orig_df.columns[~is_excluded].tolist()

# X dataset used to run the split
X = orig_df[columns_to_use]

In [6]:
# Build 5 shuffled K-fold splits of the dataset. For every fold this cell writes
# to ./datasets:
#   {idx}_train_non-normalized.csv / {idx}_test_non-normalized.csv
#   {idx}_train_normalized.csv     / {idx}_test_normalized.csv
#   {idx}_scaler.pickle            (the MinMaxScaler fitted on that fold's train rows)
#
# NOTE(review): the column names suggest lagged rolling averages per player and
# week. A shuffled KFold can place later weeks in train and earlier weeks in
# test -- confirm that is acceptable, or consider a time-/group-aware splitter.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The column selections do not depend on the fold, so take them once here
# instead of re-copying the full frame several times per iteration.
feature_df = orig_df[columns_to_use]
non_feature_df = orig_df[columns_not_to_use]

for idx, split in enumerate(kf.split(X)):
    # Get file names
    train_unnorm_name = f'{idx}_train_non-normalized.csv'
    test_unnorm_name = f'{idx}_test_non-normalized.csv'
    train_norm_name = f'{idx}_train_normalized.csv'
    test_norm_name = f'{idx}_test_normalized.csv'
    scaler_name = f'{idx}_scaler.pickle'

    # Positional row indexes of the train and the test partition for this split
    train_indexes, test_indexes = split

    # Pull only X columns
    train_x_df = feature_df.iloc[train_indexes]
    test_x_df = feature_df.iloc[test_indexes]

    # Pull the rest of the columns (these will be more important for the normalized dataset)
    train_y_df = non_feature_df.iloc[train_indexes]
    test_y_df = non_feature_df.iloc[test_indexes]

    # Un-normalized frames: both halves share the same row labels, so a plain
    # column-wise concat lines them up.
    train_unnorm_complete = pd.concat([train_y_df, train_x_df], axis=1)
    test_unnorm_complete = pd.concat([test_y_df, test_x_df], axis=1)

    # Fit the scaler on the train rows only and apply the same scaling to test
    scaler = MinMaxScaler()
    train_norm_x_df = pd.DataFrame(scaler.fit_transform(train_x_df), columns=columns_to_use)
    test_norm_x_df = pd.DataFrame(scaler.transform(test_x_df), columns=columns_to_use)

    # The scaled frames get a fresh 0..n-1 index, so the non-X columns must be
    # re-indexed the same way before concatenating; otherwise rows would be
    # matched on mismatching labels.
    train_norm_complete = pd.concat([train_y_df.reset_index(drop=True), train_norm_x_df], axis=1)
    test_norm_complete = pd.concat([test_y_df.reset_index(drop=True), test_norm_x_df], axis=1)

    # Validate both partitions before anything is written, so a misaligned
    # fold never ends up on disk.
    assert train_unnorm_complete.shape == train_norm_complete.shape, (
        f'fold {idx}: train shape mismatch '
        f'{train_unnorm_complete.shape} vs {train_norm_complete.shape}'
    )
    assert test_unnorm_complete.shape == test_norm_complete.shape, (
        f'fold {idx}: test shape mismatch '
        f'{test_unnorm_complete.shape} vs {test_norm_complete.shape}'
    )

    # Write to file. to_csv also writes the frame index as the first column:
    # the un-normalized files carry the original row labels there, while the
    # normalized files carry the fresh 0..n-1 index.
    train_unnorm_complete.to_csv(f'./datasets/{train_unnorm_name}')
    test_unnorm_complete.to_csv(f'./datasets/{test_unnorm_name}')
    train_norm_complete.to_csv(f'./datasets/{train_norm_name}')
    test_norm_complete.to_csv(f'./datasets/{test_norm_name}')

    # Write scaler to pickle for use later
    with open(f'./datasets/{scaler_name}', 'wb') as f:
        pickle.dump(scaler, f)

    # Check number of columns and number of records in dataset between normed and un-normed
    print(train_unnorm_complete.shape, train_norm_complete.shape)

(48775, 78) (48775, 78)
(48775, 78) (48775, 78)
(48775, 78) (48775, 78)
(48775, 78) (48775, 78)
(48776, 78) (48776, 78)
