In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
# read raw data
# The CSV path is relative to the notebook's working directory.
raw_df = pd.read_csv('../inputs/raw_data.csv')

# Report the number of rows loaded and the number of distinct 'name' values.
print("Total initial entries:", raw_df.shape[0])
print("Total number of players:", raw_df['name'].nunique())

# Preview the first rows as the cell output.
raw_df.head()

In [None]:
# filter raw data
# Each step narrows the frame produced by the previous one:
#   1. keep only rows with no missing values in any column
#   2. keep heights within the inclusive range [120, 230]
#   3. keep rows with a positive match count in all three years
#   4. keep rows with a positive goal count in all three years
#   5. keep rows whose position is one of the seven listed values
#   6. drop rows whose prediction season is '2018/2019'
raw_df = (
    raw_df
    .dropna()
    .loc[lambda df: df['height'].between(120, 230)]
    .loc[lambda df: (df['matches_year1'] > 0) & (df['matches_year2'] > 0) & (df['matches_year3'] > 0)]
    .loc[lambda df: (df['goals_year1'] > 0) & (df['goals_year2'] > 0) & (df['goals_year3'] > 0)]
    .loc[lambda df: df['position'].isin([
        'Middle Back', 'Left Wing', 'Right Back', 'Right Wing', 'Line Player',
        'Left Back', 'Back',
    ])]
    .loc[lambda df: df['prediction_season'] != '2018/2019']
)

# Summarise what survived the filters.
print("Player positions:", raw_df['position'].unique())
print("Number of entries after filtering:", raw_df.shape[0])
print("Number of players after filtering:", raw_df['name'].nunique())

raw_df.head()

In [None]:
def is_center(position):
    """Return 1 when `position` equals 'Middle Back', otherwise 0."""
    return 1 if position == 'Middle Back' else 0

def is_back(position):
    """Return 1 when `position` is 'Right Back', 'Left Back' or 'Back', otherwise 0.

    'Middle Back' is not counted here; it is not one of the three accepted values.
    """
    return 1 if position in ('Right Back', 'Left Back', 'Back') else 0
    
def is_wing(position):
    """Return 1 when `position` is 'Right Wing' or 'Left Wing', otherwise 0."""
    return 1 if position in ('Right Wing', 'Left Wing') else 0
    
def is_line(position):
    """Return 1 when `position` equals 'Line Player', otherwise 0."""
    return 1 if position == 'Line Player' else 0

In [None]:
# build necessary columns
# Goals-per-match rate for each of the three years (goals divided by matches).
raw_df['goals_per_match_year1'] = raw_df['goals_year1'] / raw_df['matches_year1']
raw_df['goals_per_match_year2'] = raw_df['goals_year2'] / raw_df['matches_year2']
raw_df['goals_per_match_year3'] = raw_df['goals_year3'] / raw_df['matches_year3']

# create binary features for field position
# Each helper takes a single position value and returns 0 or 1, so it is mapped
# over the 'position' column directly. This yields the same values as a
# row-wise DataFrame.apply(axis=1) without constructing a row object per record.
raw_df['center'] = raw_df['position'].map(is_center)
raw_df['back'] = raw_df['position'].map(is_back)
raw_df['wing'] = raw_df['position'].map(is_wing)
raw_df['line'] = raw_df['position'].map(is_line)

# Keep only the columns listed below, in this order. Note that 'position' and
# the raw year-3 match/goal counts are dropped here, so this cell cannot be
# re-run on its own output; re-run from the load cell instead.
raw_df = raw_df[[
    'name', 'center', 'back', 'wing', 'line', 'height',
    'matches_year1', 'goals_year1', 'goals_per_match_year1',
    'matches_year2', 'goals_year2', 'goals_per_match_year2',
    'goals_per_match_year3',
]]

raw_df.head()

In [None]:
# print statistics
# Row count first, then the describe() summary (with the 25th/50th/75th
# percentiles) for each column below. The extra "\n" argument leaves a gap
# after every summary.
print("Data shape:", raw_df.shape[0], "values", "\n")

for stat_column in (
    'height',
    'matches_year1',
    'goals_year1',
    'goals_per_match_year1',
    'matches_year2',
    'goals_year2',
    'goals_per_match_year2',
    'goals_per_match_year3',
):
    print(raw_df[stat_column].describe(percentiles=[.25, .50, .75]), "\n")

In [None]:
# save final processed data to file
# Written as comma-separated text without the DataFrame index column.
raw_df.to_csv(
    '../inputs/final_processed_data.csv',
    sep=',',
    index=False,
)

In [None]:
# Show the processed frame as this cell's output.
raw_df