In [1]:
import math

import numpy as np
import pandas as pd
from progressbar import ProgressBar

In [2]:
# Load the per-season NCAA stats and discard the height and weight columns.
# NOTE(review): the file's first column loads as 'Unnamed: 0' (visible in the
# output below) and is carried through the rest of the notebook unchanged --
# looks like a saved index; confirm whether it should be dropped at the source.
ncaa_stats = (
    pd.read_csv('../data/ncaa_stats.csv')
    .drop(columns=['height', 'weight'])
)

ncaa_stats.head()

Unnamed: 0,player_id,season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,...,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes
0,kenyon-martin-1,1996-97,,10.0,,24.0,,cusa,,,...,74.0,0.562,22.5,16.0,40.0,0.65,26.0,,1.8,0.316
1,kenyon-martin-1,1997-98,,41.0,,83.0,,cusa,,,...,267.0,0.601,,,197.0,0.629,124.0,,7.0,
2,kenyon-martin-1,1998-99,,49.0,,78.0,,cusa,,,...,228.0,0.575,18.8,67.0,247.0,0.575,142.0,,6.8,0.303
3,kenyon-martin-1,1999-00,,42.0,,107.0,,cusa,,,...,300.0,0.601,10.3,56.0,382.0,0.573,219.0,,9.0,0.398
4,stromile-swift-1,1998-99,,5.0,,35.0,,sec,,,...,69.0,0.452,25.6,46.0,102.0,0.431,44.0,,0.7,0.089


In [3]:
def wt_mean(data_col, weight_col):
    """Calculate a weighted average, weighted by minutes and recency.

    The weight of the observation at position ``i`` (0-based) is
    ``weight_col[i] * (i + 1)``, so later rows count for more. Observations
    whose data value is NaN get a weight of zero and therefore contribute to
    neither the numerator nor the denominator.

    Rows are matched by position, not by index label, so the inputs may carry
    any index. Neither input is modified.

    Parameters
    ----------
    data_col : 1-D array-like of numbers (e.g. a pandas Series)
        Values to average, ordered oldest to most recent.
    weight_col : 1-D array-like of numbers, same length as ``data_col``
        Base weight for each value (the notebook passes minutes played).

    Returns
    -------
    float
        The weighted mean, or NaN when no observation carries any weight
        (for example when every data value is NaN).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    values = np.asarray(data_col, dtype=float)
    weights = np.asarray(weight_col, dtype=float)
    if values.shape != weights.shape:
        raise ValueError('data_col and weight_col must have the same length')

    # 1, 2, ..., n: the recency multiplier for each position
    recency = np.arange(1, values.size + 1)
    # missing data points must not add to the total weight
    weights = np.where(np.isnan(values), 0.0, weights) * recency

    # nansum mirrors pandas' default skipna summation: a NaN product (missing
    # value or missing weight) is simply left out of the total
    numer = np.nansum(values * weights)
    denom = np.nansum(weights)

    # a zero total weight yields NaN; suppress the 0/0 RuntimeWarning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.float64(numer) / np.float64(denom))

In [4]:
# Flatten the dataframe to one row per player.
#
# Identity columns take the value from the player's last season (after sorting
# by 'season'); every other column becomes the wt_mean of the player's final
# N_FINAL_SEASONS seasons, weighted by 'minutes_played'.
ID_COLUMNS = ['player_id', 'season', 'position', 'team_abbreviation', 'conference']
N_FINAL_SEASONS = 2  # number of most recent seasons fed into the weighted average

player_summary = {column: [] for column in ncaa_stats.columns}
player_summary['num_seasons'] = []

pbar = ProgressBar()

# groupby(sort=False) yields players in first-appearance order (the same order
# as .unique()) in a single pass, instead of re-scanning the whole frame with a
# boolean mask for every player.
for player, player_rows in pbar(ncaa_stats.groupby('player_id', sort=False)):
    player_stats = player_rows.reset_index(drop=True).sort_values('season')

    player_summary['player_id'].append(player)
    player_summary['season'].append(player_stats['season'].iloc[-1])
    player_summary['position'].append(player_stats['position'].iloc[-1])
    player_summary['team_abbreviation'].append(player_stats['team_abbreviation'].iloc[-1])
    player_summary['conference'].append(player_stats['conference'].iloc[-1])
    player_summary['num_seasons'].append(len(player_stats))

    # everything that is not an identity column gets averaged
    stat_columns = player_stats.columns.drop(ID_COLUMNS)

    # re-label the rows 0..n-1 in chronological order before averaging
    final_seasons = (
        player_stats[stat_columns]
        .tail(N_FINAL_SEASONS)
        .reset_index(drop=True)
    )

    for column in stat_columns:
        value = wt_mean(final_seasons[column], final_seasons['minutes_played'])
        player_summary[column].append(value)

  
100% |########################################################################|


In [5]:
# Build the one-row-per-player frame and tidy two of its columns.
ncaa_summary = pd.DataFrame(player_summary)

# 'season' holds strings such as '1999-00'; keep the part before the dash as an int
season_prefix = ncaa_summary['season'].str.split('-').str[0]
ncaa_summary['season'] = season_prefix.astype(int)

# relabel every 'pac-10' conference entry as 'pac-12'
column_name = 'conference'
mask = ncaa_summary[column_name] == 'pac-10'
ncaa_summary.loc[mask, column_name] = 'pac-12'

ncaa_summary.head()

Unnamed: 0,player_id,season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,defensive_box_plus_minus,defensive_rebound_percentage,...,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,num_seasons
0,kenyon-martin-1,1999,,44.317881,,97.397351,,cusa,,,...,0.592391,13.11457,59.642384,337.298013,0.573662,193.503311,,8.271523,0.366543,4
1,stromile-swift-1,1999,,28.327079,,86.837953,,sec,,,...,0.601466,16.787548,75.37484,287.752665,0.606385,179.642644,,7.438934,0.300672,2
2,marcus-fizer-1,1999,,39.048448,,34.53931,,big-12,,,...,0.59489,10.562431,74.490862,484.872063,0.562921,276.314476,,8.178155,0.274545,3
3,mike-miller-1,1999,,83.001074,,13.060866,,sec,,,...,0.584151,15.081633,70.515217,208.881489,0.562878,117.334765,,4.81826,0.197698,2
4,dermarr-johnson-1,1999,,45.0,,30.0,,cusa,,,...,0.594,12.0,46.0,153.0,0.575,88.0,,4.8,0.218,1


In [6]:
# Write the per-player summary to CSV; index = False leaves the row index out of the file.
ncaa_summary.to_csv('../data/ncaa_summary.csv', index = False)