In [1]:
# import dependencies
import pandas as pd

In [2]:
# Load the four source tables from the shared data directory.
# The files are read in the order listed; each name on the left is bound to
# the DataFrame parsed from the path in the same position.
id_map, nba_class, nba_draft, ncaa_stats = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/id_map.csv',
        '../data/nba_data_bpm_classified.csv',
        '../data/nba_draft.csv',
        '../data/ncaa_stats.csv',
    )
)

In [3]:
# Print each dataset's column names under a heading, one name per line,
# with a blank line separating consecutive datasets.
labelled_frames = [
    ("ID Map:", id_map),
    ("NBA Classified:", nba_class),
    ("NBA Draft:", nba_draft),
    ("NCAA Stats:", ncaa_stats),
]

# Build one text section per dataset: the heading followed by its columns.
sections = [
    "\n".join([heading, *map(str, frame.columns)])
    for heading, frame in labelled_frames
]

# Joining on a double newline puts the blank separator between sections only,
# so nothing extra is printed after the final dataset.
print("\n\n".join(sections))

ID Map:
season
player_name
pro_id
cbb_id

NBA Classified:
index
year
player_id
name
position
age
team
games_played
games_started
minutes_played
points
made_field_goals
attempted_field_goals
made_three_pointers
attempted_three_pointers
made_free_throws
attempted_free_throws
offensive_rebounds
defensive_rebounds
assists
steals
blocks
turnovers
personal_fouls
player_id_2
player_efficiency_rating
true_shooting_percentage
three_point_attempt_rate
free_throw_attempt_rate
offensive_rebound_percentage
defensive_rebound_percentage
total_rebound_percentage
assist_percentage
steal_percentage
block_percentage
turnover_percentage
usage_percentage
offensive_win_shares
defensive_win_shares
win_shares
win_shares_per_48_minutes
offensive_box_plus_minus
defensive_box_plus_minus
box_plus_minus
value_over_replacement_player
classification

NBA Draft:
year
player
team
affiliation
round
round_pick
overall_pick
position
lane_agility
shuttle_run
sprint
standing_leap
max_leap
bench_press
body_fat
hand_length
h

In [4]:
# Collapse the NBA classification table to its distinct
# (player_id, classification) pairs.
# NOTE(review): this gives one row per player only if every player_id carries
# a single classification value across all of its rows -- confirm against the data.
nba_dupes = nba_class.loc[:, ["player_id", "classification"]]

# Keep the first occurrence of each pair and discard later repeats.
nba_flat = nba_dupes[~nba_dupes.duplicated()]

In [5]:
# Step 1: attach the id map to the draft table by matching the draft table's
# player_id to the map's pro_id. Inner join: unmatched rows on either side drop out.
connect_ids = nba_draft.merge(
    id_map,
    how='inner',
    left_on='player_id',
    right_on='pro_id',
)

# Step 2: bring in the classification label via the shared player_id column.
ids_plus_nba = connect_ids.merge(nba_flat, how='inner', on='player_id')

# Step 3: attach the NCAA stats by matching cbb_id to the stats table's player_id.
# Both sides have a player_id column here, so pandas suffixes them as
# player_id_x (left) and player_id_y (right) in the result.
# NOTE(review): if any join key is repeated on both sides, rows will multiply --
# key uniqueness is not checked here.
all_data = ids_plus_nba.merge(
    ncaa_stats,
    how='inner',
    left_on='cbb_id',
    right_on='player_id',
)

# Emit every merged column name as a quoted, comma-terminated line so the
# listing can be pasted straight into a Python list.
for column in all_data.columns:
    print("'" + str(column) + "',")

'year',
'player',
'team',
'affiliation',
'round',
'round_pick',
'overall_pick',
'position',
'lane_agility',
'shuttle_run',
'sprint',
'standing_leap',
'max_leap',
'bench_press',
'body_fat',
'hand_length',
'hand_width',
'height_no_shoes',
'height_shoes',
'reach',
'weight',
'wingspan',
'player_id_x',
'season',
'player_name',
'pro_id',
'cbb_id',
'classification',
'Unnamed: 0',
'assist_percentage',
'assists',
'box_plus_minus',
'block_percentage',
'blocks',
'conference',
'defensive_rebound_percentage',
'defensive_rebounds',
'effective_field_goal_percentage',
'field_goal_attempts',
'field_goals',
'free_throw_attempt_rate',
'free_throw_attempt',
'free_throw_percentage',
'free_throws',
'minutes_played',
'name',
'offensive_rebound_percentage',
'offensive_rebounds',
'personal_fouls',
'player_id_y',
'points',
'steal_percentage',
'steals',
'three_point_attempt_rate',
'three_point_attempts',
'three_point_percentage',
'three_pointers',
'total_rebound_percentage',
'total_rebounds',
'true_shooting_perc

In [6]:
# Columns retained for the final dataset, in output order: draft metadata and
# measurement columns first, then the stat columns, then the id and the label.
keep_columns = [
    # draft metadata
    'year', 'player', 'affiliation', 'overall_pick', 'position',
    # measurement / drill columns
    'lane_agility', 'shuttle_run', 'sprint', 'standing_leap', 'max_leap',
    'bench_press', 'body_fat', 'hand_length', 'hand_width',
    'height_no_shoes', 'height_shoes', 'reach', 'weight', 'wingspan',
    # stat columns
    'assist_percentage', 'assists',
    'block_percentage', 'blocks',
    'box_plus_minus',
    'conference',
    'defensive_rebound_percentage', 'defensive_rebounds',
    'effective_field_goal_percentage',
    'field_goal_attempts', 'field_goals',
    'free_throw_attempt_rate', 'free_throw_attempt',
    'free_throw_percentage', 'free_throws',
    'minutes_played',
    'offensive_rebound_percentage', 'offensive_rebounds',
    'personal_fouls',
    'points',
    'steal_percentage', 'steals',
    'three_point_attempt_rate', 'three_point_attempts',
    'three_point_percentage', 'three_pointers',
    'total_rebound_percentage', 'total_rebounds',
    'true_shooting_percentage',
    'turnover_percentage', 'turnovers',
    'two_point_attempts', 'two_point_percentage', 'two_pointers',
    'usage_percentage',
    'win_shares',
    # identifier from the right-hand side of the last merge, and the label
    'player_id_y',
    'classification',
]

# Project the merged frame down to just those columns.
req_columns = all_data.loc[:, keep_columns]

# Strip the merge suffix so the identifier column is plain `player_id` again.
final_full = req_columns.rename(columns={'player_id_y': 'player_id'})

In [7]:
# Persist the final table as CSV; index=False leaves the row index out of the file.
final_full.to_csv(
    path_or_buf='../data/final_full.csv',
    index=False,
)