In [None]:
import pandas as pd
import numpy as np
import os
import pickle
from bson.objectid import ObjectId
from collections import defaultdict
import timeit
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import datetime

import data_reader as dr
import feature_engineering_SA as fe_s

In [None]:
# Run configuration: the league/season to build features for and the raw-data folder.
league, season = "NCAAM1", "22-23"
raw_data_path = "../data/raw_data/"

# Load the raw inputs; later cells index the result by name
# (e.g. 'shots_df', 'games_df', 'segments_df').
dataframes = dr.read_data(raw_data_path, league, season)

In [None]:
# Run the project's data-type validation on the loaded frames and keep the result;
# the clean-up cell below indexes it with '<frame>__<column>' keys.
errors = dr.validate_data_types(dataframes)

### Manually fix any data errors found; drop the affected rows if only a minimal amount of data is missing

In [None]:
# USE IF NECESSARY TO CLEAN DATA
# dataframes['segments_df'] = dataframes['segments_df'].loc[~(dataframes['segments_df'].index.isin(errors['segments_df__players_team2'].index) | (dataframes['segments_df'].index.isin(errors['segments_df__players_team1'].index)))].reset_index(drop=True)
# dataframes['segments_players_usage_blocking_df'] = dataframes['segments_players_usage_blocking_df'].loc[~dataframes['segments_players_usage_blocking_df'].index.isin(errors['segments_players_usage_blocking_df__player_id'].index)].reset_index(drop=True)
# dataframes['shots_df'] = dataframes['shots_df'].loc[~dataframes['shots_df'].index.isin(errors['shots_df__player_id'].index)].reset_index(drop=True)


In [None]:
# Replace the raw shots table with the output of the project's shots clean-up helper.
dataframes['shots_df'] = dr.clean_shots_data(dataframes['shots_df'])

# Parse the game dates into datetimes. The column is assigned with bracket indexing:
# attribute-style assignment (`df.date = ...`) only updates a column that already
# exists and is discouraged by pandas because a typo would silently set a plain
# attribute instead of a column.
dataframes['games_df']['date'] = pd.to_datetime(dataframes['games_df']['date'])
# dataframes['games_df']['date'] = dr.get_correct_game_date(dataframes)

# Feature: Time Remaining on Game Clock, Score Difference, Clutch Time

In [None]:
# Re-run validation after the clean-up above; the result is only displayed, not stored.
dr.validate_data_types(dataframes)

In [None]:
# Build the time/score features from the cleaned current-season shots table.
shot_id_time_features = fe_s.get_time_score_features(dataframes['shots_df'])

In [None]:
# Feature set 1 is just another name for the time/score features;
# it is passed to the final merge under this name.
feature_list_1_shot_level = shot_id_time_features
feature_list_1_shot_level

# Feature: Avg./max blocking rates of opponent team on court (segment)

### Game Player Level Blocking and Usage Stats 

## 1. Get all players in each segment so the data is at the same level as the blocking stats df

In [None]:
# Build the player-level version of the current-season segments table.
# NOTE: the same name is rebound to last-season data further down, so the
# current-season feature cells must run before that section.
segments_df_long = fe_s.create_player_level_segment_df(dataframes['segments_df'])
segments_df_long

In [None]:
# Join the player-level segments with the usage/blocking table. The helper returns
# two frames: one kept as the segment master and one at game/player level.
segment_master_df, segment_master_df_game_player = fe_s.get_player_game_stats(
    segments_df_long,
    dataframes['segments_players_usage_blocking_df'],
)

In [None]:
# Inspect the game/player-level frame returned by the previous cell.
segment_master_df_game_player

In [None]:
# USE IF NECESSARY TO FIX DATE
# NOTE(review): `pad_date` is not defined or imported in the visible cells —
# bring it into scope before enabling the line below.

# dataframes['games_df']['date'] = dataframes['games_df'].date.apply(lambda x: pad_date(x))


In [None]:
# Combine both stats frames with the games table (which carries the parsed dates)
# into one team/game/player-level frame, then preview the first 15 rows.
team_game_player_stats_df = fe_s.get_all_team_player_game_stats(
    segment_master_df,
    segment_master_df_game_player,
    dataframes['games_df'],
)
team_game_player_stats_df.head(15)

In [None]:
#SANITY CHECK
# team_id = ObjectId('5e875e684dc25ebee5af674b')	
# game_id = ObjectId('636e84fb7d85d389f9e3e868')
# player_id = ObjectId('618d6acced05764db467c654')	
# team_game_player_stats_df.loc[(team_game_player_stats_df.team_id==team_id) & (team_game_player_stats_df.player_id==player_id)][["team_id","game_id","date","player_id","uses","blocks","two_shots_for_blocks","posessions_for_usage","current_season_usage_ratio","current_season_blocks_ratio"]].to_clipboard()

In [None]:
# Share of missing values per column: NaN count divided by the number of rows.
n_rows = team_game_player_stats_df.shape[0]
missing_per_column = team_game_player_stats_df.isna().sum()
missing_per_column / n_rows

### Now that we have stats for each player in each game, we can join this info at the segment level and calculate averages across all players of the opposing team

In [None]:
# Exploratory call with `choose_threshold=True`; the result is only displayed here.
# The next cell repeats the call without the flag and keeps the output.
fe_s.get_segment_team_level_blockers(
    team_game_player_stats_df,
    segments_df_long,
    choose_threshold=True,
)

In [None]:
# Feature set 2: blocker features from the project helper, called with its
# default arguments this time.
segment_team_level_blockers = fe_s.get_segment_team_level_blockers(
    team_game_player_stats_df,
    segments_df_long,
)
feature_list_2_game_segment_team_level = segment_team_level_blockers
feature_list_2_game_segment_team_level

In [None]:
# Feature set 3: usage ratios, built from the player-level segments and the
# team/game/player stats.
feature_list_3_game_segment_player_level = fe_s.get_segment_usage_ratios(
    segments_df_long,
    team_game_player_stats_df,
)
feature_list_3_game_segment_player_level

### Feature: Opposition Player Characteristics

In [None]:
# Feature set 4: position features, built from the player-level segments and
# the players table.
feature_list_4_game_segment_team_level = fe_s.get_segment_positions(
    segments_df_long,
    dataframes['players_df'],
)
feature_list_4_game_segment_team_level

# Previous Season Feature creation

In [None]:
def _previous_season(label):
    """Return the season label one year before `label` (e.g. "22-23" -> "21-22").

    Each dash-separated part is decremented on its own. The part keeps its
    original width, so leading zeros survive ("10-11" -> "09-10") and a part
    of all zeros wraps around ("00-01" -> "99-00") instead of turning negative.
    """
    previous_parts = []
    for part in label.split("-"):
        width = len(part)
        previous_parts.append(str((int(part) - 1) % 10 ** width).zfill(width))
    return "-".join(previous_parts)


league = "NCAAM1"
last_season = _previous_season(season)
print(f"Fetching data for {last_season}")
# NOTE(review): this folder differs from the "../data/raw_data/" used for the
# current season, and it overwrites `raw_data_path` — confirm this is intended.
raw_data_path = "../../GT_MSA_LBA/data/raw_data/"
dataframes_ls = dr.read_data(raw_data_path, league, last_season)

In [None]:
# Validate the last-season frames. This overwrites `errors` from the current-season
# validation; the clean-up cell below relies on it holding last-season results.
errors = dr.validate_data_types(dataframes_ls)

### Manually fix any data errors found; drop the affected rows if only a minimal amount of data is missing

In [None]:
# USE IF NECESSARY TO CLEAN DATA
def _without_flagged_rows(frame, *flagged_frames):
    """Return `frame` minus every row whose index label appears in the index of
    any of `flagged_frames`, with a fresh 0..n-1 index."""
    keep = ~frame.index.isin(flagged_frames[0].index)
    for flagged in flagged_frames[1:]:
        keep &= ~frame.index.isin(flagged.index)
    return frame.loc[keep].reset_index(drop=True)


# Segments: drop rows flagged for either team's player list.
dataframes_ls['segments_df'] = _without_flagged_rows(
    dataframes_ls['segments_df'],
    errors['segments_df__players_team2'],
    errors['segments_df__players_team1'],
)
# Usage/blocking stats: drop rows flagged on player_id.
dataframes_ls['segments_players_usage_blocking_df'] = _without_flagged_rows(
    dataframes_ls['segments_players_usage_blocking_df'],
    errors['segments_players_usage_blocking_df__player_id'],
)
# Shots: drop rows flagged on player_id.
dataframes_ls['shots_df'] = _without_flagged_rows(
    dataframes_ls['shots_df'],
    errors['shots_df__player_id'],
)

In [None]:
# Apply the same shots clean-up helper to the last-season shots.
# Unlike the current-season cell, no date parsing is done on `games_df` here.
dataframes_ls['shots_df'] = dr.clean_shots_data(dataframes_ls['shots_df'])
# dataframes_ls['games_df']['date'] = dr.get_correct_game_date(dataframes_ls)

In [None]:
# Player-level segments for LAST season.
# NOTE: this rebinds `segments_df_long`, which held the current-season frame until now.
# Re-running an earlier feature cell after this one would silently use last-season data.
segments_df_long = fe_s.create_player_level_segment_df(dataframes_ls['segments_df'])
segments_df_long

In [None]:
# Last-season counterpart of the earlier stats join. Both result names are reused,
# so the current-season frames are replaced from here on.
segment_master_df, segment_master_df_game_player = fe_s.get_player_game_stats(
    segments_df_long,
    dataframes_ls['segments_players_usage_blocking_df'],
)

In [None]:
# Inspect the last-season segment master frame produced by the previous cell.
segment_master_df

In [None]:
# Inspect the last-season usage/blocking table (after the row clean-up above).
dataframes_ls['segments_players_usage_blocking_df']

In [None]:
# Inspect the current-season usage/blocking table — presumably for a side-by-side
# comparison with the last-season table shown in the previous cell.
dataframes['segments_players_usage_blocking_df']

In [None]:
# USE IF NECESSARY TO FIX DATE
# NOTE(review): the line below targets the current-season `dataframes`, although this
# section works on `dataframes_ls` — confirm which frame is meant before enabling it.
# `pad_date` is not defined or imported in the visible cells.

# dataframes['games_df']['date'] = dataframes['games_df'].date.apply(lambda x: pad_date(x))


In [None]:
# Build past-season player stats from the game/player-level frame, which holds
# last-season data at this point, then preview the first 15 rows.
player_past_season_stats = fe_s.get_all_team_player_stats_last_season(segment_master_df_game_player)
player_past_season_stats.head(15)

In [None]:
# Display the full past-season stats object (the previous cell only previewed 15 rows).
player_past_season_stats

In [None]:
# Feature set 5 is another name for the past-season player stats;
# it is passed to the final merge under this name.
feature_list_5_player_level = player_past_season_stats

In [None]:
# Back to current-season data: build the corrected-score shots frame from
# `dataframes` (not `dataframes_ls`). It is the base frame for the merge below.
altered_shots_df = fe_s.get_corrected_score_feature(dataframes['shots_df'])

# Merge all features

In [None]:
# Merge the five feature sets onto the corrected shots frame. The current-season
# segments table is passed as well. Then list the resulting columns.
shots = fe_s.get_all_features_at_shot_level(
    altered_shots_df,
    dataframes['segments_df'],
    feature_list_1_shot_level,
    feature_list_2_game_segment_team_level,
    feature_list_3_game_segment_player_level,
    feature_list_4_game_segment_team_level,
    feature_list_5_player_level,
)
shots.columns

In [None]:
# Columns to export, in output order: the shot key first, then the score/clutch,
# blocker, usage-ratio and Big/Mid/Small columns.
subset_cols = (
    ['shot_id']
    + ['score_pre_shot', 'score_pre_shot_diff', 'clutch_time']
    + ['num_blockers_on_team']
    + [
        'last_1_game_usage_ratio',
        'last_3_game_usage_ratio',
        'last_5_game_usage_ratio',
        'last_10_game_usage_ratio',
        'current_season_usage_ratio',
        'past_season_usage_ratio',
    ]
    + ['Big', 'Mid', 'Small']
)

In [None]:
# Preview the exact column subset that the next cell writes to disk.
shots[subset_cols]

In [None]:
# Persist the selected columns as a pickle. The path is relative, so the file is
# written to the current working directory and overwritten on each run.
shots[subset_cols].to_pickle('features_SA.pickle')