In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import time
import math
from math import radians
import gc
import seaborn as sns
from sklearn.impute import SimpleImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/totalrolling8/total_rolling.parquet
/kaggle/input/nfl-big-data-bowl-2023/players.csv
/kaggle/input/nfl-big-data-bowl-2023/week6.csv
/kaggle/input/nfl-big-data-bowl-2023/week2.csv
/kaggle/input/nfl-big-data-bowl-2023/pffScoutingData.csv
/kaggle/input/nfl-big-data-bowl-2023/week3.csv
/kaggle/input/nfl-big-data-bowl-2023/week8.csv
/kaggle/input/nfl-big-data-bowl-2023/games.csv
/kaggle/input/nfl-big-data-bowl-2023/week5.csv
/kaggle/input/nfl-big-data-bowl-2023/week7.csv
/kaggle/input/nfl-big-data-bowl-2023/week1.csv
/kaggle/input/nfl-big-data-bowl-2023/week4.csv
/kaggle/input/nfl-big-data-bowl-2023/plays.csv
/kaggle/input/bdb2023-finaldata-7/FinalData_Week6.parquet
/kaggle/input/bdb2023-finaldata-7/FinalData_Week8.parquet
/kaggle/input/bdb2023-finaldata-7/FinalData_Week7.parquet
/kaggle/input/bdb2023-finaldata-7/FinalData_Week1.parquet
/kaggle/input/bdb2023-finaldata-7/FinalData_Week3.parquet
/kaggle/input/bdb2023-finaldata-7/FinalData_Week5.parquet
/kaggle/input/bdb2023-fina

In [2]:
def get_df_counts(df: pd.DataFrame):
    """Summarise every column of ``df``: memory, dtype, null and unique counts.

    Prints the total memory footprint (as reported by ``df.memory_usage()``)
    and returns a frame with one row per column, sorted by ``usage_mb`` in
    descending order. The returned columns are ``column_name``, ``usage_mb``,
    ``dtype``, ``non_null_count``, ``null_count`` and ``num_unique``.
    """
    n_rows = len(df)

    # One record per column; null_count is derived from the non-null count.
    records = []
    for col in df.columns:
        series = df[col]
        records.append((
            col,
            str(series.dtype),
            series.count(),
            n_rows - series.count(),
            series.nunique(),
        ))
    df_unique = pd.DataFrame.from_records(
        records,
        columns=['column_name', 'dtype', 'non_null_count', 'null_count', 'num_unique'],
    )

    # memory_usage() is in bytes and also carries an entry for the index; that
    # entry has no match in df_unique, so the inner merge below drops it.
    mem = df.memory_usage()
    mem_usage = pd.DataFrame({
        'column_name': mem.index,
        'usage_mb': mem.values / 1024**2,
    })

    df_counts = mem_usage.merge(df_unique).sort_values(by='usage_mb', ascending=False)

    print(f'memory usage: {sum(mem)/1024**2:.1f}MB')
    return df_counts

def reduce_mem_usage(df, category_limit: int = 128):
    """Re-cast the int*/float* columns of ``df`` in place and report the saving.

    For each column whose dtype name starts with ``int`` or ``float`` the
    min/max are compared (strictly) against the limits of successively wider
    types, and the column is cast to the type paired with the first range that
    contains it. All other columns are left untouched.

    ``category_limit`` is accepted but currently unused (the object -> category
    conversion it belonged to is disabled).

    NOTE(review): every integer range is mapped to the *next wider* type than
    the one it was tested against (int8 range -> int16, int16 -> int32,
    int32 -> int64), and the float16 range maps to float32. This looks like
    deliberate headroom, but confirm it is intended.

    Returns the same ``df`` object that was passed in.
    """
    def _size_mb():
        return df.memory_usage().sum() / 1024**2

    # (range to test against, dtype to cast to), tried in order.
    int_ladder = (
        (np.int8, np.int16),
        (np.int16, np.int32),
        (np.int32, np.int64),
        (np.int64, np.int64),
    )
    float_ladder = (
        (np.float16, np.float32),
        (np.float32, np.float32),
    )

    start_mem = _size_mb()

    for col in df.columns:
        dtype_name = str(df[col].dtype)
        is_int = dtype_name.startswith('int')
        is_float = dtype_name.startswith('float')
        if not (is_int or is_float):
            continue

        lowest = df[col].min()
        highest = df[col].max()

        if is_int:
            # No match (e.g. values sitting exactly on the int64 limits)
            # leaves the column as it is.
            for bound, target in int_ladder:
                limits = np.iinfo(bound)
                if lowest > limits.min and highest < limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            # Floats always get cast; float64 is the fallback.
            target = np.float64
            for bound, candidate in float_ladder:
                limits = np.finfo(bound)
                if lowest > limits.min and highest < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = _size_mb()
    print(f'memory usage decreased by {100*(start_mem - end_mem)/start_mem:.1f}, final size {end_mem:.1f} MB')

    return df

In [3]:
def ball_snap_throw_frames(D_frame):
    """Annotate one play's rows with its snap, pass and end frame numbers.

    Intended to be applied per (gameId, playId) group. Columns are added to
    ``D_frame`` in place and the same object is returned:

    * ``ball_snap``    - frameId of the ``ball_snap`` event; only added when
      that event is tagged on exactly one distinct frame (some plays have no
      ball_snap, in which case the column is left out).
    * ``pass_forward`` - frameId of the ``pass_forward`` event, same rule.
    * ``end_frame``    - the earliest frame among ``pass_forward``, ``run``,
      ``qb_sack``, ``qb_strip_sack`` and ``fumble`` (each counted only when it
      sits on exactly one distinct frame), or the sentinel ``1000.5`` when
      none of them qualifies.
    """
    def _unique_event_frame(event_name):
        # frameId of `event_name` if it occurs on exactly one distinct frame, else None.
        frame_ids = D_frame.loc[D_frame['event'] == event_name, 'frameId'].drop_duplicates()
        return frame_ids.item() if len(frame_ids) == 1 else None

    snap_frame = _unique_event_frame('ball_snap')
    if snap_frame is not None:
        D_frame['ball_snap'] = snap_frame

    pass_frame = _unique_event_frame('pass_forward')
    if pass_frame is not None:
        D_frame['pass_forward'] = pass_frame

    # 1000.5 is a non-integer sentinel: it wins the min() only when no ending
    # event was found, and cannot be mistaken for a real frameId.
    ending_frames = [1000.5]
    if pass_frame is not None:
        ending_frames.append(pass_frame)
    for event_name in ('run', 'qb_sack', 'qb_strip_sack', 'fumble'):
        event_frame = _unique_event_frame(event_name)
        if event_frame is not None:
            ending_frames.append(event_frame)

    D_frame['end_frame'] = min(ending_frames)
    return D_frame

In [4]:
def event_frames(D):
    """Add per-play ``ball_snap`` / ``pass_forward`` / ``end_frame`` columns to ``D``.

    ``ball_snap_throw_frames`` is applied to every (gameId, playId) group.
    Plays for which it could not set a snap frame are then given the mean snap
    frame over all rows, and ``ball_snap`` is rounded to a whole frame number.

    Returns the annotated frame, indexed like the input.
    """
    # group_keys=False keeps the original row index. Without it, pandas >= 2.0
    # prepends gameId/playId as index levels, making them both index levels
    # and columns, which breaks later groupby calls on those names.
    D = D.groupby(['gameId', 'playId'], group_keys=False).apply(ball_snap_throw_frames)

    # If no play at all had a usable snap frame the column was never created.
    if 'ball_snap' not in D.columns:
        D['ball_snap'] = np.nan

    # Assign the result back instead of calling fillna(inplace=True) on the
    # attribute: that chained in-place form does not update D under pandas
    # Copy-on-Write.
    D['ball_snap'] = D['ball_snap'].fillna(D['ball_snap'].mean()).round()
    return D

In [5]:
def ChangeDirection(D):
    """Accumulate each player's absolute orientation change over a play.

    Works per (gameId, playId, nflId). ``end_frame`` is first capped at
    ``ball_snap + 40``. The frame-to-frame difference of ``o_deg_std`` is taken
    as an absolute value, with values of 260 or more replaced by
    ``|360 - value|``, and summed cumulatively into ``total_change_o``.
    ``specific_change_o`` is the total at ``end_frame`` minus the total at
    ``ball_snap + 1``; ``avg_spin_rate`` divides that by the fixed 40 frames.

    The intermediate columns are written onto the frame passed in; the
    returned frame is the merged result.

    NOTE(review): the wrap-around threshold here is 260, whereas
    ChangeDirection2 folds anything beyond 180 - confirm 260 is intended.
    """
    frames = 40
    player_keys = ['gameId', 'playId', 'nflId']

    # Never look further than `frames` frames past the snap.
    capped_end = D.ball_snap + frames
    D.loc[capped_end < D.end_frame, 'end_frame'] = capped_end

    D['change_o'] = D.groupby(player_keys).o_deg_std.diff()
    D['abs_change_o'] = D.change_o.abs()
    crossed_zero = D.abs_change_o >= 260
    D.loc[crossed_zero, 'abs_change_o'] = (360 - D.abs_change_o).abs()

    D['total_change_o'] = D.groupby(player_keys).abs_change_o.cumsum()
    D['spin_rate'] = D.total_change_o / D.frameId

    # Running total at the end frame, broadcast to every row of the player.
    at_end = (
        D.loc[D.frameId == D.end_frame, player_keys + ['total_change_o']]
        .rename(columns={'total_change_o': 'final_change_o'})
    )
    D = D.merge(at_end, how='left', on=player_keys)

    # Running total on the first frame after the snap.
    at_start = (
        D.loc[D.frameId == D.ball_snap + 1, player_keys + ['total_change_o']]
        .rename(columns={'total_change_o': 'initial_change_o'})
    )
    D = D.merge(at_start, how='left', on=player_keys)

    D['specific_change_o'] = D.final_change_o - D.initial_change_o
    D['avg_spin_rate'] = D.specific_change_o / frames

    return D
    

In [6]:
def rolling_o(df, spin_length: int = 8):
    """Add ``rolling_o``: the forward-looking sum of ``real_change_o``.

    ``rolling(spin_length).sum()`` produces a trailing window ending on each
    row; shifting it up by ``spin_length - 1`` rows re-anchors it so the value
    on row *i* is the sum over rows *i .. i + spin_length - 1*. The last
    ``spin_length - 1`` rows therefore hold NaN.

    Parameters
    ----------
    df : DataFrame with a ``real_change_o`` column (one player's frames in
        order when used through ``groupby(...).apply``).
    spin_length : window size in rows; defaults to 8, the value previously
        hard-coded in the rolling call.

    Returns ``df`` with the new column added in place.

    No ``gc.collect()`` is run here: this function is called once per group,
    and a full collection on every call adds run time without changing the
    result.
    """
    df['rolling_o'] = df.real_change_o.rolling(spin_length).sum().shift(-(spin_length - 1))
    return df

In [7]:
def GreatestRolling(D):
    """Find the largest-magnitude ``rolling_o`` of one player-play and its frame.

    Only frames strictly between ``ball_snap + 6`` and ``end_frame`` are
    considered; when the first row has ``pff_sack == 1`` or ``pff_hit == 1``
    the upper limit is pulled in to ``end_frame - 18``.

    Adds two columns to ``D`` in place and returns it:

    * ``max_rolling_o``         - the signed ``rolling_o`` value with the
      greatest absolute value inside the window (NaN if the window is empty).
    * ``max_rolling_o_frameId`` - the first frameId inside the window that
      holds that value (NaN if there is none).
    """
    sacked_or_hit = (D['pff_sack'].iloc[0] == 1) or (D['pff_hit'].iloc[0] == 1)
    end_margin = 18 if sacked_or_hit else 0

    in_window = (D.frameId > D.ball_snap + 6) & (D.frameId < D.end_frame - end_margin)
    window = D[in_window]

    max_rolling = window.rolling_o.max()
    min_rolling = window.rolling_o.min()
    # Keep the sign: pick whichever extreme is further from zero.
    highest_rolling = min_rolling if abs(min_rolling) > abs(max_rolling) else max_rolling
    D['max_rolling_o'] = highest_rolling

    # Look the frame up inside the window the peak came from. Searching the
    # whole group could return a frame outside the window that happens to
    # hold the same rolling_o value.
    peak_frames = window.loc[window.rolling_o == highest_rolling, 'frameId']
    # Always create the column so every group returns the same schema.
    D['max_rolling_o_frameId'] = peak_frames.iloc[0] if len(peak_frames) > 0 else np.nan
    return D

In [8]:
def ChangeDirection2(D):
    """Compute each player's peak rolling orientation change for every play.

    Steps, all per (gameId, playId, nflId):

    1. cap ``end_frame`` at ``ball_snap + 40``;
    2. take the frame-to-frame difference of ``o_deg_std`` and fold it into
       the (-180, 180] range as ``real_change_o``;
    3. apply ``rolling_o`` to get the rolling sum of ``real_change_o``;
    4. apply ``GreatestRolling`` to get ``max_rolling_o`` and its frame;
    5. add ``abs_max_rolling_o``.

    ``end_frame``, ``change_o`` and ``real_change_o`` are written onto the
    frame passed in; the returned frame has a fresh 0..n-1 index.
    """
    frames = 40
    player_keys = ['gameId', 'playId', 'nflId']

    capped_end = D.ball_snap + frames
    D.loc[capped_end < D.end_frame, 'end_frame'] = capped_end

    D['change_o'] = D.groupby(player_keys).o_deg_std.diff()

    # The magnitude of a change should always be below 180 degrees, so jumps
    # across the 0/360 boundary are folded back by a full turn.
    D['real_change_o'] = D.change_o.mask(D.change_o > 180, D.change_o - 360)
    D['real_change_o'] = D.real_change_o.mask(D.real_change_o < -180, D.real_change_o + 360)

    # Rolling sum of the signed change.
    D = D.groupby(player_keys).apply(rolling_o).reset_index(drop=True)
    gc.collect()

    # Largest-magnitude rolling change and the frame it occurs on.
    D = D.groupby(player_keys).apply(GreatestRolling).reset_index(drop=True)

    D['abs_max_rolling_o'] = D.max_rolling_o.abs()

    return D
    

In [10]:
# week_data = []
# for i in range(0,8):
#     print(i)
#     current_data = pd.read_parquet('/kaggle/input/bdb2023-finaldata-7/FinalData_Week' + str(i+1) + '.parquet')
#     current_data = event_frames(current_data)
#     week_data.append(current_data)

0


KeyboardInterrupt: 

In [10]:
# Load the total_rolling parquet file from the attached `totalrolling8` input dataset.
total_rolling_test = pd.read_parquet('/kaggle/input/totalrolling8/total_rolling.parquet')

In [None]:
# Recompute the peak rolling orientation change for every player on every play.
total_rolling = (
    total_rolling_test
    .groupby(['gameId', 'playId', 'nflId'])
    .apply(GreatestRolling)
    .reset_index(drop=True)
)

In [None]:
# `rolling_data` is not defined anywhere in the visible notebook, so
# concatenating it raised a NameError on a fresh kernel and would have
# discarded the `total_rolling` frame built in the previous cell.
# Shrink that frame's dtypes directly instead.
total_rolling = reduce_mem_usage(total_rolling)

In [None]:
# Write the result to the Kaggle working directory; index=False leaves the row index out of the file.
total_rolling.to_parquet('/kaggle/working/total_rolling.parquet',index=False)