In [1]:
import tensorflow as tf
from tensorflow.python.ops.rnn import _transpose_batch_time

from datetime import datetime
import numpy as np
import pandas as pd
import glob, os, sys, math, warnings, copy, time
import matplotlib.pyplot as  plt
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

# customized ftns 
from preprocessing import *
from utilities import *
from model import *
from train import train_all_single_policies
# ---------------------------------------------------------
%matplotlib inline
%load_ext autoreload
%autoreload 2
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
# ---------------------------------------------------------
# directories
main_dir = '../'
game_dir = main_dir+'data/'
Data = LoadData(main_dir, game_dir)
models_path = './models/'

  from ._conv import register_converters as _register_converters


#### Load raw data

In [2]:
# %%time
game_id = '0021500463'
game_data = Data.load_game(game_id)
events_df = pd.DataFrame(game_data['events'])
print('raw events shape:', events_df.shape)
home_id = events_df.loc[0].home['teamid']
events_df.head(3)

raw events shape: (231, 8)


Unnamed: 0,end_time_left,home,moments,orig_events,playbyplay,quarter,start_time_left,visitor
0,702.31,"{'abbreviation': 'CHI', 'players': [{'playerid...","[[1, 1451351428029, 708.28, 12.78, None, [[-1,...",[0],GAME_ID EVENTNUM EVENTMSGTYPE EVENTMS...,1,708.28,"{'abbreviation': 'TOR', 'players': [{'playerid..."
1,686.28,"{'abbreviation': 'CHI', 'players': [{'playerid...","[[1, 1451351428029, 708.28, 12.78, None, [[-1,...",[1],GAME_ID EVENTNUM EVENTMSGTYPE EVENTMS...,1,708.28,"{'abbreviation': 'TOR', 'players': [{'playerid..."
2,668.42,"{'abbreviation': 'CHI', 'players': [{'playerid...","[[1, 1451351444029, 692.25, 12.21, None, [[-1,...","[2, 3]",GAME_ID EVENTNUM EVENTMSGTYPE EVENTMS...,1,692.25,"{'abbreviation': 'TOR', 'players': [{'playerid..."


In [3]:
events_df.home[0]['teamid']

1610612741

In [4]:
events_df.visitor[0]['teamid']

1610612761

#### Get some suplementary data

In [5]:
# # play id to play roles/positions
# id_role = id_position(events_df)
# check_game_roles_duplicates(id_role)

# # its possible that F has similar role as G-f or F-G, we create empty slots to ensure meta order
# # ddentify defending and offending runs (this is included in process_moments)
# court_index = Data.load_csv('./meta_data/court_index.csv')
# court_index = dict(zip(court_index.game_id, court_index.court_position))

# # home and visitor ids
# homeid = events_df.loc[0].home['teamid']
# awayid = events_df.loc[0].visitor['teamid']

### FILTER EVENTS

In [6]:
# events_df.loc[3].playbyplay.to_dict('list')

In [7]:
# events_df.moments[3]

In [8]:
# n_event = 233
# P = PlotGame('0021500196', main_dir, game_dir)
# for i in range(len(events_df.moments[n_event])):
#     P.load_moment2img(game_data, n_event, i)

### as we saw that the playbyplay description of events is not accurate, so for now at least we will not try to filter by events

In [9]:
def remove_non_eleven(events_df, event_length_th=25, verbose=False):
    df = events_df.copy()
    home_id = df.loc[0]['home']['teamid']
    away_id = df.loc[0]['visitor']['teamid']
    def remove_non_eleven_(moments, event_length_th=25, verbose=False):
        ''' Go through each moment, when encounters balls not present on court,
            or less than 10 players, discard these moments and then chunk the following moments 
            to as another event.

            Motivations: balls out of bound or throwing the ball at side line will
                probably create a lot noise for the defend trajectory learning model.
                We could add the case where players are less than 10 (it could happen),
                but this is not allowed in the model and it requres certain input dimension.

            moments: A list of moments
            event_length_th: The minimum length of an event

            segments: A list of events (or, list of moments) e.g. [ms1, ms2] where msi = [m1, m2]
        '''

        segments = []
        segment = []
        # looping through each moment
        for i in range(len(moments)):
            # get moment dimension
            moment_dim = len(moments[i][5])
            # 1 bball + 10 players
            if moment_dim == 11:
                segment.append(moments[i])
            # less than ten players or basketball is not on the court
            else:
    #             print('less than 11')
                # only grab these satisfy the length threshold
                if len(segment) >= event_length_th:
                    segments.append(segment)
                # reset the segment to empty list
                segment = []
        # grab the last one
        if len(segment) >= event_length_th:
            segments.append(segment)
        if len(segments) == 0:
            if verbose: print('Warning: Zero length event returned')
        return segments
    # process for each event (row)
    df['chunked_moments'] = df.moments.apply(lambda m: remove_non_eleven_(m, event_length_th, verbose))
    # in case there's zero length event
    df = df[df['chunked_moments'].apply(lambda e: len(e)) != 0]
    df['chunked_moments'] = df['chunked_moments'].apply(lambda e: e[0])
    return df['chunked_moments'].values, {'home_id': home_id, 'away_id': away_id}

In [10]:
r, team_ids = remove_non_eleven(events_df)

In [11]:
type(r[0]) == list

True

In [12]:
events_df1 = pd.DataFrame({'moments': r})

In [13]:
def chunk_shotclock(events_df, event_length_th=25, verbose=False):
    df = events_df.copy()
    def chunk_shotclock_(moments, event_length_th, verbose):
        ''' When encounters ~24secs or game stops, chunk the moment to another event.
            shot clock test:
            1) c = [20.1, 20, 19, None,18, 12, 9, 7, 23.59, 23.59, 24, 12, 10, None, None, 10]
              result = [[20.1, 20, 19], [18, 12, 9, 7], [23.59], [23.59], [24, 12, 10]]
            2) c = [20.1, 20, 19, None, None,18, 12, 9, 7, 7, 7, 23.59, 23.59, 24, 12, 10, None, None, 10]
              result = [[20.1, 20, 19], [18, 12, 9, 7], [7], [7], [23.59], [23.59], [24, 12, 10]]

            Motivations: game flow would make sharp change when there's 24s or 
            something happened on the court s.t. the shot clock is stopped, thus discard
            these special moments and remake the following valid moments to be next event.

            moments: A list of moments
            event_length_th: The minimum length of an event
            verbose: print out exceptions or not

            segments: A list of events (or, list of moments) e.g. [ms1, ms2] where msi = [m1, m2] 
        '''

        segments = []
        segment = []
        # naturally we won't get the last moment, but it should be okay
        for i in range(len(moments)-1):
            current_shot_clock_i = moments[i][3]
            next_shot_clock_i = moments[i+1][3]
            # sometimes the shot clock value is None, thus cannot compare
            try:
                # if the game is still going i.e. sc is decreasing
                if next_shot_clock_i < current_shot_clock_i:
                    segment.append(moments[i])
                # for any reason the game is sstopped or reset
                else:
                    # not forget the last moment before game reset or stopped
                    if current_shot_clock_i < 24.:
                        segment.append(moments[i])
                    # add length condition
                    if len(segment) >= event_length_th:
                        segments.append(segment)
                    # reset the segment to empty list
                    segment = []
            # None value
            except Exception as e:
                if verbose: print(e)
                # not forget the last valid moment before None value
                if current_shot_clock_i != None:
                    segment.append(moments[i])    
                if len(segment) >= event_length_th:
                    segments.append(segment)
                # reset the segment to empty list
                segment = []

        # grab the last one
        if len(segment) >= event_length_th:
            segments.append(segment)            
        if len(segments) == 0:
            if verbose: print('Warning: Zero length event returned')
        return segments
    
    # process for each event (row)
    df['chunked_moments'] = df.moments.apply(lambda m: chunk_shotclock_(m, event_length_th, verbose))
    # in case there's zero length event
    df = df[df['chunked_moments'].apply(lambda e: len(e)) != 0]
    df['chunked_moments'] = df['chunked_moments'].apply(lambda e: e[0])
    return df['chunked_moments'].values

In [14]:
r1 = chunk_shotclock(events_df1)
events_df2 = pd.DataFrame({'moments': r1})

In [15]:
def chunk_halfcourt(events_df, event_length_th=25, verbose=False):
    df = events_df.copy()
    def chunk_halfcourt_(moments, event_length_th, verbose):
        ''' Discard any plays that are not single sided. When the play switches 
            court withhin one event, we chunk it to be as another event
        '''

        # NBA court size 94 by 50 feet
        half_court = 94/2. # feet
        cleaned = []

        # remove any moments where two teams are not playing at either side of the court
        for i in moments:
            # the x coordinates is on the 3rd or 2 ind of the matrix,
            # the first and second is team_id and player_id
            team1x = np.array(i[5])[1:6, :][:, 2]    # player data starts from 1, 0 ind is bball
            team2x = np.array(i[5])[6:11, :][:, 2]
            # if both team are on the left court:
            if sum(team1x <= half_court)==5 and sum(team2x <= half_court)==5:
                cleaned.append(i)
            elif sum(team1x >= half_court)==5 and sum(team2x >= half_court)==5:
                cleaned.append(i)

        # if teamns playing court changed during same list of moments,
        # chunk it to another event
        segments = []
        segment = []
        for i in range(len(cleaned)-1):
            current_mean = np.mean(np.array(cleaned[i][5])[:, 2], axis=0)
            current_pos = 'R' if current_mean >= half_court else 'L'
            next_mean = np.mean(np.array(cleaned[i+1][5])[:, 2], axis=0)
            next_pos = 'R' if next_mean >= half_court else 'L'

            # the next moment both team are still on same side as current
            if next_pos == current_pos:
                segment.append(cleaned[i])
            else:
                if len(segment) >= event_length_th:
                    segments.append(segment)
                segment = []
        # grab the last one
        if len(segment) >= event_length_th:
            segments.append(segment)            
        if len(segments) == 0:
            if verbose: print('Warning: Zero length event returned')
        return segments
    
    # process for each event (row)
    df['chunked_moments'] = df.moments.apply(lambda m: chunk_halfcourt_(m, event_length_th, verbose))
    # in case there's zero length event
    df = df[df['chunked_moments'].apply(lambda e: len(e)) != 0]
    df['chunked_moments'] = df['chunked_moments'].apply(lambda e: e[0])
    return df['chunked_moments'].values

In [16]:
r2 = chunk_halfcourt(events_df2)
events_df3 = pd.DataFrame({'moments': r2})

In [17]:
# court_index

In [18]:
court_index = pd.read_csv('./meta_data/court_index.csv')
court_index = dict(zip(court_index.game_id, court_index.court_position))
court_index[int('0021500196')]

1

In [19]:
def reorder_teams(events_df, game_id):
    df = events_df.copy()
    def reorder_teams_(input_moments, game_id):
        ''' 1) the matrix always lays as home top and away bot VERIFIED
            2) the court index indicate which side the top team (home team) defends VERIFIED

            Reorder the team position s.t. the defending team is always the first 

            input_moments: A list moments
            game_id: str of the game id
        '''
        # now we want to reorder the team position based on meta data
        court_index = pd.read_csv('./meta_data/court_index.csv')
        court_index = dict(zip(court_index.game_id, court_index.court_position))

        full_court = 94.
        half_court = full_court/2. # feet
        home_defense = court_index[int(game_id)]
        moments = copy.deepcopy(input_moments)
        for i in range(len(moments)):
            home_moment_x = np.array(moments[i][5])[1:6,2]
            away_moment_x = np.array(moments[i][5])[6:11,2]
            quarter = moments[i][0]
            # if the home team's basket is on the left
            if home_defense == 0:
                # first half game
                if quarter <= 2:
                    # if the home team is over half court, this means they are doing offense
                    # and the away team is defending, so switch the away team to top
                    if sum(home_moment_x>=half_court)==5 and sum(away_moment_x>=half_court)==5:
                        moments[i][5][1:6], moments[i][5][6:11] = moments[i][5][6:11], moments[i][5][1:6]
                        for l in moments[i][5][1:6]:
                            l[2] = full_court - l[2]
                        for l in moments[i][5][6:11]:
                            l[2] = full_court - l[2]
                # second half game      
                elif quarter > 2: # second half game, 3,4 quarter
                    # now the home actually gets switch to the other court
                    if sum(home_moment_x<=half_court)==5 and sum(away_moment_x<=half_court)==5:
                        moments[i][5][1:6], moments[i][5][6:11] = moments[i][5][6:11], moments[i][5][1:6]
                    elif sum(home_moment_x>=half_court)==5 and sum(away_moment_x>=half_court)==5:
                        for l in moments[i][5][1:6]:
                            l[2] = full_court - l[2]
                        for l in moments[i][5][6:11]:
                            l[2] = full_court - l[2]
                else:
                    print('Should not be here, check quarter value')
            # if the home team's basket is on the right
            elif home_defense == 1:
                # first half game
                if quarter <= 2:
                    # if the home team is over half court, this means they are doing offense
                    # and the away team is defending, so switch the away team to top
                    if sum(home_moment_x<=half_court)==5 and sum(away_moment_x<=half_court)==5:
                        moments[i][5][1:6], moments[i][5][6:11] = moments[i][5][6:11], moments[i][5][1:6]
                    elif sum(home_moment_x>=half_court)==5 and sum(away_moment_x>=half_court)==5:
                        for l in moments[i][5][1:6]:
                            l[2] = full_court - l[2]
                        for l in moments[i][5][6:11]:
                            l[2] = full_court - l[2]
                # second half game      
                elif quarter > 2: # second half game, 3,4 quarter
                    # now the home actually gets switch to the other court
                    if sum(home_moment_x>=half_court)==5 and sum(away_moment_x>=half_court)==5:
                        moments[i][5][1:6], moments[i][5][6:11] = moments[i][5][6:11], moments[i][5][1:6]
                        for l in moments[i][5][1:6]:
                            l[2] = full_court - l[2]
                        for l in moments[i][5][6:11]:
                            l[2] = full_court - l[2]
                else:
                    print('Should not be here, check quarter value')
        return moments
    return [reorder_teams_(m, game_id) for m in df.moments.values]

In [20]:
home_id

1610612741

In [21]:
court_index[int(game_id)]

0

In [22]:
r3 = reorder_teams(events_df3, game_id)
events_df4 = pd.DataFrame({'moments': r3})

In [23]:
r3[0][0]

[1,
 1451351428029,
 708.28,
 12.78,
 None,
 [[-1, -1, 18.38063, 14.07976, 8.56325],
  [1610612741, 2200, 11.15334, 21.35529, 0.0],
  [1610612741, 201959, 16.79035, 20.55978, 0.0],
  [1610612741, 201565, 9.12233, 39.32051, 0.0],
  [1610612741, 202710, 21.15543, 32.71616, 0.0],
  [1610612741, 202703, 8.40459, 11.67492, 0.0],
  [1610612761, 2449, 24.18381, 44.21187, 0.0],
  [1610612761, 201960, 1.06327, 2.54971, 0.0],
  [1610612761, 200768, 7.28146, 48.40417, 0.0],
  [1610612761, 201942, 18.1243, 14.25539, 0.0],
  [1610612761, 202687, 10.82794, 26.70275, 0.0]]]

In [24]:
m = np.array(r3[0][0][5])

In [25]:
m[0]

array([-1.     , -1.     , 18.38063, 14.07976,  8.56325])

In [26]:
m[1]

array([1.61061274e+09, 2.20000000e+03, 1.11533400e+01, 2.13552900e+01,
       0.00000000e+00])

In [27]:
# features = np.concatenate((m[1:11, 2:4].reshape(-1), m[0][2:5], np.array([r3[0][0][0]]), np.array([r3[0][0][2]])))#,2]]))

In [28]:
# features

In [29]:
# len(features)

In [30]:
# events_df4.moments.loc[0]

In [31]:
def flatten_moments(events_df):
    df = events_df.copy()
    def flatten_moment(moment):
        m = np.array(moment[5])
        features = np.concatenate((m[1:11, 2:4].reshape(-1),    # x,y of all 10 players 
                                   m[0][2:5],                   # basketball x,y,z 
                                   np.array([moment[0]]),       # quarter number 
                                   np.array([moment[2]]),       # time in seconds left to the end of the period
                                   np.array([moment[3]])))      # shot clock 
        return features
    
    def get_team_ids(moment):
        m = np.array(moment[5])
        team_id1 = set(m[1:6, 0])
        team_id2 = set(m[6:11, 0])
        assert len(team_id1) == len(team_id2) == 1
        assert team_id1 != team_id2
        return [list(team_id1)[0], list(team_id2)[0]]
        
        
    df['flattened'] = df.moments.apply(lambda ms: [flatten_moment(m) for m in ms])
    df['team_ids'] = df.moments.apply(lambda ms: get_team_ids(ms[0])) # just use the first one to determine        
    
    return df['flattened'].values, df['team_ids'].values

r4, team_ids = flatten_moments(events_df4)
events_df5 = pd.DataFrame({'moments': r4})   


def create_static_features(events_df):
    df = events_df.copy()
    def create_static_features_(moment):
        ''' moment: flatten moment i.e. (25=10*2+3+2,)'''
        # distance of each players to the ball
        player_xy = moment[:10*2]
        b_xy = moment[10*2:10*2+2]
        hoop_xy = np.array([3.917, 25])

        def disp_(pxy, target):
            # dispacement to bball
            disp = pxy.reshape(-1, 2) - np.tile(target, (10, 1))
            r = np.sqrt(disp[:,0]**2 + disp[:, 1]**2)               # r 
            cos_theta = disp[:, 0]/r                                # costheta
            sin_theta = disp[:, 1]/r                                # sintheta
            theta = np.arccos(cos_theta)                            # theta
            return np.concatenate((r, cos_theta, sin_theta, theta))
        return np.concatenate((moment, disp_(player_xy, b_xy), disp_(player_xy, hoop_xy)))
    df['enriched'] = df.moments.apply(lambda ms: np.vstack([create_static_features_(m) for m in ms]))
    return df['enriched'].values
    
r5 = create_static_features(events_df5)
events_df6 = pd.DataFrame({'moments': r5})

In [32]:
def create_dynamic_features(events_df, fs):
    df = events_df.copy()
    def create_dynamic_features_(moments, fs):
        ''' moments: (moments length, n existing features)'''
        pxy = moments[:, :23] # get the players x,y and basketball x,y,z coordinates
        next_pxy = np.roll(pxy, -1, axis=0) # get next frame value
        vel = ((next_pxy - pxy)/fs)[:-1, :] # the last velocity is not meaningful
        # when we combine this back to the original features, we shift one done,
        # i.e. [p1, p2, ..., pT] combine [_, p2-p1, ...., pT-pT_1]
        # the reason why we shift is that we don't want to leak next position info
        return np.column_stack([moments[1:, :], vel])
    df['enriched'] = df.moments.apply(lambda ms: create_dynamic_features_(ms, fs))
    return df['enriched'].values


In [33]:
r6 = create_dynamic_features(events_df6, 1/25.)
events_df7 = pd.DataFrame({'moments': r6})

In [34]:
r6[0][0]

array([ 1.09922700e+01,  2.13145200e+01,  1.64770900e+01,  2.04079900e+01,
        9.04023000e+00,  3.92987900e+01,  2.12817200e+01,  3.28909500e+01,
        8.39265000e+00,  1.16947200e+01,  2.42594200e+01,  4.41244400e+01,
        1.11990000e+00,  2.57540000e+00,  7.33219000e+00,  4.82904400e+01,
        1.80694800e+01,  1.42062100e+01,  1.04847800e+01,  2.66978100e+01,
        1.82924900e+01,  1.41473300e+01,  8.97331000e+00,  1.00000000e+00,
        7.08240000e+02,  1.27500000e+01,  1.02304362e+01,  6.51855358e+00,
        2.67992585e+01,  1.89804844e+01,  1.01991239e+01,  3.05651988e+01,
        2.07076655e+01,  3.58591709e+01,  2.30651934e-01,  1.47808959e+01,
       -7.13578568e-01, -2.78497366e-01, -3.45243134e-01,  1.57489658e-01,
       -9.70655923e-01,  1.95219735e-01, -8.29286623e-01, -3.05648450e-01,
       -9.66868114e-01, -5.28229821e-01,  7.00575212e-01,  9.60436993e-01,
        9.38513281e-01,  9.87520636e-01, -2.40472616e-01,  9.80759530e-01,
       -5.58823494e-01,  

##### role alignment

In [35]:
# 10*2 (10 players with x,y) + 3(bball x,y,z) + 1(qtr number) + 1(time left in qtr) + 1(sc) + 
# 10*(4(r,cos,sin,theta)_bball + 4(r, cos, sin, theta)_hoop) + 10*2 (10 players vx, vy) + 3(bball vx,vy,vz)
n_fts = 10*2 + 3 + 1 + 1 + 1 + 10*(4+4) + 10*2 + 3
n_fts

129

In [36]:
class HiddenStructureLearning:
    def __init__(self, events_df):
        self.df = events_df.copy()
        self.defend_players = list(range(5))
        self.offend_players = list(range(5, 10))
        
    def find_features_ind_(self, player):
        assert player < 10
        pxy_ind = [player*2, player*2+1]
        bball_xy_ind = [2*10, 2*10+1, 2*10+2]
        qtr_ind = [23]
        time_left_ind = [24]
        sc_ind = [25]
        polar_bball_ind = [26+player*4, 26+player*4+1, 26+player*4+2, 26+player*4+3]
        polar_hoop_ind = [66+player*4, 66+player*4+1, 66+player*4+2, 66+player*4+3]
        pvxy_ind = [106+player*2, 106+player*2+1]
        bball_vxy_ind = [126, 127, 128]
        player_features_ind = pxy_ind + polar_bball_ind + polar_hoop_ind + pvxy_ind
#         features_ind = np.array(pxy_ind + bball_xy_ind + qtr_ind + time_left_ind + sc_ind + polar_bball_ind \
#                      + polar_hoop_ind + pvxy_ind + bball_vxy_ind)
        features_ind = np.array(pxy_ind + bball_xy_ind + sc_ind + polar_bball_ind \
                     + polar_hoop_ind + pvxy_ind + bball_vxy_ind)
        return player_features_ind, features_ind
    
    def create_hmm_input_(self, player_inds):
        event = self.df.moments.values
        X = np.concatenate([np.concatenate([ms[:, self.find_features_ind_(player)[1]] for ms in event], axis=0) \
                            for player in player_inds], axis=0)
        lengths = np.concatenate([[len(ms) for ms in event] for _ in range(len(player_inds))],
                                 axis=0)
        assert len(event[0]) == lengths[0]
        assert len(event[-1]) == lengths[-1]
        return X, lengths
    
    def train_hmm_(self, player_inds, verbose=True, random_state=42):
        from hmmlearn import hmm
        assert len(player_inds) == 5 # defend and offend players each are five
        X, lengths = self.create_hmm_input_(player_inds=player_inds)
        model = hmm.GaussianHMM(n_components=5, 
                                covariance_type='full', 
                                n_iter=50, 
                                random_state=random_state,
                                verbose=verbose)
        model.fit(X, lengths)
        state_sequence = model.predict(X, lengths)
        state_sequence_prob = model.predict_proba(X, lengths) # (n_samples, n_components)
        n_samples, _ = state_sequence_prob.shape
        cmeans = model.means_
        return {'X': X,
                'lengths': lengths,
                'state_sequence': state_sequence.reshape(5, -1), 
#                 'state_sequence_prob': [state_sequence_prob[i:i+n_samples//5] for i in range(0, n_samples, n_samples//5)], 
                'state_sequence_prob': state_sequence_prob,
                'cmeans': cmeans}
    
    def assign_roles(self, player_inds, mode='euclidean'):
        result = self.train_hmm_(player_inds=player_inds)
        if mode == 'euclidean':
            ed = distance.cdist(result['X'], result['cmeans'], 'euclidean')
        if mode == 'cosine':
            ed = distance.cdist(result['X'], result['cmeans'], 'cosine')
        elif mode == 'post':
            print(np.sum(result['state_sequence_prob'], axis=1))
#             assert sum(np.sum(result['state_sequence_prob'], axis=1) == 1) == len(result['state_sequence_prob'])
            ed = 1 - result['state_sequence_prob']
            
        assert len(player_inds) == 5
        n = len(ed)//5 # number of sequences for each players
        assert len(ed) % 5 == 0 # it should be divisibe by number of players
        
        def assign_ind_(cost):
            row_ind, col_ind = linear_sum_assignment(cost)
            return col_ind
        
        role_assignments = np.array([assign_ind_(ed[np.arange(5)*n + i]) for i in range(n)])
        return role_assignments, result
    
    def reorder_moment(self):
        defend_role_assignments, defend_result = self.assign_roles(player_inds=HSL.defend_players)
        offend_role_assignments, offend_result = self.assign_roles(player_inds=HSL.offend_players)
        
        original = copy.deepcopy(self.df.moments.values)
        reordered = copy.deepcopy(self.df.moments.values)
        def reorder_moment_(original, reordered, role_assignments):
            divider = 0
            lengths = [len(m) for m in original]
            # iteratve through each moments length
            for i in range(len(lengths)):
                # grab the corresponding moments' reordered roles
                ra_i = role_assignments[divider:divider+lengths[i]]
                # update the next starting index
                divider += lengths[i]
                # iterate through each moment in the current moments
                for j in range(lengths[i]):
                    # iterate through each players
                    for p in HSL.defend_players:
                        # get the current player feature index
                        p_ind = HSL.find_features_ind_(p)[0]
                        # get the player feature index corresponding to the reordered role
                        re_p_ind = HSL.find_features_ind_(ra_i[j][p])[0]
                        reordered[i][j][re_p_ind] = original[i][j][p_ind]
            return reordered
        reordered_defend = copy.deepcopy(reorder_moment_(original, reordered, defend_role_assignments))
        reordered_all = copy.deepcopy(reorder_moment_(original, reordered_defend, offend_role_assignments))
        return reordered_all

In [37]:
# HSL.defend_players

In [38]:
HSL = HiddenStructureLearning(events_df7)
role_assignments, result1 = HSL.assign_roles(player_inds=HSL.defend_players, mode='post')

         1    -9364457.4715             +nan
         2    -8347618.3232    +1016839.1482
         3    -7312731.2810    +1034887.0423
         4    -7247373.4453      +65357.8356
         5    -7234939.6452      +12433.8001
         6    -7224795.4171      +10144.2281
         7    -7213962.3048      +10833.1124
         8    -7205298.9951       +8663.3097
         9    -7197884.0653       +7414.9298
        10    -7192165.1247       +5718.9406
        11    -7185985.7148       +6179.4099
        12    -7180329.3359       +5656.3789
        13    -7178879.0334       +1450.3025
        14    -7178434.8762        +444.1571
        15    -7178204.0477        +230.8285
        16    -7178144.0802         +59.9675
        17    -7178130.8593         +13.2209
        18    -7178078.1963         +52.6631
        19    -7178069.2113          +8.9849
        20    -7178068.4769          +0.7345
        21    -7178068.1952          +0.2816
        22    -7178068.0225          +0.1728
        23

[1. 1. 1. ... 1. 1. 1.]


In [None]:
result1['state_sequence']

In [None]:
result['state_sequence']

In [None]:
np.sum(result1['state_sequence'][3] == result['state_sequence'][3])

In [None]:
result['state_sequence'][0].shape

In [65]:
# n_rows = [len(i) > 0 for i in test_seq]
# n_cols = [i.shape[1] for i in test_seq]
# assert len(set(n_cols)) == 1
# assert sum(n_rows) == len(n_cols)

In [66]:
# from preprocessing import subsample_sequence

In [67]:
# test_seq[2]

In [68]:
# subsample_sequence(test_seq, 2)

In [74]:
a = np.array([1,2,3])
b = pd.DataFrame({'A':a})

In [78]:
b['B'] = b.A.apply(lambda x: np.array([0]*x))

In [81]:
b['B'][0]

array([0])

[7, 6, 2, 7, 9, 3, 2, 8, 5, 4]

In [None]:
r = HSL.reorder_moment()

In [None]:
lengths[0]

In [None]:
len(role_assignments)

In [None]:
events_df7.moments.values

In [None]:
# original = copy.deepcopy(events_df7.moments.values)
# reordered = copy.deepcopy(events_df7.moments.values)
# divider = 0
# lengths = [len(m) for m in original]
# # iteratve through each moments length
# for i in range(len(lengths)):
# #     print(i, len(lengths))
#     # grab the corresponding moments' reordered roles
#     ra_i = role_assignments[divider:divider+lengths[i]]
#     # update the next starting index
#     divider += lengths[i]
#     # iterate through each moment in the current moments
#     for j in range(lengths[i]):
#         # iterate through each players
#         for p in HSL.defend_players:
#             # get the current player feature index
#             p_ind = HSL.find_features_ind_(p)[0]
#             # get the player feature index corresponding to the reordered role
#             re_p_ind = HSL.find_features_ind_(ra_i[j][p])[0]
#             reordered[i][j][re_p_ind] = original[i][j][p_ind] 
    

In [None]:
r[0]

In [None]:
team_ids

In [None]:
OHE = OneHotEncoding()
final = [np.column_stack((r[i], np.tile(OHE.encode(team_ids[i]), (len(r[i]), 1)))) for i in range(len(r))]

In [None]:
cmeans.shape

In [None]:
result[1]['X']

In [None]:
from scipy.spatial import distance
ed = distance.cdist(result[1]['X'], result[1]['cmeans'], 'euclidean')

In [None]:
ed.shape

In [None]:
concated_ms[0]

In [None]:
cmeans[0]

In [None]:
np.sqrt(sum((concated_ms[0]-cmeans[0])**2))

In [None]:
np.sqrt(sum((concated_ms[0]-cmeans[1])**2))

In [None]:
ed[0]

In [None]:
n = len(ed)//5 # number of sequences
assert len(ed) % 5 == 0

In [None]:
cost = ed[np.arange(5)*n]
cost

In [None]:
# 1) minimize the wrong posterior i.e. 1 - posterior
# 2) euclidean distance to the means
# 3) adjusted consine similarity to the means

In [None]:
# defend_X, defend_lengths = create_hmm_input(events_df7, players=list(range(5)))
# offend_X, offend_lengths = create_hmm_input(events_df7, players=list(range(5, 10)))

In [None]:
# defend_model = hmm.GaussianHMM(n_components=5, covariance_type='diag', n_iter=50, verbose=True)#, random_state=42)
# defend_model.fit(defend_X, defend_lengths)

In [None]:
# offend_model = hmm.GaussianHMM(n_components=5, covariance_type='diag', n_iter=50, verbose=True)#, random_state=42)
# offend_model.fit(offend_X, offend_lengths)

In [None]:
# from hmmlearn import hmm
# model = hmm.GaussianHMM(n_components=5, covariance_type='diag', n_iter=50, verbose=True)
# model.fit(X, lengths)

In [None]:
# cmeans = defend_model.means_
# covars = defend_model.covars_

In [None]:
# state_sequence = defend_model.predict(defend_X, defend_lengths)

In [None]:
d = defend_lengths.reshape(-1, 5)

In [None]:
s = state_sequence.reshape(-1, 5)

In [None]:
s.shape[0]

In [None]:
a = np.arange(10)
a

In [None]:
a.reshape(5, -1).T

In [None]:
defend_lengths.shape

In [None]:
l = defend_lengths.reshape(5, -1)

In [None]:
l[0,:] == l[1,:]

1) sumsample should only happen all the way until the end, e.g. if velocity is computed after subsample then the direction of the velocity will result in more errors.