In [1]:
import numpy as np
import pandas as pd
import sportsdataverse

In [2]:
# function that will do inputed year to output final dataframe
# currently take in one year and will gave you a random number of games from year
def randomGamesFromSeason(year, numGames):
    """
    Returns a cleaned dataframe with a number of games
 
    Args:
        year (ints): Year you wish to collect data fram
        games (int): The number of games you wish to collect from specific season
 
    Returns:
        dataframe: Dataframe containing a random selection of games from the season selected
    """
    # import data
    mbb_df = sportsdataverse.mbb.load_mbb_pbp(seasons=[year], return_as_pandas=True)
    # Select random 10 games from the season data
    game_groups = mbb_df.groupby('game_id')
    unique_groups = game_groups.groups.keys()
    random_10_groups = np.random.choice(list(unique_groups), size=numGames, replace=False)
    random_games = mbb_df[mbb_df['game_id'].isin(random_10_groups)] # now has 10 random games from the data frame
    mbb_df = random_games
    
    # Reduce columns in dataframe
    mbb_df = mbb_df[['id', 'away_score',
       'home_score','start_quarter_seconds_remaining','period', 'away_team_id', 'team_id', 'game_id' ]]
    mbb_copy = mbb_df.copy()
    # create score difference column
    mbb_copy['score_difference'] = mbb_copy['home_score'] - mbb_copy['away_score'] # gets score_diff column
    mbb = mbb_copy
    # create seconds_remaining column based on seconds left in whole game
    mbb['seconds_remaining'] = mbb.apply(
    lambda row: row['start_quarter_seconds_remaining'] if row['period'] != 1 else row['start_quarter_seconds_remaining'] + 1200,
    axis=1
    )
    # Get possession column
    mbb = mbb.dropna() # DROP NA DROPS TOO MANY COLUMNS
    mbb['possession'] = mbb.apply(
    lambda row: 1 if row['away_team_id'] != row['team_id'] else 0,
    axis=1
    )
    # Get home_win colunn based on each game
    groups = mbb.groupby('game_id')
    for group_name, group_data in groups:
        if group_data.iloc[-1]['score_difference'] > 0:
            mbb.loc[mbb['game_id'] == group_name, 'home_win'] = 1
        else:
            mbb.loc[mbb['game_id'] == group_name, 'home_win'] = 0
    mbb_condensed = mbb[['away_score', 'home_score', 'score_difference', 'seconds_remaining', 'possession', 'home_win', 'game_id']]
    mbb_condensed['year'] = year
    return mbb_condensed
    


In [3]:
# Gives you a random s
def multipleSeasons(years, games):
    """
    Returns a number of games over a span of seasons
 
    Args:
        years (list of ints): All years you wish to collect data from that season. 
        games (int): The number of games you wish to collect from each season
 
    Returns:
        dataframe: Dataframe containing a random selection of games from each season
    """
    df = randomGamesFromSeason(years[0], games)
    for i in range(1, len(years)):
        df_temp = randomGamesFromSeason(years[i], games)
        df = pd.concat([df, df_temp])
        
    return df
        

In [4]:
seasons = multipleSeasons([2018, 2019], 5)

100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.49s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mbb['possession'] = mbb.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mbb.loc[mbb['game_id'] == group_name, 'home_win'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mbb_condensed['year'] = year
100%|█████

In [5]:
seasons

Unnamed: 0,away_score,home_score,score_difference,seconds_remaining,possession,home_win,game_id,year
225945,0,0,0,2400.0,0,0.0,400987401,2018
225946,0,0,0,2395.0,0,0.0,400987401,2018
225947,0,0,0,2395.0,1,0.0,400987401,2018
225948,0,2,2,2387.0,1,0.0,400987401,2018
225949,0,2,2,2368.0,0,0.0,400987401,2018
...,...,...,...,...,...,...,...,...
1638008,69,83,14,40.0,0,1.0,401088556,2019
1638009,69,83,14,37.0,0,1.0,401088556,2019
1638010,69,83,14,28.0,1,1.0,401088556,2019
1638011,70,83,13,28.0,0,1.0,401088556,2019


In [6]:
seasons['game_id'].unique()

<ArrowExtensionArray>
[400987401, 400990869, 400988332, 400991102, 400990089, 401089071, 401086252,
 401083440, 401089484, 401088556]
Length: 10, dtype: int32[pyarrow]