In [7]:
import pandas as pd

# Load the Kaggle games.csv export.
df = pd.read_csv("games.csv", low_memory=False)

# Convert GAME_DATE_EST to datetime. errors="coerce" turns unparseable values
# into NaT; NaT compares False against both bounds below, so such rows are
# dropped by the Step 2 filter rather than raising here.
df["GAME_DATE_EST"] = pd.to_datetime(df["GAME_DATE_EST"], errors="coerce")

# --- Step 1: Define start and end dates ---
start_date = pd.to_datetime("2018-10-16")   # Opening night of 2018–19 season
end_date   = pd.to_datetime("2022-06-16")   # Game 6 of 2022 Finals

# --- Step 2: Filter games within this window (both bounds inclusive) ---
df = df[(df["GAME_DATE_EST"] >= start_date) & (df["GAME_DATE_EST"] <= end_date)]

# --- Step 3: Drop unnecessary columns ---
# errors="ignore" lets the drop succeed even when some of these columns are
# absent from the CSV.
drop_cols = ["GAME_STATUS_TEXT", "HOME_TEAM_ID", "VISITOR_TEAM_ID", "TEAM_ID_home", "TEAM_ID_away"]
df = df.drop(columns=drop_cols, errors="ignore")

# --- Step 4: Rename columns for clarity ---
df = df.rename(columns={
    "GAME_DATE_EST": "date",
    "PTS_home": "home_points",
    "FG_PCT_home": "home_fg_pct",
    "FT_PCT_home": "home_ft_pct",
    "FG3_PCT_home": "home_fg3_pct",
    "AST_home": "home_ast",
    "REB_home": "home_reb",
    "PTS_away": "away_points",
    "FG_PCT_away": "away_fg_pct",
    "FT_PCT_away": "away_ft_pct",
    "FG3_PCT_away": "away_fg3_pct",
    "AST_away": "away_ast",
    "REB_away": "away_reb",
    "HOME_TEAM_WINS": "home_win"
})

# --- Step 5: Ensure numeric columns are floats ---
numeric_cols = [
    "home_points","home_fg_pct","home_ft_pct","home_fg3_pct","home_ast","home_reb",
    "away_points","away_fg_pct","away_ft_pct","away_fg3_pct","away_ast","away_reb"
]
# pd.to_numeric coerces non-numeric entries to NaN but returns int64 for a
# column that holds only integers, so the explicit astype is what actually
# guarantees float64 for every stat column.
df[numeric_cols] = (
    df[numeric_cols]
    .apply(pd.to_numeric, errors="coerce")
    .astype("float64")
)

# --- Step 6: Feature engineering ---
df["point_diff"] = df["home_points"] - df["away_points"]   # home minus away score
df["total_points"] = df["home_points"] + df["away_points"] # combined score

# Reset index so rows are numbered 0..n-1 after filtering
df = df.reset_index(drop=True)

# NOTE(review): no sort is applied, so rows keep the CSV's order. In the
# recorded run that order is newest-first: head() starts at 2022-06-16 and the
# last row is 2018-10-16. Sort by "date" if later cells need chronological order.
print(df.head())  # first rows of the window as stored in the CSV
print(df.tail())  # last rows of the window as stored in the CSV


        date   GAME_ID  SEASON  home_points  home_fg_pct  home_ft_pct  \
0 2022-06-16  42100406    2021         90.0        0.425        0.917   
1 2022-06-13  42100405    2021        104.0        0.466        0.867   
2 2022-06-10  42100404    2021         97.0        0.400        0.737   
3 2022-06-08  42100403    2021        116.0        0.483        0.708   
4 2022-06-05  42100402    2021        107.0        0.453        0.700   

   home_fg3_pct  home_ast  home_reb  away_points  away_fg_pct  away_ft_pct  \
0         0.393      27.0      41.0        103.0        0.413        1.000   
1         0.225      23.0      39.0         94.0        0.413        0.677   
2         0.395      22.0      42.0        107.0        0.440        0.800   
3         0.371      28.0      47.0        100.0        0.462        0.867   
4         0.405      25.0      42.0         88.0        0.375        0.765   

   away_fg3_pct  away_ast  away_reb  home_win  point_diff  total_points  
0         0.413   

In [8]:
# Show the final row of the frame, addressed by its position.
print(df.iloc[len(df) - 1])

date            2018-10-16 00:00:00
GAME_ID                    21800002
SEASON                         2018
home_points                   108.0
home_fg_pct                   0.442
home_ft_pct                   0.944
home_fg3_pct                  0.269
home_ast                       28.0
home_reb                       58.0
away_points                   100.0
away_fg_pct                   0.363
away_ft_pct                   0.649
away_fg3_pct                   0.27
away_ast                       21.0
away_reb                       45.0
home_win                          1
point_diff                      8.0
total_points                  208.0
Name: 5190, dtype: object
