In [1]:
import pandas as pd

# Build a cleaned per-game table from the Kaggle games.csv dump, restricted to
# a fixed date window, and export it as nba_games_2018_2022_clean.csv.

# Load Kaggle games.csv. The raw frame keeps its own name so the untouched
# input stays available for inspection after the cleaning steps below.
games_raw = pd.read_csv("games.csv", low_memory=False)

# Convert GAME_DATE_EST to datetime. Unparseable values become NaT; those rows
# fall out at the date filter because comparisons against NaT are False.
games_raw["GAME_DATE_EST"] = pd.to_datetime(games_raw["GAME_DATE_EST"], errors="coerce")

# --- Step 1: Define start and end dates (both bounds are inclusive) ---
start_date = pd.to_datetime("2018-10-16")   # Opening night of 2018–19 season
end_date   = pd.to_datetime("2022-06-16")   # Game 6 of 2022 Finals
# NOTE(review): a pure date window keeps every game type present in the file
# between these dates — confirm whether non-regular-season rows are wanted.

# --- Step 2: Build the row mask for games within this window ---
in_window = (games_raw["GAME_DATE_EST"] >= start_date) & (games_raw["GAME_DATE_EST"] <= end_date)

# --- Step 3: Keep only useful columns ---
keep_cols = [
    "GAME_ID","SEASON","GAME_DATE_EST",
    "HOME_TEAM_ID","VISITOR_TEAM_ID",
    "PTS_home","FG_PCT_home","FT_PCT_home","FG3_PCT_home","AST_home","REB_home",
    "PTS_away","FG_PCT_away","FT_PCT_away","FG3_PCT_away","AST_away","REB_away",
    "HOME_TEAM_WINS"
]
# Select rows and columns in one step and take an explicit copy: the column
# assignments further down then write to an independent frame instead of a
# slice of games_raw, which avoids SettingWithCopyWarning.
df = games_raw.loc[in_window, keep_cols].copy()

# --- Step 4: Rename columns for clarity ---
df = df.rename(columns={
    "GAME_DATE_EST": "date",
    "PTS_home": "home_points",
    "FG_PCT_home": "home_fg_pct",
    "FT_PCT_home": "home_ft_pct",
    "FG3_PCT_home": "home_fg3_pct",
    "AST_home": "home_ast",
    "REB_home": "home_reb",
    "PTS_away": "away_points",
    "FG_PCT_away": "away_fg_pct",
    "FT_PCT_away": "away_ft_pct",
    "FG3_PCT_away": "away_fg3_pct",
    "AST_away": "away_ast",
    "REB_away": "away_reb",
    "HOME_TEAM_WINS": "home_win"
})

# --- Step 5: Ensure stat columns are numeric ---
# errors="coerce" turns anything non-numeric into NaN rather than raising, so
# bad cells surface as NaN in the engineered features below.
numeric_cols = [
    "home_points","home_fg_pct","home_ft_pct","home_fg3_pct","home_ast","home_reb",
    "away_points","away_fg_pct","away_ft_pct","away_fg3_pct","away_ast","away_reb"
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# --- Step 6: Feature engineering ---
df["point_diff"] = df["home_points"] - df["away_points"]   # home minus away; negative means the visitor won
df["total_points"] = df["home_points"] + df["away_points"] # combined score

# --- Step 7: Export cleaned dataset ---
df.to_csv("nba_games_2018_2022_clean.csv", index=False)

# Row order is inherited from games.csv. In the recorded run the file is
# ordered newest-first, so head() shows the games ending June 16, 2022 and
# tail() shows the earliest games in the window.
print(df.head())
print(df.tail())


      GAME_ID  SEASON       date  HOME_TEAM_ID  VISITOR_TEAM_ID  home_points  \
542  42100406    2021 2022-06-16    1610612738       1610612744         90.0   
543  42100405    2021 2022-06-13    1610612744       1610612738        104.0   
544  42100404    2021 2022-06-10    1610612738       1610612744         97.0   
545  42100403    2021 2022-06-08    1610612738       1610612744        116.0   
546  42100402    2021 2022-06-05    1610612744       1610612738        107.0   

     home_fg_pct  home_ft_pct  home_fg3_pct  home_ast  home_reb  away_points  \
542        0.425        0.917         0.393      27.0      41.0        103.0   
543        0.466        0.867         0.225      23.0      39.0         94.0   
544        0.400        0.737         0.395      22.0      42.0        107.0   
545        0.483        0.708         0.371      28.0      47.0        100.0   
546        0.453        0.700         0.405      25.0      42.0         88.0   

     away_fg_pct  away_ft_pct  away_fg

In [9]:
# Print the first row (by position) of df as a Series, one field per line.
print(df.iloc[0, :])

date            2022-06-16 00:00:00
GAME_ID                    42100406
SEASON                         2021
home_points                    90.0
home_fg_pct                   0.425
home_ft_pct                   0.917
home_fg3_pct                  0.393
home_ast                       27.0
home_reb                       41.0
away_points                   103.0
away_fg_pct                   0.413
away_ft_pct                     1.0
away_fg3_pct                  0.413
away_ast                       27.0
away_reb                       44.0
home_win                          0
point_diff                    -13.0
total_points                  193.0
Name: 0, dtype: object
