# Code to Scrape DARKO and BPM


In [None]:
import os
import time

import numpy as np
import pandas as pd

os.environ["R_HOME"] = "C:\\Program Files\\R\\R-4.3.2\\"
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr


# Output directories, relative to the notebook's working directory.
# export_DIR1 receives the bbref player files, export_DIR2 the DARKO files.
# String concatenation is used to build paths, so the trailing "/" is required.
export_DIR1 = "../data/bbref/"
export_DIR2 = "../data/all_in_one_metrics/"

# Python Method

In [None]:
# Fetch five tabs of the DARKO Google Sheet through the CSV export endpoint
# and store each one as CSV and Parquet under export_DIR2.
sheet = "1mhwOLqPu2F9026EQiVxFPIN1t9RGafGpl-dokaIsm9c"
sheet_ids = [1064086941, 142925152, 284274620, 923517192, 1503564342]

# All downloads happen before any file is written.
dfa = []
for sheet_id in sheet_ids:
    url = f'https://docs.google.com/spreadsheets/d/{sheet}/gviz/tq?tqx=out:csv&gid={sheet_id}'
    df = pd.read_csv(url)
    dfa.append(df)
    time.sleep(0.5)  # brief pause between consecutive requests

# The first tab gets a full positional set of column names (it must have
# exactly eleven columns); this relabels dfa[0] in place.
df1 = dfa[0]
df1.columns = [
    "idPlayerNBA",
    "namePlayer",
    "position",
    "age",
    "dpm",
    "o_dpm",
    "d_dpm",
    "box_odpm",
    "box_ddpm",
    "on_off_odpm",
    "on_off_ddpm",
]

# The remaining tabs only have their id/name columns renamed.
id_name_columns = {"nba_id": "idPlayerNBA", "player_name": "namePlayer"}
df2, df3, df4, df5 = (
    frame.rename(columns=id_name_columns) for frame in dfa[1:]
)

# (frame, csv file name, parquet file name), written in this order.
exports = [
    (df1, "NBA_DARKO_Current.csv", "NBA_DARKO_Current.parquet"),
    (df2, "NBA_DARKO_History.csv", "NBA_DARKO_History.parquet"),
    (df3, "NBA_DARKO_BoxScore_Talent.csv", "NBA_DARKO_BoxScore_Talent.parquet"),
    (df4, "NBA_DARKO_Time_Decay_RAPM.csv", "NBA_DARKO_Time_Decay_RAPM.parquet"),
    (df5, "NBA_DARKO_Time_Decay_RAPM_Pace.csv", "NBA_DARKO_Time_Decay_RAPM_Pace.parquet"),
]
for frame, csv_name, parquet_name in exports:
    frame.to_csv(export_DIR2 + csv_name)
    frame.to_parquet(export_DIR2 + parquet_name)

In [None]:
# Load the R package nbastatR through rpy2; update_bbref() calls it.
nbastatR = importr('nbastatR')
# Set the R-side environment variable VROOM_CONNECTION_SIZE to 131072 * 2.
# NOTE(review): presumably needed so large downloads read by vroom are not
# cut off by the default connection buffer — confirm.
robjects.r('''
                Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
        ''')

In [None]:
def update_bbref(seasons):
    """Fetch advanced player stats per season via nbastatR and save them.

    For each entry of ``seasons`` the value plus one is printed and passed
    to ``nbastatR.bref_players_stats`` (presumably because the R function
    identifies a season by its ending year — confirm). The R result is
    converted to a pandas DataFrame, five URL columns are cast to ``str``,
    and the frame is written as CSV and Parquet under ``export_DIR1``. The
    file names use the original, un-incremented ``season`` value.

    Args:
        seasons: Iterable of season values accepted by ``int()``.

    Returns:
        None. The function is called for its file-writing side effects.
    """
    url_columns = [
        'urlPlayerThumbnail',
        'urlPlayerHeadshot',
        'urlPlayerPhoto',
        'urlPlayerStats',
        'urlPlayerActionPhoto',
    ]
    # Options that are the same for every season.
    fetch_options = {
        "tables": "advanced",
        "include_all_nba": False,
        "only_totals": False,
        "nest_data": False,
        "assign_to_environment": True,
        "widen_data": True,
        "join_data": True,
        "return_message": True,
    }

    for season in seasons:
        bref_season = str(int(season) + 1)
        print(bref_season)
        r_frame = nbastatR.bref_players_stats(seasons=bref_season, **fetch_options)

        # Convert the R data frame to pandas inside a local conversion context.
        converter = robjects.default_converter + pandas2ri.converter
        with converter.context():
            stats = robjects.conversion.get_conversion().rpy2py(r_frame)

        # Force the URL columns to plain strings before exporting.
        stats[url_columns] = stats[url_columns].astype(str)
        stats.to_csv(export_DIR1 + f"NBA_bbref_Player_Adv_{season}.csv")
        stats.to_parquet(export_DIR1 + f"NBA_bbref_Player_Adv_{season}.parquet")

        # Pause between seasons before issuing the next request.
        time.sleep(5)

In [None]:
# Run the bbref export for every start year in [season_start, season_end).
season_start, season_end = 2000, 2024
seasons = np.arange(season_start, season_end).astype(str)
update_bbref(seasons)

# Deprecated Code 
## Using R to scrape the data
### Credits to Krishna Narsu for the DARKO R code

In [None]:
# NOTE(review): this bare name is not defined anywhere in the visible code, so
# evaluating it raises NameError. Presumably a deliberate stop so that
# "Run All" never reaches the deprecated cells below — confirm before removing.
sdasdasdas

In [None]:
# (Re)load the rpy2 IPython extension, which provides the %%R cell magic used below.
%reload_ext rpy2.ipython

In [None]:
# Start with an empty list.
df_R = list()

In [None]:
# Placeholders for the three names the %%R cell exports with -o.
bpm = []
dpm = []
dpmhistory = []

In [None]:
%%R -o bpm,dpm,dpmhistory
# R cell: fetches bref advanced player stats and two DARKO sheet tabs, then
# hands bpm, dpm and dpmhistory back to Python through the -o option above.
library(tidyverse)
library(nbastatR)
library(googlesheets4)
library(googledrive)
# NOTE(review): tidyverse is already attached above; this second call is redundant.
library(tidyverse)

# Same VROOM_CONNECTION_SIZE setting as the Python-side setup cell.
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
# Season passed to bref_players_stats(); the Python cell that follows saves
# the result under a "2023" file name.
season <- 2024

# Advanced player table for the chosen season.
bpm <- bref_players_stats(
  seasons = season,
  tables = "advanced",
  include_all_nba = FALSE,
  only_totals = FALSE,
  nest_data = FALSE,
  assign_to_environment = TRUE,
  widen_data = TRUE,
  join_data = TRUE,
  return_message = TRUE
)

# Google Drive authentication for the given account.
# NOTE(review): drive_auth() belongs to googledrive, while the reads below go
# through googlesheets4 — confirm the credentials are actually shared
# (googlesheets4 has its own gs4_auth()).
drive_auth(email = "sra.djoker@gmail.com")
# Current DARKO DPM table, read from the "Current DPM Ranks" tab.
dpm <- googlesheets4::read_sheet(ss = "1mhwOLqPu2F9026EQiVxFPIN1t9RGafGpl-dokaIsm9c",
                                 sheet = "Current DPM Ranks") %>% data.frame()

# Historical DPM table, read from the "Full DPM History" tab of the same spreadsheet.
dpmhistory <- googlesheets4::read_sheet(ss = "1mhwOLqPu2F9026EQiVxFPIN1t9RGafGpl-dokaIsm9c",
                                        sheet = "Full DPM History") %>% data.frame()

In [None]:
# Save the full bref table, then keep only the player id and the three BPM
# ratio columns for rows with a positive NBA player id.
bpm.to_csv(export_DIR1 + "NBA_bbref_Player_Adv_2023.csv")
bpm.to_parquet(export_DIR1 + "NBA_bbref_Player_Adv_2023.parquet")

bpm_columns = ["idPlayerNBA", 'ratioOBPM', 'ratioDBPM', 'ratioBPM']
dfb = bpm[bpm_columns].reset_index(drop=True)
dfb = dfb.assign(idPlayerNBA=dfb["idPlayerNBA"].astype(int))
dfb = dfb[dfb["idPlayerNBA"] > 0]

In [None]:
# Normalise the current-DARKO table pulled in by the R cell and export it.
#
# The columns are relabelled positionally, so the sheet must have exactly
# these eleven columns in this order (otherwise pandas raises ValueError).
# A copy is taken first so the original `dpm` frame keeps its own labels.
# Earlier label-based clean-up (a rename plus "Offensive"/"Defensive"/"."
# replacements) was dropped: the positional assignment overwrote the rename,
# and none of the assigned names contain those substrings.
dfd = dpm.copy()
dfd.columns = [
    'idPlayerNBA',
    'namePlayer',
    'Position',
    'age',
    'dpm',
    'o_dpm',
    'd_dpm',
    'box_odpm',
    'box_ddpm',
    'on_off_odpm',
    'on_off_ddpm',
]
dfd["idPlayerNBA"] = dfd["idPlayerNBA"].astype(int)

# The exported files keep the Position column; only the in-memory frame used
# for the later merge drops it.
dfd.to_csv(export_DIR2 + "NBA_DARKO_Current.csv")
dfd.to_parquet(export_DIR2 + "NBA_DARKO_Current.parquet")
dfd = dfd.drop(columns=["Position"])

In [None]:
# Right join: every row of dfb is kept; DARKO columns are NaN where no match exists.
df1 = dfd.merge(dfb, on="idPlayerNBA", how="right")

In [None]:
# Row counts of the two inputs and of the merged result, one per line.
for frame in (dfd, dfb, df1):
    print(len(frame))

In [None]:
# Export the full DARKO history, then reduce the in-memory frame to the
# season-2024 rows without the team and season columns.
dfdh = dpmhistory.rename(
    columns={"nba_id": "idPlayerNBA", "player_name": "namePlayer"}
)
dfdh = dfdh.astype({"idPlayerNBA": int})
dfdh.to_csv(export_DIR2 + "NBA_DARKO_History.csv")
dfdh.to_parquet(export_DIR2 + "NBA_DARKO_History.parquet")
dfdh = (
    dfdh.drop(columns=["tm_id", "team_name"])
    .query("season == 2024")
    .drop(columns=["season"])
)

In [None]:
# Boolean mask: True for rows of df1 with a missing value in any column.
nan_rows = df1.isnull().any(axis="columns")

In [None]:
# Player ids of the rows flagged by nan_rows.
missing_players = df1.loc[nan_rows, "idPlayerNBA"].tolist()

In [None]:
# History rows belonging to the players that have gaps in df1.
in_missing = dfdh["idPlayerNBA"].isin(missing_players)
df2 = dfdh.loc[in_missing].reset_index(drop=True)

In [None]:
# Incomplete rows of df1, ordered by player id, stripped of the name, age and
# DPM columns so only the id and the bref columns remain.
darko_columns = [
    'namePlayer',
    'age',
    'dpm',
    'o_dpm',
    'd_dpm',
    'box_odpm',
    'box_ddpm',
    'on_off_odpm',
    'on_off_ddpm',
]
df1nan = (
    df1.loc[nan_rows]
    .sort_values("idPlayerNBA")
    .reset_index(drop=True)
    .drop(columns=darko_columns)
)

In [None]:
# Rows of df1 with no missing values at all.
df1n = df1.dropna(axis=0, how="any")

In [None]:
# Inner join: pair the history rows with the bref columns of the incomplete rows.
df3 = df2.merge(df1nan, on="idPlayerNBA")

In [None]:
# Stack the complete rows on top of the rows back-filled from the history table.
df4 = pd.concat([df1n, df3], axis=0)

In [None]:
# Write the combined table (index included) as CSV.
df4.to_csv(path_or_buf=export_DIR2 + "NBA_DARKO_Processed.csv")