In [1]:
import numpy as np
import pandas as pd
from IPython.display import display


In [2]:
df_fantasyPros = pd.read_csv('data/fantasyPros_L0.csv')

display(df_fantasyPros.head(5))
display(df_fantasyPros.info())


Unnamed: 0,NBA Fantasy Basketball Overall 2025-26 Average Projections | FantasyPros,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,Player,PTS,REB,AST,BLK,STL,FG%,FT%,3PM,GP,MIN,TO
1,Shai Gilgeous-Alexander (OKC - PG),32.4,5.3,6.3,0.9,1.8,0.52,0.893,1.8,76,34.9,2.5
2,"Giannis Antetokounmpo (MIL - PF,C)",32.2,11.6,6.6,1,1,0.586,0.638,0.4,74,35.4,3.5
3,"Luka Doncic (LAL - PG,SG)",29.7,8.5,8.5,0.5,1.5,0.478,0.774,3.1,74,36.4,3.9
4,Nikola Jokic (DEN - C),27.6,12.3,9.6,0.7,1.6,0.575,0.807,1.6,75,36.1,3.2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 12 columns):
 #   Column                                                                    Non-Null Count  Dtype 
---  ------                                                                    --------------  ----- 
 0   NBA Fantasy Basketball Overall 2025-26 Average Projections | FantasyPros  341 non-null    object
 1   Unnamed: 1                                                                281 non-null    object
 2   Unnamed: 2                                                                281 non-null    object
 3   Unnamed: 3                                                                281 non-null    object
 4   Unnamed: 4                                                                281 non-null    object
 5   Unnamed: 5                                                                281 non-null    object
 6   Unnamed: 6                                                                

None

In [3]:
# Set the first row as header, then remove it from the data
df_fantasyPros.columns = df_fantasyPros.iloc[0]
df_fantasyPros = df_fantasyPros[1:].reset_index(drop=True)

In [4]:
# 1) Normalize dashes/spaces
df_fantasyPros['Player'] = (
    df_fantasyPros['Player']
    .astype(str)
    .str.replace(r'[\u2013\u2014–—]', '-', regex=True)  # normalize dashes
    .str.replace(r'\s+', ' ', regex=True)               # clean extra spaces
    .str.strip()
)

# 2) Remove *trailing* status flags like OUT, DTD, GTD, O, NA, IR, etc.
#    Examples handled:
#    "Name (TEAM - POS) OUT", "Name (TEAM - POS) - DTD"
status_tokens = r'(?:Q|GTD|DTD|O|OUT|INJ|IL|NA|IR|SUS|MIN|REST|DNP|DAY-TO-DAY)'
df_fantasyPros['Player'] = df_fantasyPros['Player'].str.replace(
    rf'\)\s*(?:-?\s*)?{status_tokens}\s*$', ')', regex=True
)

# 3) Extract PLAYER, TEAM, POS from "Name (TEAM - POS...)" — allow commas/slashes in POS
#    Note: we purposely do NOT anchor the end ($) so extra harmless suffixes won't break it.
parsed = df_fantasyPros['Player'].str.extract(
    r'^(?P<PLAYER>.+?) \((?P<TEAM>[A-Z]{2,4})\s*-\s*(?P<POS>[A-Z0-9/, ]+)\)'
)

# 4) Optional POS tidy (remove spaces like "PG, SG" -> "PG,SG")
parsed['POS'] = parsed['POS'].str.replace(' ', '', regex=False)

# 5) Assign back
df_fantasyPros[['PLAYER', 'POS', 'TEAM']] = parsed[['PLAYER', 'POS', 'TEAM']]

# 6) (Optional) Quick audit to see if anything still failed to parse
unmatched = df_fantasyPros[df_fantasyPros['PLAYER'].isna()].copy()
if not unmatched.empty:
    print("Unmatched rows after parse (showing up to 15):")
    display(unmatched[['Player']].head(15))

# Optional: drop original column
# df_fantasyPros = df_fantasyPros.drop(columns=['Player'])

Unmatched rows after parse (showing up to 15):


Unnamed: 0,Player
280,Featured Tools
281,
282,NBA
283,Consensus Rankings
284,Up-to-date rankings from all the top fantasy e...
285,
286,Consensus Rankings
287,NBA
288,Mock Draft Simulator
289,Practice for your draft with the best mock dra...


In [5]:
df_fantasyPros = df_fantasyPros.drop(columns=['Player'])

In [6]:
df_fantasyPros = df_fantasyPros.rename(columns={'MIN': 'MPG'})

In [7]:
desired_order = [
    'PLAYER', 'POS', 'TEAM', 'GP', 'MPG', 'PTS',
    'FG%', 'FT%', '3PM',
    'REB', 'AST', 'STL', 'BLK', 'TO'
]

df_fantasyPros = df_fantasyPros[desired_order]

In [8]:
numeric_cols = [
    'GP', 'MPG', 'PTS',
    'FG%', 'FT%', '3PM',
    'REB', 'AST', 'STL', 'BLK', 'TO'
]

# Convert safely — non-numeric entries become NaN
for col in numeric_cols:
    df_fantasyPros[col] = pd.to_numeric(df_fantasyPros[col], errors='coerce')

In [9]:
#df_hashTag[numeric_cols] = df_hashTag[numeric_cols].fillna(0)

In [10]:
display(df_fantasyPros.head(15))
display(df_fantasyPros.info())

Unnamed: 0,PLAYER,POS,TEAM,GP,MPG,PTS,FG%,FT%,3PM,REB,AST,STL,BLK,TO
0,Shai Gilgeous-Alexander,PG,OKC,76.0,34.9,32.4,0.52,0.893,1.8,5.3,6.3,1.8,0.9,2.5
1,Giannis Antetokounmpo,"PF,C",MIL,74.0,35.4,32.2,0.586,0.638,0.4,11.6,6.6,1.0,1.0,3.5
2,Luka Doncic,"PG,SG",LAL,74.0,36.4,29.7,0.478,0.774,3.1,8.5,8.5,1.5,0.5,3.9
3,Nikola Jokic,C,DEN,75.0,36.1,27.6,0.575,0.807,1.6,12.3,9.6,1.6,0.7,3.2
4,Anthony Edwards,"PG,SF,SG",MIN,78.0,36.8,27.3,0.45,0.822,3.6,5.9,5.1,1.3,0.7,3.2
5,Devin Booker,"PG,SG",PHO,75.0,37.0,26.5,0.469,0.886,2.3,4.1,6.9,0.9,0.3,2.8
6,Joel Embiid,C,PHI,59.0,31.3,26.3,0.5,0.873,1.1,8.4,4.1,0.9,1.2,3.1
7,Jaylen Brown,"SG,SF",BOS,75.0,35.7,26.0,0.478,0.738,2.3,6.8,4.5,1.4,0.6,3.0
8,Jalen Brunson,PG,NYK,74.0,35.6,25.7,0.484,0.828,2.3,3.1,6.9,0.9,0.2,2.4
9,Donovan Mitchell,"PG,SG",CLE,75.0,33.4,25.6,0.455,0.836,3.5,4.6,5.2,1.5,0.3,2.4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PLAYER  280 non-null    object 
 1   POS     280 non-null    object 
 2   TEAM    280 non-null    object 
 3   GP      280 non-null    float64
 4   MPG     280 non-null    float64
 5   PTS     280 non-null    float64
 6   FG%     280 non-null    float64
 7   FT%     280 non-null    float64
 8   3PM     280 non-null    float64
 9   REB     280 non-null    float64
 10  AST     280 non-null    float64
 11  STL     280 non-null    float64
 12  BLK     280 non-null    float64
 13  TO      280 non-null    float64
dtypes: float64(11), object(3)
memory usage: 39.0+ KB


None

In [11]:
# Convert to csv

df_fantasyPros.to_csv("../L1/data/fantasyPros_L1.csv")