# Web Scraping with requests and Beautiful Soup

#### Import Statements

In [1]:
import requests
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import pandas as pd
import time

#### Get URL

In [2]:
# Address of the single player page requested in the first demo cells.
url = "https://www.basketball-reference.com/players/s/simshe01.html"

#### Request headers (make the request look like a regular browser; they do not limit the request rate)

In [3]:
# Browser-like headers sent with the demo request.
# The template placeholders that used to be here (a made-up "Cookie" session
# id and an example.com "Referer") were removed: they carried no real value
# and were being sent verbatim to the live site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}

#### Get responses from website and return status

In [7]:
# Request the page and remember the HTTP status code.
# requests applies no timeout unless one is passed, so without it this cell
# could wait forever on a server that never answers.
response = requests.get(url, headers=headers, timeout=10)
status = response.status_code

#### Report whether the page could be scraped

In [8]:
# Parse the page only when the request succeeded; otherwise report the status.
if status == 200:
    page = response.text
    # Name the parser explicitly, as the other cells do. Without it Beautiful
    # Soup picks whichever parser happens to be installed and emits a warning,
    # so the parsed tree can differ between machines.
    soup = bs(page, "html.parser")
    print("HTML Recieved!")

else:
    print(f"Oops! Received status code {status}")

Oops! Received status code 429


#### Print HTML if obtained

In [None]:
# `soup` is only created when the request returned status 200, so guard the
# pretty-print instead of failing with a NameError after an unsuccessful fetch.
if status == 200:
    print(soup.prettify())
else:
    print(f"No HTML to display (status code {status}).")

## Create Pipeline

#### Create function for getting HTML Page

In [None]:
def get_web_page(url, timeout=10):
    """Download ``url`` and return its HTML, or ``""`` when the fetch fails.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10 so a stalled connection cannot hang the pipeline.

    Returns:
        The response body as text when the server answers with status 200.
        For any other status, or when the request itself fails (connection
        error, timeout, ...), a message is printed and ``""`` is returned.
    """
    # Browser-like headers. The placeholder "Cookie" and "Referer" values
    # that used to be here were template leftovers and are no longer sent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
    }

    # Network failures are reported the same way as bad status codes so that
    # callers only ever have to check for an empty string.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Oops! Request failed: {exc}")
        return ""

    status = response.status_code
    if status == 200:
        return response.text

    print(f"Oops! Received status code {status}")
    return ""

#### Get URLs and pipeline list ready

In [None]:
# Site root and the player index page built from it
base = "https://www.basketball-reference.com"
url_start = base + "/players/"

# Collects the player dictionaries produced by the scraping loop
pipeline_list = list()

#### Get HTML and soup

In [None]:
# Download the player index page and parse it with Python's built-in HTML parser.
# get_web_page returns "" on failure, which parses to an empty document.
html = get_web_page(url_start)
soup = bs(html, features="html.parser")

#### Find hidden tags inside HTML comments and keep only the comments that contain player URLs *--Used help from ChatGPT*

In [None]:
# Collect every HTML comment node from the parsed page
comments = soup.find_all(string=lambda node: isinstance(node, Comment))

# URLs found inside those comments
letter_links = []

# Only comments that mention a players path are re-parsed as HTML; all
# other comments are skipped
for comment in comments:
    if '/players/' not in comment:
        continue
    hidden_markup = bs(comment, "html.parser")
    for anchor in hidden_markup.select("a[href^='/players/']"):
        link = anchor.get("href")
        # Keep hrefs that are exactly 11 characters long, which is the
        # length of a path such as "/players/a/"
        if link and len(link) == 11:
            letter_links.append(base + link)

# Report how many links were kept
print(f"Found {len(letter_links)} letter pages.")

#### Gather all Player Links

In [None]:
# Profile URLs gathered from every letter index page
player_links = []

# Fetch each letter page and pull the player profile links out of it
for letter_url in letter_links:
    # A failed fetch yields "", which parses to an empty document and
    # therefore contributes no links.
    page = get_web_page(letter_url)
    soup = bs(page, "html.parser")

    # Profile links sit inside header cells marked data-stat="player"
    for a in soup.select("th[data-stat='player'] a"):
        href = a.get("href")
        if href:
            player_links.append(base + href)

    # Stay under the site's request limit. Sports Reference's bot policy
    # (at the time of writing; confirm before a long crawl) allows about 20
    # requests per minute, i.e. one every 3 seconds. The previous 1-second
    # pause exceeded that and leads to HTTP 429 responses.
    time.sleep(3.5)

# Print out total player links
print(f"Collected {len(player_links)} player profile links.")

#### Loop through 5 players and scrape each part of their stats

In [6]:
def _numeric_cells(soup, stat):
    """Return the float value of every ``<td data-stat=stat>`` cell in ``soup``.

    Blank cells and cells whose text is not a number are skipped, so the
    result may be an empty list.
    """
    values = []
    for cell in soup.find_all("td", {"data-stat": stat}):
        try:
            values.append(float(cell.text.strip()))
        except ValueError:
            # float("") and float("some text") both land here
            continue
    return values


def _parse_player_page(soup, url):
    """Build the stats dictionary for one player from a parsed profile page.

    Args:
        soup: BeautifulSoup tree of the player's profile page.
        url: Address the page was downloaded from; stored in the record.

    Returns:
        A dict with the keys ``name``, ``position``, ``weight``,
        ``minutes played``, ``points``, ``free_throws``, ``blocks``,
        ``total_games`` and ``url``. Fields that cannot be found on the page
        fall back to ``"N/A"`` (name, position) or ``None`` (weight,
        total_games) instead of reusing a previous player's value.
    """
    # Name: first <span> inside the first <h1>, when both exist
    heading = soup.find("h1")
    name_span = heading.find("span") if heading else None
    name = name_span.text.strip() if name_span else "N/A"

    # Every player starts from explicit defaults
    position = "N/A"
    weight = None

    # Position and weight are read from the page's <p> tags
    for p in soup.find_all("p"):
        text = p.get_text(" ", strip=True)

        if "Position:" in text:
            # Keep only what lies between "Position:" and the next field.
            # "\u25aa" is the small black square; the previous separator was
            # a mis-encoded (mojibake) form of that character. The escape
            # keeps it intact if the file is re-encoded.
            position = (
                text.split("Position:")[1]
                .split("Shoots:")[0]
                .split("\u25aa")[0]
                .strip()
            )

        if "lb" in text:
            spans = p.find_all("span")
            if len(spans) > 1:
                weight_text = spans[1].text.strip()
                pounds = weight_text.replace("lb", "").strip()
                # "lb" also occurs inside ordinary words, so only accept a
                # span that is purely digits once "lb" is removed.
                if "lb" in weight_text and pounds.isdecimal():
                    weight = int(pounds)

    # Per-game stat columns (one value per table cell found)
    minutes_played = _numeric_cells(soup, "mp_per_g")
    points_per_game = _numeric_cells(soup, "pts_per_g")
    # If no free-throw values were found, fall back to a single 0
    free_throws = _numeric_cells(soup, "fta_per_g") or [0]

    # Blocks: value of the last blk_per_g cell, or 0 when missing/blank/non-numeric
    blk_tags = soup.find_all("td", {"data-stat": "blk_per_g"})
    last_block_text = blk_tags[-1].text.strip() if blk_tags else ""
    try:
        blocks = float(last_block_text) if last_block_text else 0
    except ValueError:
        blocks = 0

    # Total games: the first <p> after the "Games" tooltip label, when numeric
    total_games = None
    games_label = soup.find("span", {"class": "poptip", "data-tip": "Games"})
    if games_label:
        p_tag = games_label.find_next("p")
        if p_tag and p_tag.text.strip().isdigit():
            total_games = int(p_tag.text.strip())

    return {
        "name": name,
        "position": position,
        "weight": weight,
        "minutes played": minutes_played,
        "points": points_per_game,
        "free_throws": free_throws,
        "blocks": blocks,
        "total_games": total_games,
        # Stored so the DataFrame cell's "url" column has data to show
        "url": url,
    }


# Loop through first 5 players
for url in player_links[:5]:
    html = get_web_page(url)

    # get_web_page returns "" on failure; skip the player in that case.
    # Exactly one record is appended per successfully fetched player.
    if html:
        soup = bs(html, "html.parser")
        pipeline_list.append(_parse_player_page(soup, url))

    # Stay under the site's request limit. Sports Reference's bot policy (at
    # the time of writing; confirm before a long crawl) allows about 20
    # requests per minute, i.e. one every 3 seconds.
    time.sleep(3.5)

# Review collected data
print(f"\nTotal players scraped: {len(pipeline_list)}")
pipeline_list


NameError: name 'player_links' is not defined

#### Start to move data to pandas df

In [None]:
def _mean_or_value(value):
    """Collapse one stat cell to a single number rounded to one decimal.

    A list is replaced by its mean, a scalar is converted to float, and
    ``None`` or an empty list becomes 0. An empty list previously fell through
    to ``float([])`` and raised ``TypeError``.
    """
    if value is None:
        return 0
    if isinstance(value, list):
        return round(sum(value) / len(value), 1) if value else 0
    return round(float(value), 1)


# Convert list of player dictionaries to DataFrame
df = pd.DataFrame(pipeline_list)

# Standardize column names to avoid spaces/case issues
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

# --- Compute averages for list-based or single-value stats ---
# Maps each raw stat column to the name of its averaged replacement
stat_columns = {
    "minutes_played": "avg_minutes",
    "points": "avg_points",
    "free_throws": "avg_free_throws",
    "blocks": "avg_blocks",
}
for col, new_col in stat_columns.items():
    if col in df.columns:
        df[new_col] = df[col].apply(_mean_or_value)
    else:
        # Raw column missing entirely: fill the average with 0
        df[new_col] = 0

# Drop the original list or raw columns (except total_games)
df = df.drop(columns=[col for col in stat_columns if col in df.columns])

# --- Ensure numeric columns are proper types ---
# Values that cannot be parsed become NaN and are then replaced by 0
numeric_cols = ["weight", "avg_minutes", "avg_points", "avg_free_throws", "avg_blocks"]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Convert total_games to numeric but do not modify it otherwise
if "total_games" in df.columns:
    df["total_games"] = pd.to_numeric(df["total_games"], errors='coerce').fillna(0).astype(int)

# --- Clean string columns ---
string_cols = ["name", "position", "url"]
for col in string_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# --- Optional: reorder columns ---
# Columns absent from the DataFrame are skipped
cols_order = [
    "name", "position", "weight", "avg_minutes", "avg_points",
    "avg_free_throws", "avg_blocks", "total_games", "url"
]
df = df[[c for c in cols_order if c in df.columns]]

# --- Display the cleaned DataFrame ---
df.head()
