# This notebook will:
1. Scrape team stats from nfl.com by year and stat category
2. Overwrite tables with newer data

# Import libs

In [0]:
import pandas as pd
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Scraper

## Unique items to loop over

In [0]:
# Stat categories to scrape, grouped by team unit. Both the unit (dict key)
# and each category string are interpolated verbatim into the stats URL path
# by the scraper loop, so the spelling (including hyphens) must be preserved.
dict_items = {
    "offense": ["passing", "rushing", "receiving", "scoring", "downs"],
    "defense": [
        "passing", "rushing", "receiving", "scoring",
        "tackles", "downs", "fumbles", "interceptions",
    ],
    "special-teams": [
        "field-goals", "scoring", "kickoffs",
        "kickoff-returns", "punting", "punt-returns",
    ],
}

In [0]:
# Accumulator for the scraped results: the scraper loop stores one pandas
# DataFrame per "<unit>_<category>" key, and the save step reads them back.
df_dict: dict = {}

## Scraper

In [0]:
# Seasons to scrape: 2010 through 2024 inclusive.
SEASONS = range(2010, 2025)

# requests has no default timeout; without one a stalled connection would
# block the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _parse_stats_table(page_html, year, source):
    """Parse the first HTML table in *page_html* into a DataFrame.

    Args:
        page_html: Raw HTML text of one stats page.
        year: Season the page belongs to; stored in a new ``year`` column.
        source: URL (or other label) used only in error messages.

    Returns:
        A DataFrame with one row per ``<tr>`` of the first ``<tbody>``, the
        header cells of the first ``<thead>`` as columns, plus ``year``.

    Raises:
        ValueError: If the page has no ``<thead>`` or ``<tbody>``. Failing
            loudly is deliberate: the tables are later written in overwrite
            mode, so silently skipping a season would drop existing data.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    thead = soup.find('thead')
    tbody = soup.find('tbody')
    if thead is None or tbody is None:
        raise ValueError(f'No stats table found at {source}')

    column_headers = [th.get_text() for th in thead.find_all('th')]
    rows = [
        [cell.get_text() for cell in tr.find_all(['th', 'td'])]
        for tr in tbody.find_all('tr')
    ]

    df = pd.DataFrame(rows, columns=column_headers).replace('\n', '', regex=True)
    # Drop the trailing 50 characters of the team cell and strip all spaces.
    # NOTE(review): presumably the cell repeats the team name with padding;
    # the fixed offset is tied to the current page markup - confirm if the
    # site layout changes.
    df['Team'] = df['Team'].str[:-50].replace(' ', '', regex=True)
    df['year'] = year
    return df


def _to_numeric_where_possible(frame):
    """Return *frame* with every fully numeric column converted to a number dtype.

    Columns that cannot be converted are left unchanged. This replaces
    ``pd.to_numeric(..., errors="ignore")``, which is deprecated in pandas.
    """
    def _convert(column):
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return frame.apply(_convert)


for key, items in dict_items.items():
    for item in items:
        print(f'Chave: {key} | Item: {item}')
        # Collect per-season frames and concatenate once instead of growing a
        # DataFrame inside the loop.
        yearly_frames = []
        for year in SEASONS:
            url = f'https://www.nfl.com/stats/team-stats/{key}/{item}/{year}/reg/all'
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            # Surface HTTP errors directly rather than as a parsing failure.
            response.raise_for_status()
            yearly_frames.append(_parse_stats_table(response.text, year, url))

        # Row labels are intentionally not reset, matching the previous
        # output (each season keeps its own 0..n-1 labels).
        stat_df = pd.concat(yearly_frames)
        df_dict[f'{key}_{item}'] = _to_numeric_where_possible(stat_df)

## Save tables

In [0]:
# Write every scraped DataFrame to its own table, replacing any existing one.
for raw_key, pandas_df in df_dict.items():
    spark_df = spark.createDataFrame(pandas_df)

    # Hyphens in the key are swapped for underscores to build the table
    # identifier (a no-op for keys without hyphens).
    table_key = raw_key.replace('-', '_')

    # Normalise column names one by one: spaces -> underscores, lower-cased.
    for original_name in spark_df.columns:
        spark_df = spark_df.withColumnRenamed(
            original_name,
            original_name.replace(" ", "_").lower(),
        )

    table_name = f"workspace.nfl_data.{table_key}"
    spark_df.write.mode("overwrite").saveAsTable(table_name)

    print(f"Table '{table_name}' created successfully.")