In [7]:
from pymongo import MongoClient
import pandas as pd
from bs4 import BeautifulSoup

In [12]:
# Connect with the driver's default settings and select the 'nfl' database.
client = MongoClient()
db = client['nfl']

# Handles to the two raw collections read by the cleaning pipeline:
# 'season_raw' (season table pages) and 'player_raw' (individual player pages).
season_collection, player_collection = db['season_raw'], db['player_raw']

Using what I learned from inspecting the pages and from the scrape, I am going to build a pipeline that cleans the raw data and produces clean records. I could write these records directly to a DataFrame or to a new Mongo collection; below they go into a new Mongo collection.

In [14]:
clean_player = db['clean_player_rm_1']

# Name the parser explicitly. With no argument BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the same notebook
# could parse pages differently on another machine.
HTML_PARSER = 'html.parser'

# player_url -> (height, weight), or None when no player page is stored.
# A player can appear in more than one season table, so each player page is
# fetched and parsed at most once per run.
player_body_cache = {}


def _span_text(soup, itemprop):
    """Return the text of the first <span itemprop=...> in `soup`, or None if absent."""
    tag = soup.find('span', {'itemprop': itemprop})
    return None if tag is None else tag.text


def _player_height_weight(url_player):
    """Return (height, weight) scraped from the stored page for `url_player`.

    Returns None when `player_collection` has no record for that url.
    Either tuple element is None when the matching span is missing from the page.
    """
    if url_player not in player_body_cache:
        sub_rec = player_collection.find_one({'player_url': url_player}, {'_id': 0})
        if sub_rec is None:
            player_body_cache[url_player] = None
        else:
            sub_soup = BeautifulSoup(sub_rec['html'], HTML_PARSER)
            player_body_cache[url_player] = (
                _span_text(sub_soup, 'height'),
                _span_text(sub_soup, 'weight'),
            )
    return player_body_cache[url_player]


for season_rec in season_collection.find({}, {'_id': 0}):
    year = season_rec['year']
    position = season_rec['position']

    soup = BeautifulSoup(season_rec['html'], HTML_PARSER)

    # The stat rows are read from the first <tbody>; a page without one has
    # nothing to extract, so move on instead of raising IndexError.
    table = soup.find('tbody')
    if table is None:
        continue

    for row in table.find_all('tr'):

        # Make sure it is not a blank row (a row with no <td> at all).
        col_one = row.find('td')
        if col_one is None:
            continue

        # Get the player url extension; it is the key used to find the
        # player's page in Mongo. Rows without a link cannot be matched.
        player = col_one.find('a')
        url_player = player.get('href') if player is not None else None
        if url_player is None:
            continue

        # This should not happen, but if the player page is missing then skip.
        body = _player_height_weight(url_player)
        if body is None:
            continue
        height, weight = body

        data = {'position': position, 'year': year, 'player_url': url_player}

        # Get all row data. Each cell carries a 'data-stat' attribute that we
        # can use to name the stat.
        for col in row.find_all('td'):
            field = col.get('data-stat')
            if field is None:
                # A None key cannot be stored as a Mongo field name.
                continue
            data[field] = col.text

        # Height and weight come from the player's own page.
        # NOTE(review): the key is spelled 'heigh' (sic). It is left unchanged
        # because records already written and the DataFrame built from them use
        # that name; rename it together with the stored data if desired.
        data['heigh'] = height
        data['weight'] = weight

        # Put the clean record in the new collection, skipping exact duplicates
        # so that re-running the cell does not insert the same record twice.
        if clean_player.count_documents(data) == 0:
            clean_player.insert_one(data)
        

In [15]:
# Load every cleaned record (without Mongo's '_id' field) into a DataFrame.
df = pd.DataFrame(
    [record for record in clean_player.find(filter={}, projection={'_id': 0})]
)

In [16]:
# Summarise the frame: number of rows plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1277 entries, 0 to 1276
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   position         1277 non-null   object
 1   year             1277 non-null   int64 
 2   player_url       1277 non-null   object
 3   player           1277 non-null   object
 4   team             1277 non-null   object
 5   age              1277 non-null   object
 6   pos              1277 non-null   object
 7   g                1277 non-null   object
 8   gs               1277 non-null   object
 9   targets          1277 non-null   object
 10  rec              1277 non-null   object
 11  catch_pct        1277 non-null   object
 12  rec_yds          1277 non-null   object
 13  rec_yds_per_rec  1277 non-null   object
 14  rec_td           1277 non-null   object
 15  rec_first_down   1277 non-null   object
 16  rec_long         1277 non-null   object
 17  rec_yds_per_tgt  1277 non-null   

In [17]:
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,position,year,player_url,player,team,age,pos,g,gs,targets,...,rec_yds_per_rec,rec_td,rec_first_down,rec_long,rec_yds_per_tgt,rec_per_g,rec_yds_per_g,fumbles,heigh,weight
0,receiving,2005,/players/F/FitzLa00.htm,Larry Fitzgerald*,ARI,22,WR,16,16,165,...,13.7,10,68,47,8.5,6.4,88.1,0,6-3,218lb
1,receiving,2005,/players/S/SmitSt01.htm,Steve Smith*+,CAR,26,WR,16,16,150,...,15.2,12,72,80,10.4,6.4,97.7,2,5-9,195lb
2,receiving,2005,/players/B/BoldAn00.htm,Anquan Boldin,ARI,25,WR,14,14,171,...,13.7,7,69,54,8.2,7.3,100.1,2,6-1,220lb
3,receiving,2005,/players/H/HoltTo00.htm,Torry Holt*,STL,29,WR,14,14,163,...,13.0,9,63,44,8.2,7.3,95.1,2,6-0,200lb
4,receiving,2005,/players/J/JohnCh01.htm,Chad Johnson *+,CIN,27,WR,16,16,155,...,14.8,9,74,70,9.2,6.1,89.5,1,6-1,188lb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1272,receiving,2007,/players/T/TaylTr01.htm,Travis Taylor,2TM,29,,2,0,3,...,4.0,0,0,4,1.3,0.5,2.0,0,6-1,200lb
1273,receiving,2007,/players/T/ThomDa02.htm,David Thomas,NWE,24,,2,0,1,...,9.0,0,1,9,9.0,0.5,4.5,0,6-3,248lb
1274,receiving,2007,/players/T/ToefLa00.htm,LaBrandon Toefield,JAX,27,,2,1,1,...,4.0,0,0,4,4.0,0.5,2.0,0,5-11,232lb
1275,receiving,2007,/players/T/TumaJe00.htm,Jerame Tuman,PIT,31,,6,1,2,...,9.0,1,1,9,4.5,0.2,1.5,0,6-4,253lb
