In [192]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

### Per Game Stats

In [193]:
def basic_crawler(soup, year):
    
    table_index = 4 if year >= 2015 else 2
    game_header = [th.getText() for th in soup.findAll('table')[4].findAll('th', class_ = 'poptip center')]
    game_table = soup.findAll('table')[table_index]
    game_row = game_table.findAll('tr')[1:]
    game_data = [[i.getText() for i in game_row[j].findAll('td')] for j in range(len(game_row))]
    per_game = pd.DataFrame(game_data, columns = game_header)
    per_game.Team = per_game.Team.replace('Los Angeles Lakers*', 'LA Lakers temp')
    per_game.Team = per_game.Team.replace('Los Angeles Lakers', 'LA Lakers temp')
    per_game.Team = per_game.Team.replace('Los Angeles Clippers*', 'LA Clippers temp')
    per_game.Team = per_game.Team.replace('Los Angeles Clippers', 'LA Clippers temp')
    game_citys = [x[:-1] for x in per_game.Team.str.strip('*').str.split(' ')]
    for index, city in enumerate(game_citys):
            game_citys[index] = city[0] + ' ' + city[1] if len(city) > 1 else city[0]
    per_game.Team = game_citys
    per_game.Team = per_game.Team.replace('Portland Trail', 'Portland')
    game_select = ['Team', 'FG', 'FG%', '3P', '3P%', '2P', '2P%', 'FT', 'FT%', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
    game_done = per_game[game_select]
    for i in game_select[1:]:
        game_done[i] = pd.to_numeric(game_done[i])
    game_done = game_done.sort_values(by = 'Team').reset_index(drop = True)
    game_done['Year'] = year
    
    return game_done

### Advanced Stats

In [194]:
def advanced_crawler(soup, year):
    
    table_index = 10 if year >= 2015 else 8
    ad_table = soup.findAll('table')[table_index]
    ad_header = [i.getText() for i in ad_table.findAll('tr')[1].findAll('th', class_ = 
                                                                        ['poptip center', 'poptip sort_default_asc center'])]
    ad_row = ad_table.findAll('tr')[1:]
    ad_data = [[i.getText() for i in ad_row[j].findAll('td')] for j in range(1, len(ad_row))]
    advanced = pd.DataFrame(ad_data, columns = ad_header)
    advanced.Team = advanced.Team.replace('Los Angeles Lakers*', 'LA Lakers temp')
    advanced.Team = advanced.Team.replace('Los Angeles Lakers', 'LA Lakers temp')
    advanced.Team = advanced.Team.replace('Los Angeles Clippers*', 'LA Clippers temp')
    advanced.Team = advanced.Team.replace('Los Angeles Clippers', 'LA Clippers temp')
    advanced.Team = advanced.Team.replace('Portland Trail', 'Portland')
    ad_citys = [x[:-1] for x in advanced.Team.str.strip('*').str.split(' ')]
    for index, city in enumerate(ad_citys):
            ad_citys[index] = city[0] + ' ' + city[1] if len(city) > 1 else city[0]
    advanced.Team = ad_citys
    advanced.Team = advanced.Team.replace('Portland Trail', 'Portland')
    ad_select = ['Team', 'Age', 'PW', 'PL', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'TS%', 'W']
    advanced_done = advanced[ad_select]
    for i in ad_select[1:]:
        advanced_done[i] = pd.to_numeric(advanced_done[i])
    for x in ['W', 'NRtg']:
        advanced_done[x][30] = advanced_done[x].mean()
    advanced_done = advanced_done.sort_values(by = 'Team').reset_index(drop = True)
    advanced_done['Year'] = year
    
    return advanced_done

### Salary

In [202]:
def salary_crawler(soup, year):
    
    header = [th.getText() for th in soup.findAll('thead')[0].findAll('td')]
    row = soup.findAll('tr')
    data = [[td.getText()[10: -8] for td in row[i].findAll('td')] for i in range(1, len(row))]
    salary = pd.DataFrame(data, columns = header).iloc[:, 1:]
    salary.Team = salary.Team.str.strip()
    salary[f'{year}/{year-1999}(*)'] = salary[f'{year}/{year-1999}(*)'].str.strip('$')
    salary = salary[['Team', f'{year}/{year-1999}(*)']]
    cost = [int(x[0] + x[1] + x[2]) for x in salary[f'{year}/{year-1999}(*)'].str.split(',')]
    salary['Salary'] = cost
    salary.loc[30] = ['League', 'temp', salary.Salary.mean()]
    salary['Salary'] = salary.Salary.astype(int)
    salary_done = salary.drop(columns = f'{year}/{year-1999}(*)')
    salary_done = salary_done.sort_values(by = 'Team').reset_index(drop = True)
    salary_done['Year'] = year

    return salary_done

### All data

In [203]:
##### 2012-13 to 2018-19 (without 2015-16) #####
target_year = list(range(2012, 2015)) + list(range(2016, 2019))

basic_df = pd.DataFrame()
advanced_df = pd.DataFrame()
salary_df = pd.DataFrame()
for i in target_year:
    url_1 = f'https://www.basketball-reference.com/leagues/NBA_{i+1}.html'
    html_1 = urlopen(url_1)
    soup_1 = BeautifulSoup(html_1)
    url_2 = f'https://hoopshype.com/salaries/{i}-{i+1}/'
    html_2 = urlopen(url_2)
    soup_2 = BeautifulSoup(html_2)
    temp_basic = basic_crawler(soup_1, i)
    temp_advanced = advanced_crawler(soup_1, i)
    temp_salary = salary_crawler(soup_2, i)
    basic_df = pd.concat([basic_df, temp_basic], axis = 0).reset_index(drop = True)
    advanced_df = pd.concat([advanced_df, temp_advanced], axis = 0).reset_index(drop = True)
    salary_df = pd.concat([salary_df, temp_salary], axis = 0).reset_index(drop = True)
    print(f'Year {i}: {len(basic_df)} basic, {len(advanced_df)} advanced ,{len(salary_df)} salary.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  advanced_done[x][30] = advanced_done[x].mean()


Year 2012: 31 basic, 31 advanced ,31 salary.
Year 2013: 62 basic, 62 advanced ,62 salary.
Year 2014: 93 basic, 93 advanced ,93 salary.
Year 2016: 124 basic, 124 advanced ,124 salary.
Year 2017: 155 basic, 155 advanced ,155 salary.
Year 2018: 186 basic, 186 advanced ,186 salary.


In [204]:
temp_df = salary_df.merge(basic_df, on = ['Team', 'Year'], how = 'inner')
all_data = temp_df.merge(advanced_df, on = ['Team', 'Year'], how = 'inner')
west_list = ['Utah', 'Phoenix', 'Denver', 'LA Clippers', 'Dallas', 'Portland', 'LA Lakers', 'Memphis', 'Golden State',
             'San Antonio', 'New Orleans', 'Sacramento', 'Minnesota', 'Oklahoma City', 'Houston']
conference = []
for i in all_data.Team:
    if i in west_list:
        conference.append('West')
    elif i == 'League':
        conference.append('Average')
    else:
        conference.append('East')
all_data['Conference'] = conference

In [206]:
all_data.to_csv('NBA_done.csv')