# Importing Required Libraries

In [73]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Starting with Web Scraping

In [74]:
# First we'll write the logic to scrape the contents of one page, then we'll iterate through multiple pages with a loop

In [75]:
# First page (page_num=1) of the target listing; the single-page logic is developed against it.
URL = 'https://www.scrapethissite.com/pages/forms/?page_num=1'

# Headers sent with the request: a desktop Chrome User-Agent string and an
# English language preference.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/126.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
}

In [76]:
# Fetch the page. requests applies no timeout by default, so an explicit one
# (in seconds) keeps a stalled connection from hanging the cell forever.
webpage = requests.get(URL, headers=headers, timeout=30)

In [77]:
webpage  # Bare expression: the cell output is the Response repr, which includes the HTTP status code

<Response [200]>

In [78]:
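# A 200 response means the request succeeded and we are good to go
# A 403 (Forbidden) or 503 (Service Unavailable) response means the request was refused or the server is unavailable - try again after some time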

In [79]:
#webpage.text

In [80]:
# Parse the raw response bytes into a navigable tree using Python's built-in HTML parser.
soup = BeautifulSoup(markup=webpage.content, features='html.parser')

In [81]:
#print(soup.prettify())

In [82]:
# Collect every <tr> element whose class attribute includes 'team'.
allteams = soup.find_all('tr', attrs={'class': 'team'})
# allteams  # uncomment to inspect the matched rows

In [83]:
def _cell_text(row, css_class):
    """Return the stripped text of the first <td> in `row` that carries `css_class`.

    Searching by the single class name (e.g. 'pct' or 'diff') finds the cell
    whether it is additionally tagged 'text-success' or 'text-danger'.
    A missing cell yields '' instead of raising, so every column list below
    receives exactly one entry per row and the lists always stay the same length.
    """
    cell = row.find('td', class_=css_class)
    return cell.text.strip() if cell is not None else ''


# One list per column; entry k of every list belongs to the k-th team row.
team_name = []
year = []
wins = []
losses = []
ot_losses = []
win_percentage = []
goals_for = []
goals_against = []
diff = []

for i in allteams:
    team_name.append(_cell_text(i, 'name'))
    year.append(_cell_text(i, 'year'))
    wins.append(_cell_text(i, 'wins'))
    losses.append(_cell_text(i, 'losses'))
    ot_losses.append(_cell_text(i, 'ot-losses'))
    # Previously appended only when a 'pct text-success' / 'pct text-danger'
    # cell was found, which could leave this list shorter than the others.
    win_percentage.append(_cell_text(i, 'pct'))
    goals_for.append(_cell_text(i, 'gf'))
    goals_against.append(_cell_text(i, 'ga'))
    # Same reasoning as for win_percentage: always append exactly one value per row.
    diff.append(_cell_text(i, 'diff'))


In [84]:
# Column name -> list of scraped values for that column.
# NOTE(review): win_percentage is collected by the scraping loop but is not part of this mapping.
d = {
    'Team_Name': team_name,
    'Year': year,
    'Wins': wins,
    'Losses': losses,
    'ot_losses': ot_losses,
    'Goals_for': goals_for,
    'Goals_Against': goals_against,
    'Difference': diff,
}

In [85]:
# Build the single-page table from the column mapping.
df = pd.DataFrame(data=d)

In [86]:
df  # Bare expression: renders the single-page DataFrame as the cell output

Unnamed: 0,Team_Name,Year,Wins,Losses,ot_losses,Goals_for,Goals_Against,Difference
0,Boston Bruins,1990,44,24,,299,264,35
1,Buffalo Sabres,1990,31,30,,292,278,14
2,Calgary Flames,1990,46,26,,344,263,81
3,Chicago Blackhawks,1990,49,23,,284,211,73
4,Detroit Red Wings,1990,34,38,,273,298,-25
5,Edmonton Oilers,1990,37,37,,272,272,0
6,Hartford Whalers,1990,31,38,,238,276,-38
7,Los Angeles Kings,1990,46,24,,340,254,86
8,Minnesota North Stars,1990,27,39,,256,266,-10
9,Montreal Canadiens,1990,39,30,,273,249,24


In [87]:
# Now that we can successfully scrape one page, we'll apply the same logic to fetch records from multiple pages

# For Scraping data from Multiple Pages

In [88]:
# ---- Configuration -------------------------------------------------------
BASE_URL = 'https://www.scrapethissite.com/pages/forms/?page_num={}'
FIRST_PAGE = 1
LAST_PAGE = 24          # inclusive; same page range (1..24) the loop has always covered
REQUEST_TIMEOUT = 30    # seconds; requests applies no timeout unless one is given
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.5',
}

# Output column name -> CSS class of the <td> that holds its value.
# The page's win-percentage cell (class 'pct') is not exported, matching the
# columns this notebook has always produced.
COLUMN_CLASSES = {
    'Team_Name': 'name',
    'Year': 'year',
    'Wins': 'wins',
    'Losses': 'losses',
    'ot_losses': 'ot-losses',
    'Goals_for': 'gf',
    'Goals_Against': 'ga',
    'Difference': 'diff',
}


def _cell_text(row, css_class):
    """Return the stripped text of the first <td> in `row` that carries `css_class`.

    Searching by the single class name (e.g. 'diff') finds the cell whether it
    is additionally tagged 'text-success' or 'text-danger'. A missing cell
    yields '' so that every record has a value for every column.
    """
    cell = row.find('td', class_=css_class)
    return cell.text.strip() if cell is not None else ''


def _scrape_page(session, page_num):
    """Download one results page and return its team rows as a DataFrame.

    Parameters:
        session: a requests.Session used to issue the GET request.
        page_num: value substituted into the page_num query parameter.

    Returns:
        A DataFrame with the columns listed in COLUMN_CLASSES, one row per
        <tr class="team"> element; all values are the stripped cell text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Without this check an error page would silently contribute zero
            rows and the combined result would be incomplete.
    """
    response = session.get(BASE_URL.format(page_num), headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, 'html.parser')
    records = [
        {column: _cell_text(row, css_class) for column, css_class in COLUMN_CLASSES.items()}
        for row in page_soup.find_all('tr', class_='team')
    ]
    # Passing `columns` keeps the column order fixed and preserves the columns
    # even when a page yields no rows.
    return pd.DataFrame(records, columns=list(COLUMN_CLASSES))


# ---- Scrape every page and combine ----------------------------------------
data_frames = []

# A single Session is shared across all page requests instead of opening a
# fresh connection per call.
with requests.Session() as session:
    for j in range(FIRST_PAGE, LAST_PAGE + 1):
        data_frames.append(_scrape_page(session, j))

final = pd.concat(data_frames, ignore_index=True)

In [89]:
final  # Bare expression: renders the combined multi-page DataFrame as the cell output

Unnamed: 0,Team_Name,Year,Wins,Losses,ot_losses,Goals_for,Goals_Against,Difference
0,Boston Bruins,1990,44,24,,299,264,35
1,Buffalo Sabres,1990,31,30,,292,278,14
2,Calgary Flames,1990,46,26,,344,263,81
3,Chicago Blackhawks,1990,49,23,,284,211,73
4,Detroit Red Wings,1990,34,38,,273,298,-25
...,...,...,...,...,...,...,...,...
577,Tampa Bay Lightning,2011,38,36,8,235,281,-46
578,Toronto Maple Leafs,2011,35,37,10,231,264,-33
579,Vancouver Canucks,2011,51,22,9,249,198,51
580,Washington Capitals,2011,42,32,8,222,230,-8


In [90]:
# Verification of Obtained data

In [91]:
# Flag rows that exactly repeat an earlier row, then print how many were flagged.
duplicate_rows = final.duplicated()
print(duplicate_rows.sum())

0


# Exporting the data to Excel format

In [92]:
# Write the combined table to an Excel workbook in the working directory,
# leaving out the DataFrame's integer index column.
final.to_excel(excel_writer='scrapeddata.xlsx', index=False)