-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
110 lines (100 loc) · 4.2 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import requests
from bs4 import BeautifulSoup # Python package for parsing HTML and XML documents.
# It creates a parse tree for parsed pages that
# can be used to extract data from HTML, which is
#useful for web scraping
import re #requests.get gets the url information
import json #loads json APIs
class TeamSite:
    """Scrape one school's season-by-season history from sports-reference.com.

    The school index page is fetched once in ``__init__``; every property
    re-parses the stored soup, so no further network traffic happens after
    construction.

    Args:
        college: School slug as used in sports-reference URLs (e.g. 'duke').
        year1: Start of the season window (exclusive for `years`).
        year2: End of the season window (inclusive).
    """
    def __init__(self, college, year1, year2):
        self.name = college
        self.year1 = year1
        self.year2 = year2
        r = requests.get(
            'https://www.sports-reference.com/cbb/schools/{college}/'.format(college=college))
        # bs4 parses the downloaded HTML once; all properties read self.soup.
        self.soup = BeautifulSoup(r.content, 'html.parser')
        # Number of seasons in the window. The stats table lists seasons
        # newest-first, so slicing the first `index` cells keeps the window.
        self.index = self.year2 - self.year1

    def _stat_column(self, data_stat, css_class='right'):
        """Return the text of the newest `self.index` cells of one stat column.

        Args:
            data_stat: Value of the cell's ``data-stat`` attribute.
            css_class: Value of the cell's ``class`` attribute ('right' for
                numeric columns, 'left' for text columns).
        """
        cells = self.soup.findAll('td', {'class': css_class, 'data-stat': data_stat})
        return [cell.text for cell in cells][:self.index]

    @property
    def years(self):
        """Season labels (e.g. '2018-19') in the window, newest first.

        Seasons with no page on the site are silently skipped. The list is
        returned newest-first to line up with the stat properties, which read
        the table top (newest) to bottom (oldest).
        """
        years = []
        for season in range(self.year1 + 1, self.year2 + 1):
            href = '/cbb/schools/{name}/{year}.html'.format(name=self.name, year=season)
            links = self.soup.findAll('a', {'href': href})
            if links:  # season exists on the site
                years.append(links[0].text)
        years.sort()
        years.reverse()
        return years

    @property
    def winloss(self):
        """Win-loss percentage strings for the window, newest first."""
        return self._stat_column('win_loss_pct')

    @property
    def team_points(self):
        """Points-per-game strings for the window, newest first."""
        return self._stat_column('pts_per_g')

    @property
    def opp_points(self):
        """Opponent points-per-game strings for the window, newest first."""
        return self._stat_column('opp_pts_per_g')

    @property
    def coaches_names(self):
        """Coach names for the window with the trailing '(W-L)' record removed."""
        names = []
        for coach in self._stat_column('coaches', css_class='left'):
            # Cells look like "Coach Name (W-L)"; cut at the last '(' and the
            # space before it. Cells without a record are kept unchanged
            # (the original rindex() would have raised ValueError here).
            paren = coach.rfind('(')
            names.append(coach if paren <= 0 else coach[:paren - 1])
        return names
class CoachSite:
    """Scrape NCAA men's basketball coach salaries from USA Today.

    The salary table page is fetched once in ``__init__``; ``pages`` makes one
    additional request per coach id, so ``salaries`` is network-heavy.
    """
    def __init__(self):
        r = requests.get('http://sports.usatoday.com/ncaa/salaries/mens-basketball/coach/#')
        self.soup = BeautifulSoup(r.content, 'html.parser')

    @property
    def tags(self):
        """Coach ids harvested from the table cells' ``data-coach`` attributes."""
        coach_ids = []
        for cell in self.soup.findAll('td', {"class": ''}):
            try:
                coach_ids.append(cell['data-coach'])
            except KeyError:
                # Cell belongs to a non-coach column; skip it.
                continue
        return coach_ids

    @property
    def pages(self):
        """Fetch and decode the per-coach salary JSON for every coach id.

        NOTE: one HTTP request per coach; each access re-downloads everything.
        """
        coaches_pages = []
        for coach_id in self.tags:
            r = requests.get(
                'http://sports.usatoday.com/ajaxservice/ncaa/salaries__coach__' + coach_id)
            coaches_pages.append(json.loads(r.content))
        return coaches_pages

    @property
    def salaries(self):
        """Flatten the per-coach pages into salary rows.

        Returns:
            A 2-tuple of:
            - list of (coach_name, year, salary, school) tuples, one per
              season row, in page order;
            - deduplicated list of school names (set order, i.e. arbitrary).
        """
        rows = []
        schools = []
        for page in self.pages:  # read once: each access re-fetches the network
            coach = page['profile']['coach_name']
            for entry in page['rows']:
                year = entry['year']['value']
                salary = entry['total_pay']['value']
                school = entry['school_name']['value']
                schools.append(school)
                rows.append((coach, year, salary, school))
        return rows, list(set(schools))