-
Notifications
You must be signed in to change notification settings - Fork 107
/
tournaments.py
280 lines (236 loc) · 14.4 KB
/
tournaments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# # # # # # # # #
# #
# FUNCTIONS #
# #
# # # # # # # # #
from scraping import *
# Category badge image path (as it appears in the results archive markup)
# mapped to the tournament type label written to the output rows.
_TOURNEY_BADGE_TYPES = {
    '/assets/atpwt/images/tournament/badges/categorystamps_grandslam.png': 'Grand Slam',
    '/assets/atpwt/images/tournament/badges/categorystamps_finals.svg': 'ATP Finals',
    '/assets/atpwt/images/tournament/badges/categorystamps_1000.png': 'Masters 1000',
    '/assets/atpwt/images/tournament/badges/categorystamps_500.png': 'ATP 500',
    '/assets/atpwt/images/tournament/badges/categorystamps_250.png': 'ATP 250',
    '/assets/atpwt/images/tournament/badges/categorystamps_lvr.png': 'Laver Cup',
    '/assets/atpwt/images/tournament/badges/categorystamps_nextgen.svg': 'Next Gen Finals',
    '/assets/atpwt/images/tournament/badges/categorystamps_atpcup.svg': 'ATP Cup',
}

# First character of the financial commitment text -> currency code.
_CURRENCY_BY_PREFIX = {'$': 'USD', '£': 'GBP', '€': 'EUR', 'A': 'AUD'}


def _cleaned(tree, xpath):
    """Run *xpath* against *tree* and pass the result through regex_strip_array."""
    return regex_strip_array(xpath_parse(tree, xpath))


def _first_or_blank(values):
    """Return the first item of *values*, or '' when there is none."""
    return values[0] if values else ''


def _one_or_two(values):
    """Return (first, second) for a 1- or 2-item sequence, blanks otherwise."""
    if len(values) == 2:
        return values[0], values[1]
    if len(values) == 1:
        return values[0], ''
    return '', ''


def _url_segments(url, *indexes):
    """Return the '/'-separated segments of *url* at *indexes* ('' if absent)."""
    parts = url.split('/')
    return tuple(parts[i] if i < len(parts) else '' for i in indexes)


def _to_int_or_blank(text):
    """Convert *text* to int, or return '' when it is not a whole number."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return ''


def _parse_fin_commit(raw):
    """Split financial commitment text into (currency code, integer amount).

    The currency is taken from the first character ('PROBLEM' when it is not
    recognised); the amount is '' when the remaining text is not an integer.
    """
    currency = _CURRENCY_BY_PREFIX.get(raw[:1], 'PROBLEM')
    digits = raw
    for symbol in (',', '$', '£', '€', 'A'):
        digits = digits.replace(symbol, '')
    return currency, _to_int_or_blank(digits)


def tournaments(year):
    """Scrape the ATP results archive page for one season.

    Args:
        year: The season as a string such as '2019' (an int is also
            accepted and converted to a string).

    Returns:
        A list with one 31-item list per tournament row, in this order:
        tourney_year_id, tourney_order, tourney_type, tourney_name,
        tourney_id, tourney_slug, tourney_location, tourney_date, year,
        tourney_month, tourney_day, tourney_singles_draw,
        tourney_doubles_draw, tourney_conditions, tourney_surface,
        tourney_fin_commit_raw, currency, tourney_fin_commit,
        tourney_url_suffix, singles winner (name, url, slug, id),
        doubles winner 1 (name, url, slug, id),
        doubles winner 2 (name, url, slug, id).
        Values that cannot be found or parsed are stored as ''.

    Side effects:
        Fetches the archive page (and, for rows with no listed singles
        winner, the tournament's own page) via html_parse_tree, and prints
        one progress line with the year and the number of tournaments.

    Note:
        html_parse_tree, xpath_parse and regex_strip_array come from the
        project's scraping module; xpath_parse and regex_strip_array are
        assumed to return lists -- TODO confirm against scraping.py.
    """
    # Setup
    year = str(year)
    year_url = "http://www.atptour.com/en/scores/results-archive?year=" + year
    url_prefix = "http://www.atptour.com"

    # HTML tree
    year_tree = html_parse_tree(year_url)

    # Count tournaments via their titles: <span> first, then <a> as fallback
    titles = _cleaned(year_tree, "//span[contains(@class, 'tourney-title')]/text()")
    if not titles:
        titles = _cleaned(year_tree, "//a[contains(@class, 'tourney-title')]/text()")
    tourney_count = len(titles)

    # Iterate through each row in the tournaments table
    output = []
    for tourney_order in range(1, tourney_count + 1):
        # One selector for the current row, shared by every query below so
        # that all fields are read from the same <tr>.
        row = "//tr[contains(@class, 'tourney-result')][" + str(tourney_order) + "]"

        # Tournament type: '' when the row has no badge, 'undefined' when the
        # badge image is not one of the known categories
        badge_src = xpath_parse(year_tree, row + "/td[2]/img[contains(@alt, 'tournament badge')]/@src")
        if badge_src:
            tourney_type = _TOURNEY_BADGE_TYPES.get(badge_src[0], 'undefined')
        else:
            tourney_type = ''

        # Tournament name, location, and start date
        info = list(_cleaned(year_tree, row + "/td[3]/span/text()"))
        if len(info) == 2:
            # Only location and date were found in <span> tags, so the name
            # is looked up in the <a> tag; a blank keeps the columns aligned
            # when it is missing there too.
            name_parts = list(_cleaned(year_tree, row + "/td[3]/a/text()"))
            info = (name_parts or ['']) + info
        # Pad so an incomplete row yields blanks instead of an IndexError
        tourney_name, tourney_location, tourney_date = (info + ['', '', ''])[:3]

        # Month and day are the 2nd and 3rd '.'-separated parts of the date
        try:
            date_parts = tourney_date.split('.')
            tourney_month = int(date_parts[1])
            tourney_day = int(date_parts[2])
        except (IndexError, ValueError):
            tourney_month = ''
            tourney_day = ''

        # Tournament singles draw
        tourney_singles_draw = _to_int_or_blank(_first_or_blank(
            _cleaned(year_tree, row + "/td[4]/div/div[contains(., 'SGL')]/a[1]/span/text()")))

        # Tournament doubles draw
        tourney_doubles_draw = _to_int_or_blank(_first_or_blank(
            _cleaned(year_tree, row + "/td[4]/div/div[contains(., 'DBL')]/a[2]/span/text()")))

        # Tournament conditions (text of the div mentioning Outdoor/Indoor)
        conditions_div = row + "/td[5]/div/div[contains(., 'Outdoor') or contains(., 'Indoor')]"
        tourney_conditions = _first_or_blank(
            _cleaned(year_tree, conditions_div + "/text()[normalize-space()]")).strip()

        # Tournament surface (the <span> inside that same div)
        tourney_surface = _first_or_blank(
            _cleaned(year_tree, conditions_div + "/span/text()[normalize-space()]")).strip()

        # Tournament total financial commitment
        fin_commit = _cleaned(year_tree, row + "/td[6]/div/div/span/text()")
        if fin_commit:
            tourney_fin_commit_raw = fin_commit[0]
            currency, tourney_fin_commit = _parse_fin_commit(tourney_fin_commit_raw)
        else:
            tourney_fin_commit_raw = ''
            tourney_fin_commit = ''
            currency = ''

        # Tournament results URL; slug and id are its 5th and 6th segments
        tourney_url_suffix = _first_or_blank(xpath_parse(year_tree, row + "/td[8]/a/@href"))
        tourney_slug, tourney_id = _url_segments(tourney_url_suffix, 4, 5)

        # Singles winner info
        winner_names = _cleaned(year_tree, row + "/td[7]/div[contains(., 'SGL:')]/a/text()")
        if winner_names:
            singles_winner_name = winner_names[0]
            singles_winner_url = _first_or_blank(
                xpath_parse(year_tree, row + "/td/div[contains(., 'SGL:')]/a/@href"))
        elif tourney_url_suffix != '':
            # Winner name missing from the archive row but a tournament URL
            # exists: look the winner up on the tournament's own page.
            tourney_tree = html_parse_tree(url_prefix + tourney_url_suffix)
            # NOTE(review): the tbody index is this tournament's position in
            # the archive table, not a position on the tournament page --
            # confirm this really selects the final.
            winner_cell = ("//table[contains(@class, 'day-table')]/tbody[" + str(tourney_order)
                           + "]/tr[*]/td[contains(@class, 'day-table-name')][1]")
            singles_winner_name = _first_or_blank(xpath_parse(tourney_tree, winner_cell + "/a/text()"))
            singles_winner_url = _first_or_blank(xpath_parse(tourney_tree, winner_cell + "/a/@href"))
        else:
            # Tournament has neither a listed winner nor a URL
            singles_winner_name = ''
            singles_winner_url = ''
        # Player slug and id are the 4th and 5th segments of the player URL
        singles_winner_player_slug, singles_winner_player_id = _url_segments(singles_winner_url, 3, 4)

        # Doubles winners info (blank unless exactly one or two entries)
        doubles_winner_1_name, doubles_winner_2_name = _one_or_two(
            _cleaned(year_tree, row + "/td[7]/div[contains(., 'DBL:')]/a/text()"))
        doubles_winner_1_url, doubles_winner_2_url = _one_or_two(
            xpath_parse(year_tree, row + "/td/div[contains(., 'DBL:')]/a/@href"))
        doubles_winner_1_player_slug, doubles_winner_1_player_id = _url_segments(doubles_winner_1_url, 3, 4)
        doubles_winner_2_player_slug, doubles_winner_2_player_id = _url_segments(doubles_winner_2_url, 3, 4)

        # Store data
        tourney_year_id = year + '-' + tourney_id
        output.append([
            tourney_year_id, tourney_order, tourney_type, tourney_name,
            tourney_id, tourney_slug, tourney_location, tourney_date, year,
            tourney_month, tourney_day, tourney_singles_draw,
            tourney_doubles_draw, tourney_conditions, tourney_surface,
            tourney_fin_commit_raw, currency, tourney_fin_commit,
            tourney_url_suffix,
            singles_winner_name, singles_winner_url,
            singles_winner_player_slug, singles_winner_player_id,
            doubles_winner_1_name, doubles_winner_1_url,
            doubles_winner_1_player_slug, doubles_winner_1_player_id,
            doubles_winner_2_name, doubles_winner_2_url,
            doubles_winner_2_player_slug, doubles_winner_2_player_id,
        ])

    # Output progress
    print(year + ' ' + str(tourney_count))

    # Output data
    return output
# # # # # # # # # # #
# #
# MAIN ROUTINE #
# #
# # # # # # # # # # #
# Ask for the first and last season to scrape (both inclusive)
start_year = input('Enter start year: ')
end_year = input('Enter end year: ')

# Header of the progress table; tournaments() prints one line per season
print('')
print('Year Tournaments')
print('---- -----------')

# Scrape every season in the range and gather all rows in a single list
tourney_data = []
first_season = int(start_year)
last_season = int(end_year)
for season in range(first_season, last_season + 1):
    year = str(season)
    tourney_data.extend(tournaments(year))

# Write the collected rows to a CSV named after the requested range
filename = 'tournaments_{}-{}.csv'.format(start_year, end_year)
array2csv(tourney_data, filename)