In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Wikipedia pages to scrape: the four pool pages plus the knockout stage.
URLS = ['https://en.wikipedia.org/wiki/2015_Rugby_World_Cup_Pool_A',
        'https://en.wikipedia.org/wiki/2015_Rugby_World_Cup_Pool_B',
        'https://en.wikipedia.org/wiki/2015_Rugby_World_Cup_Pool_C',
        'https://en.wikipedia.org/wiki/2015_Rugby_World_Cup_Pool_D',
        'https://en.wikipedia.org/wiki/2015_Rugby_World_Cup_knockout_stage']

# Seconds to wait for each page; without a timeout requests.get can block forever.
REQUEST_TIMEOUT = 30

# One entry per page: the list of <div class="vevent summary"> elements found in it.
big_soup = []
for url in URLS:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page and getting no matches.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    div = soup.find('div', {'id': 'mw-content-text'})
    if div is None:
        raise ValueError(f"No 'mw-content-text' container found in {url}")
    summary = div.find_all('div', {'class': 'vevent summary'})
    big_soup.append(summary)

# Flatten the per-page lists into a single list of summary elements.
flattened = [val for sublist in big_soup for val in sublist]

# Collapse each element to one line of text (newlines and non-breaking spaces
# become plain spaces) and join everything into one long string.
data = [element.text.replace('\n', ' ').replace('\xa0', ' ').strip()
        for element in flattened]
data = " ".join(data)

# Drop the "(a.e.t.)" marker (presumably so the parenthesis-based referee
# nationality regex does not pick it up) and turn every hyphen into an en dash,
# which is the separator the score regexes in the following cells look for.
data = data.replace('(a.e.t.)', '').replace('-', '–')

In [3]:
# Display the flattened text of all scraped summaries; the regex cells below parse this string.
data

"18 September 201520:00    England   35–11   Fiji   Try: Penalty try 13' cBrown (2) 22' m, 72' cB. Vunipola 80' cCon: Ford (1/2) 13'Farrell (2/2) 73', 80'Pen: Ford (2/3) 3', 34'Farrell (1/1) 68'  Report  Try: Nadolo 30' mPen: Nadolo (1/3) 37'Volavola (1/2) 64'    Twickenham Stadium, LondonAttendance: 80,015Referee: Jaco Peyper (South Africa) 20 September 201514:30    Wales   54–9   Uruguay   Try: Lee 15' cAllen (3) 19' c, 30' c, 40' cAmos 50' cDavies (2) 60' m, 80' cTipuric 71' cCon: Priestland (7/8) 16', 19', 30', 40', 51', 72', 80'  Report  Pen: Berchesi (3/4) 2', 9', 24'    Millennium Stadium, CardiffAttendance: 71,887Referee: Romain Poite (France) 23 September 201516:45    Australia   28–13   Fiji   Try: Pocock (2) 26' c, 31' mKepu 43' cCon: Foley (2/3) 28', 44'Pen: Foley (3/3) 10', 38', 70'  Report  Try: Volavola 60' cCon: Nadolo (1/1) 61'Pen: Nadolo (2/2) 21', 47'    Millennium Stadium, CardiffAttendance: 67,253Referee: Glen Jackson (New Zealand) 26 September 201520:00    England

In [4]:
# Match date: 1-2 digits, a word and the literal year 2015, e.g. "18 September 2015".
date_pattern = re.compile(r'\d{1,2}\s\w*\s2015')
dates = date_pattern.findall(data)
date_list = list(dates)

In [5]:
# Kickoff time: the HH:MM glued directly onto the year, e.g. "201520:00" -> "20:00".
time_pattern = re.compile(r'2015(\d{2}:\d{2})')
time = time_pattern.findall(data)
time_list = list(time)

In [6]:
# First team: the non-digit text that follows 4-6 whitespace characters and is
# itself followed by 2-3 whitespace characters, 1-3 digits and an en dash.
team1_pattern = re.compile(r'\s{4,6}(\D*|\D*\s\D*)\s{2,3}\d{1,3}–')
team1 = team1_pattern.findall(data)
team1_list = list(team1)

In [7]:
# Second team: one or two words located 3-4 whitespace characters after the
# "–<digits>" part of the score and followed by 3-7 whitespace characters.
team2_pattern = re.compile(r'[–]\d*\s{3,4}(\w*|\w*\s\w*)\s{3,7}\D{3,6}')
team2 = team2_pattern.findall(data)
team2_list = list(team2)

In [8]:
# Score as text, e.g. "35–11".
# The score must be surrounded by three whitespace characters on each side, but
# only the "<digits>–<digits>" part is captured. Without the capture group
# re.findall returns the whole match, padding included ("   35–11   "), and
# that padded string would end up in the Score column.
score_pattern = re.compile(r'\s{3}(\d*[–]\d*)\s{3}')
scores = score_pattern.findall(data)
scores_list = list(scores)

In [9]:
# Points of the first team: the digits left of the en dash, converted to int.
pt1_pattern = re.compile(r'\s{3}(\d*)[–]\d*\s{3}')
pt1 = pt1_pattern.findall(data)
pt1_list = list(map(int, pt1))

In [10]:
# Points of the second team: the digits right of the en dash, converted to int.
pt2_pattern = re.compile(r'\s{3}\d*[–](\d*)\s{3}')
pt2 = pt2_pattern.findall(data)
pt2_list = list(map(int, pt2))

In [11]:
# Total points per match: pairwise sum of both teams' points.
total_points_list = [first + second for first, second in zip(pt1_list, pt2_list)]

In [12]:
# Attendance: the comma-grouped number after "Attendance: ", e.g. "80,015" -> 80015.
attendance_pattern = re.compile(r'Attendance:\s(\d*[,]\d*)')
attendance = attendance_pattern.findall(data)
# Remove the thousands separator before converting to int.
attendance_list = [int(raw.replace(',', '')) for raw in attendance]

In [13]:
# Referee name: the two words that follow "Referee: ".
referee_pattern = re.compile(r'Referee:\s(\w*\s\w*)')
referee = referee_pattern.findall(data)
referee_list = list(referee)

In [14]:
# Referee nationality: any parenthesised text that contains no digits,
# e.g. "(South Africa)" -> "South Africa".
ref_nationality_pattern = re.compile(r'[(](\D*\s\D*|\D*)[)]')
ref_nationality = ref_nationality_pattern.findall(data)
referee_nationality_list = list(ref_nationality)

In [15]:
# Assemble one row per match from the lists extracted by the regex cells above.
dico = {
    'Match Date': date_list,
    'Kickoff Time': time_list,
    'Home Team': team1_list,
    'Away Team': team2_list,
    'Score': scores_list,
    'Points Home Team': pt1_list,
    'Points Away Team': pt2_list,
    'Total Points': total_points_list,
    'Attendance': attendance_list,
    'Referee': referee_list,
    'Referee Nationality': referee_nationality_list,
}

# Every list must describe the same matches in the same order. If one regex
# missed or over-matched, padding the short columns with missing values would
# silently shift fields between matches, so stop here instead.
column_lengths = {name: len(values) for name, values in dico.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Extracted columns have different lengths: {column_lengths}")

df = pd.DataFrame(dico)
df['Match Date'] = pd.to_datetime(df['Match Date'])
# Make sure the count columns end up with a numeric dtype.
for numeric_column in ('Points Home Team', 'Points Away Team', 'Total Points', 'Attendance'):
    df[numeric_column] = pd.to_numeric(df[numeric_column])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Match Date           48 non-null     datetime64[ns]
 1   Kickoff Time         48 non-null     object        
 2   Home Team            48 non-null     object        
 3   Away Team            48 non-null     object        
 4   Score                48 non-null     object        
 5   Points Home Team     48 non-null     int64         
 6   Points Away Team     48 non-null     int64         
 7   Total Points         48 non-null     int64         
 8   Attendance           48 non-null     int64         
 9   Referee              48 non-null     object        
 10  Referee Nationality  48 non-null     object        
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 4.2+ KB


In [16]:
# Display the assembled match table.
df

Unnamed: 0,Match Date,Kickoff Time,Home Team,Away Team,Score,Points Home Team,Points Away Team,Total Points,Attendance,Referee,Referee Nationality
0,2015-09-18,20:00,England,Fiji,35–11,35,11,46,80015,Jaco Peyper,South Africa
1,2015-09-20,14:30,Wales,Uruguay,54–9,54,9,63,71887,Romain Poite,France
2,2015-09-23,16:45,Australia,Fiji,28–13,28,13,41,67253,Glen Jackson,New Zealand
3,2015-09-26,20:00,England,Wales,25–28,25,28,53,81129,Jérôme Garcès,France
4,2015-09-27,12:00,Australia,Uruguay,65–3,65,3,68,39605,Pascal Gaüzère,France
5,2015-10-01,16:45,Wales,Fiji,23–13,23,13,36,71576,John Lacey,Ireland
6,2015-10-03,20:00,England,Australia,13–33,13,33,46,81010,Romain Poite,France
7,2015-10-06,20:00,Fiji,Uruguay,47–15,47,15,62,30048,JP Doyle,England
8,2015-10-10,16:45,Australia,Wales,15–6,15,6,21,80863,Craig Joubert,South Africa
9,2015-10-10,20:00,England,Uruguay,60–3,60,3,63,50778,Chris Pollock,New Zealand


In [17]:
# df.to_csv(r'C:\Users\lacar\DQ Projects\Rugby DataVis' + '\\2015.csv', index=False)