In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
URLS = ['https://en.wikipedia.org/wiki/2019_Rugby_World_Cup_Pool_A',
        'https://en.wikipedia.org/wiki/2019_Rugby_World_Cup_Pool_B',
        'https://en.wikipedia.org/wiki/2019_Rugby_World_Cup_Pool_C',
        'https://en.wikipedia.org/wiki/2019_Rugby_World_Cup_Pool_D',
        'https://en.wikipedia.org/wiki/2019_Rugby_World_Cup_knockout_stage']
big_soup = []
for i in URLS:
    response = requests.get(i)
    soup = BeautifulSoup(response.text,'html.parser')
    div = soup.find('div', {'id':'mw-content-text'})
    summary = div.find_all('div',{'class':'vevent summary'})
    big_soup.append(summary)

flattened  = [val for sublist in big_soup for val in sublist]
data = []
for i in flattened:
    text = i.text
    data.append(text)
data = [e.replace('\n',' ').replace('\xa0', ' ').strip() for e in data]
data = " ".join(data)
data = data.replace('(a.e.t.)','').replace('-','–').replace('(1 BP)','').replace('(2 BP)','').replace('(Cancelled)','')

In [3]:
data

"20 September 201919:45 JST (UTC+08)     Japan   30–10   Russia   Try: Matsushima (3) 12' m, 39' c, 69' cLabuschagné 47' mCon: Tamura (1/3) 40'Matsuda (1/1) 71'Pen: Tamura (2/2) 44', 64'  Report  Try: Golosnitski 5' cCon: Kushnarev (1/1) 6'Pen: Kushnarev (1/1) 61'    Tokyo Stadium, ChōfuAttendance: 45,745Referee: Nigel Owens (Wales) 22 September 201916:45 JST (UTC+08)     Ireland   27–3   Scotland   Try: Ja. Ryan 6' cBest 14' mFurlong 25' cConway 56' mCon: Sexton (1/2) 8'Murray (1/2) 27'Pen: Carty (1/1) 68'  Report  Pen: Laidlaw (1/1) 21'    International Stadium Yokohama, YokohamaAttendance: 63,731Referee: Wayne Barnes (England) 24 September 201919:15 JST (UTC+09)    Russia   9–34   Samoa    Pen: Kushnarev (2/2) 19', 26'Drop: Kushnarev (1/1) 48'  Report  Try: Leiua (2) 16' m, 80' mAmosa 45' mFidow (2) 49' c, 53' cLee–Lo 63' mCon: Pisi (2/5) 50', 54'    Kumagaya Rugby Stadium, KumagayaAttendance: 22,564Referee: Romain Poite (France) 28 September 201916:15 JST (UTC+09)    Japan   19–12 

In [4]:
#datemethodregex
date_pattern = re.compile(r'\d{1,2}\s\w*\s2019')
dates = date_pattern.findall(data)
date_list = []
for i in dates:
    date_list.append(i)
    
#time
time_pattern = re.compile(r'2019(\d{2}:\d{2})')
time = time_pattern.findall(data)
time_list = []
for i in time:
    time_list.append(i)
    
#team1
team1_pattern = re.compile(r'\s{4,6}(\D*|\D*\s\D*)\s{2,3}\d{1,3}–')
team1 = team1_pattern.findall(data)
team1_list = []
for i in team1:
    team1_list.append(i)
    
#team2
team2_pattern = re.compile(r'[–]\d*\s{3,4}(\w*|\w*\s\w*)\s{3,7}\D{3,6}')
team2 = team2_pattern.findall(data)
team2_list = []
for i in team2:
    team2_list.append(i)
    
#score
score_pattern = re.compile(r'\s{3}\d*[–]\d*\s{3}')
scores = score_pattern.findall(data)
scores_list = []
for i in scores:
    scores_list.append(i)
    
#points team 1
pt1_pattern = re.compile(r'\s{3}(\d*)[–]\d*\s{3}')
pt1 = pt1_pattern.findall(data)
pt1_list = []
for i in pt1:
    pt1_list.append(i)
pt1_list = [ int(x) for x in pt1_list ]

#points team 2 
pt2_pattern = re.compile(r'\s{3}\d*[–](\d*)\s{3}')
pt2 = pt2_pattern.findall(data)
pt2_list = []
for i in pt2:
    pt2_list.append(i)
pt2_list = [ int(x) for x in pt2_list ]

#total points
total_points_list =[]
for i,j in zip(pt1_list, pt2_list):
    k = i+j
    total_points_list.append(k)
    
#attendance
attendance_pattern = re.compile(r'Attendance:\s(\d*[,]\d*)')
attendance = attendance_pattern.findall(data)
attendance_list = []
for i in attendance:
    attendance_list.append(i)
attendance_list = [ x.replace(',','') for x in attendance_list]
attendance_list = [ int(x) for x in attendance_list]

#referee
referee_pattern = re.compile(r'Referee:\s(\w*\s\w*)')
referee = referee_pattern.findall(data)
referee_list = []
for i in referee:
    referee_list.append(i)
    
#referee nationality
ref_nationality_pattern = re.compile(r'[(](\D*\s\D*|\D*)[)]')
ref_nationality = ref_nationality_pattern.findall(data)
referee_nationality_list = []
for i in ref_nationality:
    referee_nationality_list.append(i)

In [5]:
attendance_list.insert(18,0)
attendance_list.insert(19,0)
attendance_list.insert(28,0)
#games 19, 20, 29 have no attendance values, so adding them in manually.

In [6]:
dico = {'Match Date':date_list,'Kickoff Time':time_list, 'Home Team': team1_list, 'Away Team' : team2_list, 'Score': scores_list, 'Points Home Team': pt1_list, 'Points Away Team' : pt2_list, 'Total Points': total_points_list, 'Attendance': attendance_list, 'Referee': referee_list, 'Referee Nationality': referee_nationality_list}
df= pd.DataFrame.from_dict(dico, orient='index')
df= df.transpose()
df['Match Date'] = pd.to_datetime(df['Match Date'])
df['Points Home Team'] = pd.to_numeric(df['Points Home Team'])
df['Points Away Team'] = pd.to_numeric(df['Points Away Team'])
df['Total Points'] = pd.to_numeric(df['Total Points'])
df['Attendance'] = pd.to_numeric(df['Attendance'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Match Date           48 non-null     datetime64[ns]
 1   Kickoff Time         48 non-null     object        
 2   Home Team            48 non-null     object        
 3   Away Team            48 non-null     object        
 4   Score                48 non-null     object        
 5   Points Home Team     48 non-null     int64         
 6   Points Away Team     48 non-null     int64         
 7   Total Points         48 non-null     int64         
 8   Attendance           48 non-null     int64         
 9   Referee              48 non-null     object        
 10  Referee Nationality  48 non-null     object        
dtypes: datetime64[ns](1), int64(4), object(6)
memory usage: 4.2+ KB


In [7]:
df

Unnamed: 0,Match Date,Kickoff Time,Home Team,Away Team,Score,Points Home Team,Points Away Team,Total Points,Attendance,Referee,Referee Nationality
0,2019-09-20,19:45,Japan,Russia,30–10,30,10,40,45745,Nigel Owens,Wales
1,2019-09-22,16:45,Ireland,Scotland,27–3,27,3,30,63731,Wayne Barnes,England
2,2019-09-24,19:15,Russia,Samoa,9–34,9,34,43,22564,Romain Poite,France
3,2019-09-28,16:15,Japan,Ireland,19–12,19,12,31,47813,Angus Gardner,Australia
4,2019-09-30,19:15,Scotland,Samoa,34–0,34,0,34,27586,Pascal Gaüzère,France
5,2019-10-03,19:15,Ireland,Russia,35–0,35,0,35,26856,Jérôme Garcès,France
6,2019-10-05,19:30,Japan,Samoa,38–19,38,19,57,39695,Jaco Peyper,South Africa
7,2019-10-09,16:15,Scotland,Russia,61–0,61,0,61,44123,Wayne Barnes,England
8,2019-10-12,19:45,Ireland,Samoa,47–5,47,5,52,17967,Nic Berry,Australia
9,2019-10-13,19:45,Japan,Scotland,28–21,28,21,49,67666,Ben O,New Zealand


In [8]:
# df.to_csv(r'C:\Users\lacar\DQ Projects\Rugby DataVis' + '\\2019.csv', index=False)

In [9]:
attendance_corr = df.Attendance.str.get_dummies(sep=' ').corrwith(df.Attendance/df.Attendance.max())
print (attendance_corr)

AttributeError: Can only use .str accessor with string values!