# Parsing data from the [Scotland COVID-19 Wikipedia page](https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Scotland) and converting it to a CSV file

In [None]:
pip install lxml pandas

In [2]:
from pathlib import Path

import pandas as pd

In [3]:
def get_scotland_data(url: str = "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_United_Kingdom") -> pd.DataFrame:
    """Fetch the per-health-board table from a Wikipedia page.

    Every HTML table found at ``url`` is parsed with ``pandas.read_html``. The
    first table whose *second* column carries the top-level header label
    ``'A&A'`` is selected, its top header level is dropped and the ``'Date'``
    column becomes the index.

    Args:
        url: Page (or any other input accepted by ``pandas.read_html``) to scan
            for tables.

    Returns:
        The matching table, indexed by ``'Date'`` with single-level columns.

    Raises:
        ValueError: If none of the tables on the page matches. Previously the
            function silently returned ``None`` in that case, which only
            surfaced later as an obscure ``AttributeError``.
    """

    def post_process(dataframe: pd.DataFrame) -> pd.DataFrame:
        """Return a copy with the top header level dropped and 'Date' as index."""
        # Non-mutating equivalent of assigning columns.droplevel() and calling
        # set_index(..., inplace=True): the parsed table itself is left untouched.
        return dataframe.droplevel(0, axis=1).set_index('Date')

    for index, dataframe in enumerate(pd.read_html(url)):
        try:
            # Identify the right table by the top-level header label of its
            # second column (the first column is expected to be the date).
            if dataframe.columns[1][0] == 'A&A':
                return post_process(dataframe)
        except (IndexError, TypeError, KeyError, ValueError):
            # IndexError/TypeError: the table has fewer than two columns or
            # non-subscriptable (e.g. integer) column labels.
            # KeyError/ValueError: the header matched but the table has no
            # 'Date' column or no second header level to drop.
            # Anything else (including KeyboardInterrupt) is allowed to propagate.
            print("Issue parsing dataframe:", index)

    raise ValueError(
        "No table whose second column header is 'A&A' was found at " + str(url)
    )

In [4]:
# Download and parse the table using the function's default URL.
covid_scotland_data = get_scotland_data()
# Dump the parsed frame as plain text to inspect what was scraped.
print(covid_scotland_data)

Issue parsing dataframe: 0
Issue parsing dataframe: 1
Issue parsing dataframe: 2
            A&A   BOR   D&G   FIF    FV  GRA    GGC  HLD   LAN   LOT  SHE  \
Date                                                                        
2020/03/02  NaN   NaN   NaN   NaN   NaN  NaN    NaN  NaN   NaN   NaN  NaN   
2020/03/03  NaN   NaN   NaN   NaN   NaN  NaN    NaN  NaN   NaN   NaN  NaN   
2020/03/04    1   NaN   NaN   NaN   NaN    1    NaN  NaN   NaN   NaN  NaN   
2020/03/05  NaN   NaN   NaN   NaN   1.0    1    1.0  NaN   NaN   NaN  NaN   
2020/03/06  NaN   NaN   NaN   2.0   1.0    1    NaN  NaN   NaN   1.0  NaN   
2020/03/07  NaN   NaN   NaN   NaN   NaN    1    1.0  NaN   2.0   1.0  NaN   
2020/03/08  NaN   NaN   NaN   NaN   NaN  NaN    1.0  NaN   NaN   1.0  NaN   
2020/03/09  NaN   NaN   NaN   NaN   NaN  NaN    NaN  NaN   1.0   2.0    2   
2020/03/10  NaN   NaN   NaN   NaN   NaN    2    NaN  NaN   NaN   2.0  NaN   
2020/03/11    2   2.0   NaN   NaN   NaN  NaN    2.0  NaN   1.0   1.0  Na

In [5]:
# Keep only the first 12 columns of the scraped table. Judging by the exported CSV
# header these are the health-board columns A&A ... TAY.
# NOTE(review): positional slice - confirm the Wikipedia table layout has not changed.
health_board_data = covid_scotland_data.iloc[:,:12]

In [6]:
# Convert the scraped cells into integer counts, one row per date.
# Vectorised replacement for the previous row-by-row iterrows()/concat loop:
# same result, but it also works on an empty frame and leaves no scratch variables.
counts = (
    health_board_data.iloc[:-1]  # the last row of the table is excluded
    .astype(str)
    # Normalise typographic dashes (en dash U+2013, minus sign U+2212) to an ASCII
    # hyphen so that negative entries can be parsed as numbers.
    .apply(
        lambda cells: cells.str.replace('–', '-', regex=False)
                           .str.replace('−', '-', regex=False)
    )
    .astype(float)
    .fillna(0)  # blank cells are treated as zero
    .astype(int)
    # The previous concat/transpose round trip produced an unnamed index; keep it
    # unnamed so the header of the exported CSV does not change.
    .rename_axis(index=None)
)

In [9]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (a fresh checkout may not have `data/`).
output_path = Path('data/wikipedia_numbers.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
counts.to_csv(output_path)

In [7]:
# Display the cleaned integer counts for a visual check.
counts

Unnamed: 0,A&A,BOR,D&G,FIF,FV,GRA,GGC,HLD,LAN,LOT,SHE,TAY
2020/03/02,0,0,0,0,0,0,0,0,0,0,0,1
2020/03/03,0,0,0,0,0,0,0,0,0,0,0,0
2020/03/04,1,0,0,0,0,1,0,0,0,0,0,0
2020/03/05,0,0,0,0,1,1,1,0,0,0,0,0
2020/03/06,0,0,0,2,1,1,0,0,0,1,0,0
2020/03/07,0,0,0,0,0,1,1,0,2,1,0,0
2020/03/08,0,0,0,0,0,0,1,0,0,1,0,0
2020/03/09,0,0,0,0,0,0,0,0,1,2,2,0
2020/03/10,0,0,0,0,0,2,0,0,0,2,0,0
2020/03/11,2,2,0,0,0,0,2,0,1,1,0,1
