This notebook webscraps data from https://www.timeanddate.com/ to extract temperature information at Austin, Texas during the period of 2015-2016

In [3]:
## Uncomment to download the packages
# !pip install selenium

import requests, re, typing
from bs4 import BeautifulSoup as soup
import contextlib
from selenium import webdriver
import pandas as pd
import numpy as np

In [4]:
def _remove(d):
    return list(filter(None, [re.sub('\xa0', '', b) for b in d]))

def get_weather_data(url, by_url = True):
    d = soup(requests.get(url).text if by_url else url, 'html.parser')
    Data = []
    table = d.find('table', attrs={'id':'wt-his'})
    for tr in table.find('tbody').find_all('tr'):
        dict = {}
        dict['time'] = tr.find('th').text.strip()
        all_td = tr.find_all('td')
        dict['temp'] = all_td[1].text
        dict['weather'] = all_td[2].text
        dict['wind'] = all_td[3].text
        arrow = all_td[4].text


        dict['humidity'] = all_td[5].text
        dict['barometer'] = all_td[6].text
        dict['visibility'] = all_td[7].text

        Data.append(dict)
    return Data


# get_weather_data("https://www.timeanddate.com/weather/usa/dayton/historic?month=2&year=2016")

In [5]:
d = webdriver.Chrome(r'.\chromedriver_win32\chromedriver.exe')
duration = [(2015,10),(2015,11),(2015,12),(2016,1),(2016,2),(2016,3),(2016,4),(2016,5),(2016,6),(2016,7),(2016,8),(2016,9)]
_d = {}
for date in duration:
    d.get('https://www.timeanddate.com/weather/usa/austin/historic?month={}&year={}'.format(date[1],date[0]))
    for i in d.find_element_by_id('wt-his-select').find_elements_by_tag_name('option'):
    #     i.click()
        weather = get_weather_data(d.page_source, False)
        _d[i.text] = weather
    print("Data is extracted for {}".format(date))

Data is extracted for (2015, 10)
Data is extracted for (2015, 11)
Data is extracted for (2015, 12)
Data is extracted for (2016, 1)
Data is extracted for (2016, 2)
Data is extracted for (2016, 3)
Data is extracted for (2016, 4)
Data is extracted for (2016, 5)
Data is extracted for (2016, 6)
Data is extracted for (2016, 7)
Data is extracted for (2016, 8)
Data is extracted for (2016, 9)


In [6]:
month_list = ['January','February','March','April','May','June','July','August','September','October','November','December']
df_temperature = pd.DataFrame(np.zeros((2,5)).astype(int),columns=['Year','Month','Day','Hour','Temperature'])

temp_list = []
n = 0

for date, data in _d.items():
    day = int(date.split(" ")[0])
    month = month_list.index(date.split(" ")[1]) + 1
    year = int(date.split(" ")[2])
#     print(day)
#     print(month)
#     print(year)

    for entry in data:
        for label,value in entry.items():
            if label == 'time':
                value = value.split(":")[0]
                hour = int(value)
#                 print("time: {}".format(value))
            elif label == 'temp':
                value = re.sub('°C', '', value)
                temperature = int(value)
#                 print("temp: {}".format(value))
                temp_list.append(year)
                temp_list.append(month)
                temp_list.append(day)
                temp_list.append(hour)
                temp_list.append(temperature)
                df_temperature.loc[n] = temp_list
                n=n+1
                temp_list.clear()


df_temperature

Unnamed: 0,Year,Month,Day,Hour,Temperature
0,2015,10,1,0,26
1,2015,10,1,1,22
2,2015,10,1,2,22
3,2015,10,1,3,23
4,2015,10,1,4,23
5,2015,10,1,5,21
6,2015,10,1,6,23
7,2015,10,1,7,23
8,2015,10,1,8,24
9,2015,10,1,9,26


## Modification of data

In the webscrape data, there are some missing value. To address this, we would be doing some interpolation to fill in the missing data

In [9]:
df_temperature = df_temperature.groupby(['Year','Month','Day','Hour']).mean().reset_index()
df_temperature.shape

(8693, 5)

In [12]:
# Creating a templete to store the data
temp_list = []

for year in [2015,2016]:
    if year == 2015:
        for month in [10,11,12]:
            if month == 10:
                for day in range(1,32):
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])   
            elif month == 11:
                for day in range(1,31):
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])  
            else:
                for day in range(1,32):
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])  
    else:
        for month in [1,2,3,4,5,6,7,8,9]:
            if (month == 4) | (month == 6) | (month == 9):
                for day in range(1,31):
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])  
            elif month == 2:
                # Febraruy 2019 is a leap year,hence there are 29 days
                for day in range(1,30): 
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])  
            else:
                for day in range(1,32):
                    for hour in range(0,24):
                        temp_list.append([year,month,day,hour])  

temp_list = np.asarray(temp_list)
templete_df = pd.DataFrame(temp_list,columns=['Year','Month','Day','Hour'])
templete_df['Temperature'] = 0
templete_df.head()

Unnamed: 0,Year,Month,Day,Hour,Temperature
0,2015,10,1,0,0
1,2015,10,1,1,0
2,2015,10,1,2,0
3,2015,10,1,3,0
4,2015,10,1,4,0


In [24]:
def replace_0_with_NaN(x):
    temp = x[4]
    if temp == 0:
        return np.nan
    else:
        return temp
    
frames = [df_temperature, templete_df]
df_temperature = pd.concat(frames).groupby(['Year','Month','Day','Hour']).sum().reset_index()

df_temperature['Temperature'] = df_temperature.apply(replace_0_with_NaN,axis=1)
df_temperature['Temperature'] = df_temperature['Temperature'].interpolate(limit_direction='both').astype(int)
print(df_temperature.shape)
print(df_temperature['Temperature'].unique())
df_temperature

(8784, 5)
[26 22 23 21 24 28 30 32 33 34 31 29 27 25 18 17 16 15 19 10  9  8 11 12
 14 13  7 20 35 36 37]


Unnamed: 0,Year,Month,Day,Hour,Temperature
0,2015,10,1,0,26
1,2015,10,1,1,22
2,2015,10,1,2,22
3,2015,10,1,3,23
4,2015,10,1,4,23
5,2015,10,1,5,21
6,2015,10,1,6,23
7,2015,10,1,7,23
8,2015,10,1,8,24
9,2015,10,1,9,26


In [25]:
df_temperature[(df_temperature['Month'] == 11) & (df_temperature['Hour'] == 9)]

Unnamed: 0,Year,Month,Day,Hour,Temperature
753,2015,11,1,9,18
777,2015,11,2,9,18
801,2015,11,3,9,18
825,2015,11,4,9,18
849,2015,11,5,9,18
873,2015,11,6,9,18
897,2015,11,7,9,18
921,2015,11,8,9,18
945,2015,11,9,9,18
969,2015,11,10,9,18


In [26]:
df_temperature.to_csv('temperature.csv',index=False)