In [1]:
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup
import re

import warnings
warnings.filterwarnings('ignore')

Tanay Station: https://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2023&MONTH=01&FROM=0400&TO=0400&STNM=98433

In [2]:
resp = requests.get(
    'http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2017&MONTH=01&FROM=0400&TO=0400&STNM=98433',
    verify = False)

resp.text

'<HTML>\n<TITLE>University of Wyoming - Radiosonde Data</TITLE>\n<BODY BGCOLOR="white">\n<H2>98433  Tanay Observations at 00Z 04 Jan 2017</H2>\n<PRE>\n-----------------------------------------------------------------------------\n   PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV\n    hPa     m      C      C      %    g/kg    deg   knot     K      K      K \n-----------------------------------------------------------------------------\n 1000.0    109                                                               \n  940.0    614   21.8   19.5     87  15.43     30     12  300.2  345.6  303.0\n  925.0    791   21.0   18.8     87  15.00     45     19  300.8  345.0  303.5\n  901.0   1018   19.4   17.8     90  14.46     55     35  301.4  344.2  304.0\n  865.0   1369   17.0   16.3     96  13.66     55     37  302.4  343.0  304.9\n  852.0   1499   16.4   15.6     95  13.25     59     37  303.1  342.6  305.5\n  850.0   1519   16.8   15.8     94  13.46     60     37  3

In [3]:
soup = BeautifulSoup(resp.text)
soup.findAll("pre")[0]

<pre>
-----------------------------------------------------------------------------
   PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV
    hPa     m      C      C      %    g/kg    deg   knot     K      K      K 
-----------------------------------------------------------------------------
 1000.0    109                                                               
  940.0    614   21.8   19.5     87  15.43     30     12  300.2  345.6  303.0
  925.0    791   21.0   18.8     87  15.00     45     19  300.8  345.0  303.5
  901.0   1018   19.4   17.8     90  14.46     55     35  301.4  344.2  304.0
  865.0   1369   17.0   16.3     96  13.66     55     37  302.4  343.0  304.9
  852.0   1499   16.4   15.6     95  13.25     59     37  303.1  342.6  305.5
  850.0   1519   16.8   15.8     94  13.46     60     37  303.7  344.0  306.2
  844.0   1580   17.0   15.2     89  13.04     61     37  304.6  343.7  306.9
  841.0   1610   18.8   13.8     73  11.93     62     36  

In [4]:
clean_soup = soup.findAll("pre")[0].contents[0].split("\n")[5:-1]
clean_soup

[' 1000.0    109                                                               ',
 '  940.0    614   21.8   19.5     87  15.43     30     12  300.2  345.6  303.0',
 '  925.0    791   21.0   18.8     87  15.00     45     19  300.8  345.0  303.5',
 '  901.0   1018   19.4   17.8     90  14.46     55     35  301.4  344.2  304.0',
 '  865.0   1369   17.0   16.3     96  13.66     55     37  302.4  343.0  304.9',
 '  852.0   1499   16.4   15.6     95  13.25     59     37  303.1  342.6  305.5',
 '  850.0   1519   16.8   15.8     94  13.46     60     37  303.7  344.0  306.2',
 '  844.0   1580   17.0   15.2     89  13.04     61     37  304.6  343.7  306.9',
 '  841.0   1610   18.8   13.8     73  11.93     62     36  306.8  342.9  309.0',
 '  829.0   1734   18.6   12.6     68  11.18     64     36  307.8  341.9  309.9',
 '  825.0   1776   19.0    7.0     46   7.67     64     35  308.7  332.4  310.1',
 '  807.0   1965   17.0   12.7     76  11.56     67     34  308.5  343.8  310.6',
 '  792.0   2124

```
 PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV
 hPa     m      C      C      %    g/kg    deg   knot     K      K      K 
```

In [5]:
REGEX_FORMAT = r"^" + \
               r"(?P<pres_hpa>.{8})" + \
               r"(?P<hght_m>.{6})" + \
               r"(?P<temp_c>.{7})" + \
               r"(?P<dwpt_c>.{7})" + \
               r"(?P<relh_pct>.{7})" + \
               r"(?P<mixr_gkg>.{7})" + \
               r"(?P<drct_deg>.{7})" + \
               r"(?P<sknt_knot>.{7})" + \
               r"(?P<tht_k>.{7})" + \
               r"(?P<the_k>.{7})" + \
               r"(?P<thv_k>.{7})" + \
               r".*$"
re.match(REGEX_FORMAT, clean_soup[1]).groupdict()

{'pres_hpa': '  940.0 ',
 'hght_m': '   614',
 'temp_c': '   21.8',
 'dwpt_c': '   19.5',
 'relh_pct': '     87',
 'mixr_gkg': '  15.43',
 'drct_deg': '     30',
 'sknt_knot': '     12',
 'tht_k': '  300.2',
 'the_k': '  345.6',
 'thv_k': '  303.0'}

In [6]:
for row in clean_soup[:10]:
    print(re.match(REGEX_FORMAT, row).groupdict())

{'pres_hpa': ' 1000.0 ', 'hght_m': '   109', 'temp_c': '       ', 'dwpt_c': '       ', 'relh_pct': '       ', 'mixr_gkg': '       ', 'drct_deg': '       ', 'sknt_knot': '       ', 'tht_k': '       ', 'the_k': '       ', 'thv_k': '       '}
{'pres_hpa': '  940.0 ', 'hght_m': '   614', 'temp_c': '   21.8', 'dwpt_c': '   19.5', 'relh_pct': '     87', 'mixr_gkg': '  15.43', 'drct_deg': '     30', 'sknt_knot': '     12', 'tht_k': '  300.2', 'the_k': '  345.6', 'thv_k': '  303.0'}
{'pres_hpa': '  925.0 ', 'hght_m': '   791', 'temp_c': '   21.0', 'dwpt_c': '   18.8', 'relh_pct': '     87', 'mixr_gkg': '  15.00', 'drct_deg': '     45', 'sknt_knot': '     19', 'tht_k': '  300.8', 'the_k': '  345.0', 'thv_k': '  303.5'}
{'pres_hpa': '  901.0 ', 'hght_m': '  1018', 'temp_c': '   19.4', 'dwpt_c': '   17.8', 'relh_pct': '     90', 'mixr_gkg': '  14.46', 'drct_deg': '     55', 'sknt_knot': '     35', 'tht_k': '  301.4', 'the_k': '  344.2', 'thv_k': '  304.0'}
{'pres_hpa': '  865.0 ', 'hght_m': '  13

In [7]:
import datetime as dt
import time

URL_FORMAT = "http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR={year}&MONTH={month}&FROM={day}00&TO={day}00&STNM=98433"

start_date = dt.datetime(2022, 9, 6)
end_date = dt.datetime(2023, 1, 4)
delta = dt.timedelta(days=1)

while start_date <= end_date:
    station_data = []

    url = URL_FORMAT.format(
        year=start_date.strftime("%Y"),
        month=start_date.strftime("%m"),
        day=start_date.strftime("%d"))

    resp = requests.get(url, verify=False)
    
    
    try:
        soup = BeautifulSoup(resp.text)
        soup.findAll("pre")[0]
        clean_soup = soup.findAll("pre")[0].contents[0].split("\n")[5:-1]

        for row in clean_soup:
            row_obj = re.match(REGEX_FORMAT, row).groupdict()
            station_data.append(row_obj)

        station_data_df = pd.DataFrame.from_dict(station_data)
        station_data_df.to_csv("data/output/Upper Air Data - UWYO/UWYO-UPPER-AIR-{year}-{month}-{day}.csv".format(
            year=start_date.strftime("%Y"),
            month=start_date.strftime("%m"),
            day=start_date.strftime("%d")))
    except:
        print("Error: " + url)

    start_date += delta
    


Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=1900&TO=1900&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2000&TO=2000&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2100&TO=2100&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2200&TO=2200&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2400&TO=2400&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2500&TO=2500&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2600&TO=2600&STNM=98433
Error: http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2022&MONTH=12&FROM=2700&TO=

In [8]:
station_data_df

Unnamed: 0,pres_hpa,hght_m,temp_c,dwpt_c,relh_pct,mixr_gkg,drct_deg,sknt_knot,tht_k,the_k,thv_k
0,940.0,614.0,21.4,19.3,88.0,15.23,40,10,299.8,344.5,302.5
1,914.0,868.0,19.2,18.8,98.0,15.18,44,19,300.0,344.6,302.7
2,883.0,1179.0,16.8,16.8,100.0,13.82,49,30,300.4,341.2,302.9
3,850.0,1519.0,15.2,15.2,100.0,12.94,55,43,302.1,340.5,304.4
4,839.0,1629.0,14.6,14.6,100.0,12.6,50,49,302.5,340.0,304.8
5,794.0,2095.0,12.0,12.0,100.0,11.22,71,42,304.6,338.3,306.6
6,772.0,2331.0,11.2,11.2,100.0,10.94,82,39,306.2,339.3,308.2
7,767.0,2385.0,10.4,9.4,94.0,9.74,85,38,305.9,335.4,307.7
8,700.0,3147.0,9.6,6.0,78.0,8.45,120,27,313.1,339.6,314.7
9,695.0,3207.0,9.5,5.7,77.0,8.35,120,27,313.6,339.9,315.2
