In [1]:
import io
import os
import warnings

import pandas
import requests
from bs4 import BeautifulSoup

In [None]:
'''
The following code can be used to bulk-download upper-air sounding data
from <http://weather.uwyo.edu/upperair/sounding.html>.


The script has been adapted from the sources below.
<https://www.alaskanresearcher.dev/pysoundings/pysoundings.html>

<https://github.com/dkllrjr/pysoundings>

'''

In [2]:
"""
# Behind the Scenes

These functions comprise the tiny backbone of this little module. They can be called if desired.
"""


__docformat__ = "numpy"


from bs4 import BeautifulSoup
import requests
import pandas
import io
import warnings


def build_param_dict(stnm, year, month, day_hour):
    """
    Packs the station number and date fields of one sounding request into a query-parameter dictionary.

    Parameters
    ----------
    stnm : string
        Station identifier, e.g. '70261' for PAFA, Fairbanks Int'l Airport.
    year : string
        Year of the sounding, e.g. '2021'.
    month : string
        Month of the sounding, e.g. '01'.
    day_hour : string
        Combined day and hour, e.g. '0100' for day '01' at hour '00' UTC.

    Returns
    -------
    param_dict : dict
        Dictionary with the keys 'STNM', 'YEAR', 'MONTH', 'FROM' and 'TO', in that order.
    """

    # 'FROM' and 'TO' both receive the same day/hour value.
    field_names = ('STNM', 'YEAR', 'MONTH', 'FROM', 'TO')
    field_values = (stnm, year, month, day_hour, day_hour)

    return dict(zip(field_names, field_values))


def build_url(param_dict):
    """
    Builds the URL needed to query [University of Wyoming, College of Engineering, Department of Atmospheric Science's website](http://weather.uwyo.edu/upperair/sounding.html) to get the proper sounding data.

    Parameters
    ----------
    param_dict : dict
        A dictionary containing the station number, year, month, and day/hour of the desired date and location.
        Values are normally strings; other values (e.g. an integer year) are converted with ``str()``.

    Returns
    -------
    full_url : string
        The base query URL followed by one ``&key=value`` pair per dictionary entry, in the dictionary's iteration order.
        An empty dictionary yields the base URL unchanged.
    """

    base_url = 'http://weather.uwyo.edu/cgi-bin/sounding?TYPE=TEXT%3ALIST'

    # str() keeps non-string values (e.g. year=2021) from raising TypeError
    # during concatenation; string inputs produce exactly the same URL as before.
    query = ''.join('&' + str(key) + '=' + str(value) for key, value in param_dict.items())

    return base_url + query


def format_data(html_string):
    """
    Parses the text of the html container holding fixed-width sounding data into a [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) and pulls out the units line.

    The text is split on newlines and read by position: line index 2 supplies the
    column names, line index 3 is returned as the units string, and the data rows
    run from line index 5 up to (but excluding) the last two lines.

    Parameters
    ----------
    html_string : string
        A string of the html container that holds the atmospheric sounding data.

    Returns
    -------
    data : DataFrame
        A [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) containing the atmospheric data with a labeled header.
    units : string
        The raw units line, whitespace included.
    """

    lines = html_string.split('\n')

    column_names = lines[2].split()
    units = lines[3]

    # The last two lines are dropped; presumably the closing tag and the
    # trailing empty string of the prettified container (verify against caller).
    table_text = '\n'.join(lines[5:-2])

    data = pandas.read_fwf(io.StringIO(table_text), names=column_names)

    return data, units


def pull_data(url, timeout=60):
    """
    This function makes an http request with the given URL to retrieve the atmospheric sounding data from the [University of Wyoming, College of Engineering, Department of Atmospheric Science's website](http://weather.uwyo.edu/upperair/sounding.html). It then puts the data into a [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) and the units into a string.

    Parameters
    ----------
    url : string
        String of the url containing the information necessary to query the sounding database.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument (seconds). Defaults to 60.
        Without a timeout a stalled server would block the caller indefinitely.

    Returns
    -------
    data : DataFrame or None
        A [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) containing the atmospheric data with a labeled header, or None when the page has no ``<pre>`` element.
    units : string or None
        String of the units of the pandas DataFrame columns, or None when the page has no ``<pre>`` element.

    Warns
    -----
    UserWarning
        When the response contains no ``<pre>`` element.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors from ``requests.get`` propagate to the caller; per the
        requests documentation this includes a timeout error once ``timeout`` expires.
    """

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'lxml')

    # The first <pre> element is taken as the container of the sounding table.
    data_container = soup.pre

    # Identity check: None is a singleton, and `==` would go through the
    # element's own equality method.
    if data_container is None:
        warnings.warn('No data found for the given parameters')
        return None, None

    return format_data(data_container.prettify())

def get_data(stnm, year, month, day_hour):
    """
    Fetches one atmospheric sounding from the [University of Wyoming, College of Engineering, Department of Atmospheric Science's website](http://weather.uwyo.edu/upperair/sounding.html) by chaining the helper functions: the arguments are packed into a parameter dictionary, turned into a query URL, and the page at that URL is downloaded and parsed.

    Parameters
    ----------
    stnm : string
        Station identifier, e.g. '70261' for PAFA, Fairbanks Int'l Airport.
    year : string
        Year of the sounding, e.g. '2021'.
    month : string
        Month of the sounding, e.g. '01'.
    day_hour : string
        Combined day and hour, e.g. '0100' for day '01' at hour '00' UTC.

    Returns
    -------
    data : DataFrame
        A [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) containing the atmospheric data with a labeled header.
    units : string
        String of the units of the pandas DataFrame columns.

    Examples
    --------

    >>> stnm, year, month, day_hour = '70261', '2021', '01', '0100'
    >>> data, units = get_data(stnm, year, month, day_hour)
    >>> data
           PRES   HGHT  TEMP  DWPT  RELH  MIXR   DRCT  SKNT   THTA   THTE   THTV
    0    1000.0     93   NaN   NaN   NaN   NaN    NaN   NaN    NaN    NaN    NaN
    1     994.0    134 -23.3 -25.7  81.0  0.48   85.0   1.0  250.3  251.6  250.3
    2     992.0    149 -19.3 -20.9  87.0  0.73   80.0   1.0  254.4  256.5  254.6
    3     990.0    164 -18.9 -20.9  84.0  0.73   75.0   2.0  255.0  257.0  255.1
    4     988.0    179 -15.7 -17.6  85.0  0.98   70.0   2.0  258.3  261.1  258.5
    ..      ...    ...   ...   ...   ...   ...    ...   ...    ...    ...    ...
    153    10.0  30830 -59.7 -89.7   1.0  0.01  345.0  94.0  795.6  795.8  795.7
    154     9.9  30893 -60.1 -90.1   1.0  0.01  345.0  95.0  796.5  796.6  796.5
    155     9.6  31090 -59.8 -90.1   1.0  0.01  345.0  98.0  804.9  805.0  804.9
    156     9.1  31394 -59.3 -90.0   1.0  0.01  345.0  94.0  818.0  818.2  818.0
    157     8.0  32229 -57.9 -89.9   1.0  0.02    NaN   NaN  855.2  855.4  855.2
        [158 rows x 11 columns]
    >>> units
    '    hPa     m      C      C      %    g/kg    deg   knot     K      K      K '
    """

    # parameters -> query URL -> (data, units); the pair from pull_data is
    # returned unchanged.
    return pull_data(build_url(build_param_dict(stnm, year, month, day_hour)))

#### Change the following params according to which station, date and hour(s) to download.


stnm = '42369'   # 42369 - lucknow  # 42260 - agra  # 42182 - delhi
year = '2016'
month = '11'
day_start = 11   # first day of the month to download (inclusive)
day_end = 20     # last day of the month to download (inclusive)

hours = ["00"]  # alternatives: ["12"], or several, e.g. ["00", "12", "09", "06", "03"]

first_half_rows = []
second_half_rows = []

# NOTE: the "-00" suffix is hard-coded and does not follow `hours`.
dir_path = "UWYO/" + stnm + "-00/"

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first file is written.
os.makedirs(dir_path, exist_ok=True)

for day in range(day_start, day_end + 1):
    for hour in hours:
        # Zero-pad the day to two digits and append the hour, e.g. 5 and "00" -> "0500".
        hour_of_day = "%02d%s" % (day, hour)
        hour_of_day_data, units = get_data(stnm, year, month, hour_of_day)
        if hour_of_day_data is None:
            print("Got NO data for time:", hour_of_day)
        else:
            print("Got data for time:", hour_of_day)
            # Tag every row with the station and timestamp, then write one CSV per sounding.
            hour_of_day_data['date'] = "%s-%s-%s-%s" % (stnm, year, month, hour_of_day)
            hour_of_day_data.to_csv(dir_path+"%s-%s-%s-%s.csv" % (stnm, year, month, hour_of_day))
    



Got data for time: 1100
Got data for time: 1200
Got data for time: 1300
Got data for time: 1400
Got data for time: 1500
Got data for time: 1600
Got data for time: 1700
Got data for time: 1800
Got data for time: 1900
Got data for time: 2000


In [81]:
'''
##########################  Stations available for download: name, station number, station code  ##################################

DELHI	42182	VIDD
LUCKNOW	42369	VILK
CALCUTTA/DUM DUM	42809	VECC
PATIALA	42101	no abbreviation
GORAKHPUR (IN-AFB)	42379	VEGK
GWALIOR (IN-AFB)	42361	VIGR
JODHPUR (IN-AFB)	42339	VIJO
PATNA	42492	VEPT
RANCHI	42701	VERC
BHUBANESWAR	42971	VEBS

'''

'\nDELHI\t42182\tVIDD\nLUCKNOW\t42369\tVILK\nCALCUTTA/DUM DUM\t42809\tVECC\nPATIALA\t42101\tno abbreveation\nGORAKHPUR (IN-AFB)\t42379\tVEGK\nGWALIOR (IN-AFB)\t42361\tVIGR\nJODHPUR (IN-AFB)\t42339\tVIJO\nPATNA\t42492\tVEPT\nRANCHI\t42701\tVERC\nBHUBANESWAR\t42971\tVEBS\n\n'

In [6]:
# for day in range(day_start, day_end + 1):
#     for hour in hours:
#         second_half_day_hour = "%s%s" % (day, hour) if day > 9 else "0%s%s" % (day, hour)
#         second_half_data, units = get_data(stnm, year, month, second_half_day_hour)
#         if second_half_data is None:
#             print("Got NO data for time:", second_half_day_hour)
#         else:
#             print("Got data for time:", second_half_day_hour)
#             second_half_data['date'] = "%s-%s-%s-%s" % (stnm, year, month, second_half_day_hour)
#             second_half_data.to_csv(dir_path+"%s-%s-%s-%s.csv" % (stnm, year, month, second_half_day_hour))
#     

'UWYO/4218212/'