In [132]:
from radiosunpy.time import TimeRange 
#from radiosunpy.client import BaseClient
from radiosunpy.scrapper import Scrapper
from urllib.request import urlopen
from urllib.parse import urlparse
import re
from astropy.table import Table


from astropy.io.fits.verify import VerifyWarning
import warnings
# Silence every warning for the rest of the session. This blanket filter also
# hides deprecation notices, so remove it when debugging the notebook.
warnings.filterwarnings("ignore")
# Explicitly ignore astropy's FITS VerifyWarning as well (already covered by
# the blanket filter above for as long as that filter stays installed).
warnings.simplefilter('ignore', category=VerifyWarning)

## Custom data client
The `radiosunpy` package provides a `Scrapper` class (in the `radiosunpy.scrapper` submodule) that helps you build a client for your own custom data source: all you need to do is prepare a base URL pattern for the files in your data source and implement your own data processing.

For example, let's look at the process of collecting and preparing active-region (AR) flare probability data from Solar Monitor.

In [93]:
# URL template of the daily Solar Monitor forecast files. The date placeholders
# (%Y, %m, %d) are filled in per day by Scrapper, as the printed URLs below show.
base_url = 'https://solarmonitor.org/data/%Y/%m/%d/meta/arm_forecast_%Y%m%d.txt'
# Raw string: `\d` and `\.` are regex escapes, not string escapes. In a normal
# string literal they trigger an "invalid escape sequence" warning.
regex_pattern = r'(arm_forecast_(\d{8})\.txt)'

In [100]:
# Preview the per-day file URLs that Scrapper forms from the URL template
# for a three-day time range.
scrapper = Scrapper(base_url, regex_pattern)
timerange = TimeRange('2017-09-03', '2017-09-05')
solar_monitor_urls = scrapper.form_fileslist(timerange)
for url in solar_monitor_urls:
    print(f'SolarMonitor url: {url}')

SolarMonitor url: https://solarmonitor.org/data/2017/09/03/meta/arm_forecast_20170903.txt
SolarMonitor url: https://solarmonitor.org/data/2017/09/04/meta/arm_forecast_20170904.txt
SolarMonitor url: https://solarmonitor.org/data/2017/09/05/meta/arm_forecast_20170905.txt


In [141]:
class FlareProbabilities:
    """Collect Solar Monitor flare-forecast text files into one astropy ``Table``.

    The file URLs are formed by ``Scrapper`` from ``base_url`` and
    ``regex_pattern``; each non-empty line of every file becomes one table row,
    prefixed with the date taken from the file's URL.
    """

    # URL template of the daily forecast files (date placeholders %Y, %m, %d).
    base_url = 'https://solarmonitor.org/data/%Y/%m/%d/meta/arm_forecast_%Y%m%d.txt'
    # Raw string: `\d` and `\.` are regex escapes, not string escapes.
    regex_pattern = r'(arm_forecast_(\d{8})\.txt)'

    def acquire_data(self, timerange: 'TimeRange') -> list[str]:
        """Return the forecast-file URLs that ``Scrapper`` forms for ``timerange``."""
        scrapper = Scrapper(self.base_url, self.regex_pattern)
        return scrapper.form_fileslist(timerange)

    def parse_date_from_url(self, url):
        """Extract the date from the ``/YYYY/MM/DD/`` segment of a URL path.

        Returns the date as a ``'YYYY-MM-DD'`` string, or ``None`` when the
        path contains no such segment.
        """
        path = urlparse(url).path
        match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', path)
        if match:
            year, month, day = match.groups()
            return f'{year}-{month}-{day}'
        return None

    def parse_field(self, field):
        """Convert one textual value to ``int``; the ``'...'`` placeholder maps to ``None``.

        Raises ``ValueError`` (from ``int``) when ``field`` is neither the
        placeholder nor an integer literal.
        """
        if field == '...':
            return None
        return int(field)

    def _parse_class_fields(self, token):
        """Split one ``a(b)(c)`` token into ``[a, b, c]`` parsed values.

        Sub-fields missing from the token are returned as ``None``.
        """
        pieces = token.split('(')
        values = [self.parse_field(pieces[0])]
        for idx in (1, 2):
            # Guard each sub-field by its own index: a token with only two
            # sub-fields yields None for the third instead of an IndexError.
            if len(pieces) > idx:
                values.append(self.parse_field(pieces[idx].rstrip(')')))
            else:
                values.append(None)
        return values

    def parse_line(self, line):
        """Parse one whitespace-separated forecast line into a flat list.

        The line is expected to hold a NOAA number, a McIntosh class and three
        ``a(b)(c)`` tokens (C-, M- and X-class, in that order). The result is
        ``[noaa_number, mcintosh_class, c_mcevol, c_mcstat, c_noaa,
        m_mcevol, m_mcstat, m_noaa, x_mcevol, x_mcstat, x_noaa]``.

        Raises ``IndexError`` when the line has fewer than five tokens.
        """
        parts = line.split()
        noaa_number = parts[0]
        mcintosh_class = parts[1]

        row = [noaa_number, mcintosh_class]
        # parts[2], parts[3], parts[4] carry the C-, M- and X-class values.
        for token in (parts[2], parts[3], parts[4]):
            row.extend(self._parse_class_fields(token))
        return row

    def form_data(self, file_urls: list[str]):
        """Download every URL and assemble all parsed lines into one ``Table``.

        Each row starts with the date parsed from the URL, followed by the
        values returned by ``parse_line``.
        """
        rows = []
        for url in file_urls:
            # Read the whole body first so the connection is released before parsing.
            with urlopen(url) as response:
                text = response.read().decode('utf-8')
            date_text = self.parse_date_from_url(url)
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                if line:  # skip blank lines
                    rows.append([date_text] + self.parse_line(line))

        column_names = [
            'Date', 'NOAA Number', 'McIntosh Class',
            'C-class_MCEVOL', 'C-class_MCSTAT', 'C-class_NOAA',
            'M-class_MCEVOL', 'M-class_MCSTAT', 'M-class_NOAA',
            'X-class_MCEVOL', 'X-class_MCSTAT', 'X-class_NOAA'
        ]
        return Table(rows=rows, names=column_names)

    def get_data(self, timerange):
        """Form the file URLs for ``timerange`` and return the parsed ``Table``."""
        file_urls = self.acquire_data(timerange)
        return self.form_data(file_urls)


In [143]:
# Build the client, choose the dates to fetch, and let the resulting table
# render as the cell output (bare last expression).
flares_table = FlareProbabilities()
timerange = TimeRange('2017-09-03', '2017-09-04')
flares_table.get_data(timerange)

Date,NOAA Number,McIntosh Class,C-class_MCEVOL,C-class_MCSTAT,C-class_NOAA,M-class_MCEVOL,M-class_MCSTAT,M-class_NOAA,X-class_MCEVOL,X-class_MCSTAT,X-class_NOAA
str10,str5,str3,object,int64,int64,object,int64,int64,object,int64,int64
2017-09-03,12673,Cso,10.0,17,5,0.0,3,1,0.0,0,1
2017-09-03,12674,Fhc,,0,80,,55,15,,0,1
2017-09-03,12675,Cro,7.0,13,10,1.0,2,1,0.0,0,1
2017-09-03,12676,Bxo,,6,5,,1,1,,0,1
2017-09-04,12673,Dsc,,0,40,,22,5,,10,1
2017-09-04,12674,Fhc,100.0,0,70,98.0,55,15,0.0,0,1
2017-09-04,12675,Cro,7.0,13,10,1.0,2,1,0.0,0,1
2017-09-04,12676,Bxo,9.0,6,5,1.0,1,1,0.0,0,1
2017-09-04,12677,Axx,,3,5,,1,1,,0,1


As a result, we parsed a table of Solar Monitor flare probabilities without implementing a special parser for this data source: the only thing we had to do was process the raw txt files.