In [None]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
def read_csv_data(years, DATAPATH=Path('./raw_data/SWGIM_year/')):
    """Load yearly SWGIM CSV files and keep only a small set of columns.

    For every entry in ``years`` the file ``DATAPATH/<year>.csv`` is read with
    a six-row column header. Column 0 and every column from position 9 up to
    10234 are dropped, except the six columns that correspond to the reserved
    (lat, lng) points below. The 14 surviving columns are then given flat
    names and all years are stacked row-wise.

    Parameters
    ----------
    years : iterable
        Values used to build the ``<year>.csv`` file names.
    DATAPATH : pathlib.Path
        Directory holding the yearly CSV files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the trimmed yearly frames (each frame keeps
        its own row index).
    """
    reserved_point = [(67.5, -65), (25, 120), (0, -90), (-20, -160), (-32.5, 20), (-77.5, 165)]

    # Positional column index of each reserved point:
    # 10 + (row counted from lat 87.5 in 2.5 steps) * 72 + (1-based column counted from lng -180 in 5 steps).
    keep_positions = {
        10 + int((87.5-lat)/2.5)*72 + int((180+lng)/5+1)
        for lat, lng in reserved_point
    }

    # Everything to discard: column 0 plus columns 9..10234 minus the reserved ones.
    droplist = [pos for pos in [0, *range(9, 10235)] if pos not in keep_positions]

    # Flat names for the surviving columns; the point columns are named by their (lat, lng) tuple.
    renamelist = ['year', 'DOY', 'hour', 'Kp index',
                  'R', 'Dst-index, nT', 'ap_index, nT', 'f10.7_index'] + list(reserved_point)

    print('Reading csv data...')
    df_list = []
    for year in tqdm(years):
        raw_df = pd.read_csv(DATAPATH / Path(f'{year}.csv'), header=list(range(6)))

        # Drop by label looked up from position; labels that are missing are ignored.
        trimmed_df = raw_df.drop(columns=raw_df.columns[droplist], errors='ignore')
        trimmed_df.columns = renamelist

        df_list.append(trimmed_df)

    return pd.concat(df_list, axis=0)

In [None]:
import pandas as pd
# Round-trip both single-point CSVs through pandas (test file first, then train),
# overwriting each file in place. After the loop `train_df` holds the train split.
# NOTE(review): to_csv writes the row index by default, so every run of this
# cell adds one more unnamed index column to each file — confirm this is intended.
for csv_path in ('./single_point_test.csv', './single_point_train.csv'):
    train_df = pd.read_csv(csv_path)
    train_df.to_csv(csv_path)

In [None]:
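# Five-digit codes of the CODE ionosphere files (CODnnnnn.ION.Z) covering each year;
# they appear to be GPS week (4 digits) + day of week (1 digit) — TODO confirm.
# 2015: 18254 - 18774
# 2016: 18775 - 19296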

In [12]:
import os
import sys
import threading
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pandas as pd
import requests
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

def ion2list(ion_filename):
    """Parse one uncompressed, fixed-width ``.ION`` text file.

    The file is treated as 24 consecutive hourly records of 12357 characters.
    Inside each record, 256 lines of 42 characters follow a 1604-character
    header; every line contributes one value (characters 19-29) and one RMS
    figure (characters 35-40).

    Parameters
    ----------
    ion_filename : str or path-like
        Path of the uncompressed ``.ION`` file.

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(doy, data)`` where ``doy`` is the integer read from characters
        38-40 of the file and ``data`` has shape ``(24, 512)``: for each hour
        the 256 values followed by the 256 RMS figures.

    Raises
    ------
    ValueError
        If a numeric field cannot be parsed (e.g. the file is truncated).
    """
    # Fixed-width layout, in characters.
    hour_len = 12357
    line_len, header_len = 42, 1604
    lines_per_hour = 256

    # Context manager so the handle is closed deterministically; this function
    # is run from many worker threads and must not leak descriptors.
    with open(ion_filename, 'r') as ion_file:
        daily_text = ion_file.read()

    doy = daily_text[38:38+3]

    daily_list = []
    for hour in range(24):
        hourly_text = daily_text[hour_len*hour:hour_len*(hour+1)]

        value_list, rms_list = [], []
        for l in range(lines_per_hour):
            start = header_len + l * line_len
            line_text = hourly_text[start:start + line_len]

            value_list.append(float(line_text[19:19+11]))
            rms_list.append(float(line_text[35:35+6]))

        # One row per hour: all values first, then all RMS figures.
        daily_list.append(value_list + rms_list)

    return int(doy), np.array(daily_list) # 24,512
   
def download_ion_file(year, ion_filename):
    """Download one compressed ION file into the working directory and unpack it.

    Fetches ``<baseurl>/<year>/<ion_filename>``, stores the payload under
    ``ion_filename`` in the current directory and runs the external
    ``uncompress`` tool on it.

    Parameters
    ----------
    year : int or str
        Sub-directory of the download server.
    ion_filename : str
        Name of the compressed file (e.g. ``'COD18260.ION.Z'``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; nothing is written then.
    """
    import subprocess  # local import: not part of the file-level import block

    baseurl =  'http://ftp.aiub.unibe.ch/CODE'

    response = requests.get(f"{baseurl}/{year}/{ion_filename}")
    # Fail here on 4xx/5xx instead of saving an HTML error page as the archive.
    response.raise_for_status()

    # Close (and flush) the archive before the external tool reads it.
    with open(ion_filename, 'wb') as archive:
        archive.write(response.content)

    # unzip .Z file; argument list instead of a shell string so the file name
    # is never interpreted by a shell. The exit status is ignored, as before.
    subprocess.run(['uncompress', ion_filename], check=False)
              
    # os.system(f'rm {ion_filename}')

# download_ion_file(2015, 'COD18260.ION.Z')

def daily_data_2_df(year, ion_filename, SWGIM_filename, label_storm_df, header_df):
    """Merge storm labels and ION coefficients into one daily SWGIM CSV, in place.

    Steps: read the daily CSV, download and parse the matching ION file,
    delete the unpacked ION file, write the two label columns and the
    ``SH coef`` / ``SH coef RMSE`` columns, replace the column index with
    ``header_df.columns`` and overwrite ``SWGIM_filename``.

    Parameters
    ----------
    year : int
        Year used for the download URL and for selecting label rows.
    ion_filename : str
        Compressed ION file name ending in ``.Z``.
    SWGIM_filename : str or path-like
        Daily CSV (six header rows, first column is the index); overwritten.
    label_storm_df : pandas.DataFrame
        Must contain ``'year'`` and ``'DOY'`` columns; positional columns 3
        and 4 are expected to be ``'Geomagnetic Storms Size'`` and
        ``'Geomagnetic Storms State'``.
    header_df : pandas.DataFrame
        Only its ``columns`` are used, as the final column index.
        NOTE(review): this requires the final column count to match exactly.
    """
    # read SWGIM df
    SWGIM_day = pd.read_csv(SWGIM_filename, index_col=0, header=list(range(6))).reset_index(drop=True)

    # download ion file
    download_ion_file(year, ion_filename)

    # read ion file (name without the trailing '.Z'), then remove it
    ion_path = ion_filename[:-2]
    doy, daily_ion_list = ion2list(ion_path)
    os.remove(ion_path)

    # Label rows for this (year, day of year); index reset so rows align positionally.
    same_day = (label_storm_df['year'] == year) & (label_storm_df['DOY'] == doy)
    label_daily_df = label_storm_df.loc[same_day].iloc[:,3:5].reset_index(drop=True)

    # fill label
    SWGIM_day[('label', 'Geomagnetic Storms Size')] = label_daily_df['Geomagnetic Storms Size']
    SWGIM_day[('label', 'Geomagnetic Storms State')] = label_daily_df['Geomagnetic Storms State']

    # Order sequence 0, 1, -1, 2, -2, ..., 15, -15; degree d uses its first 2*d+1 entries.
    orderlist = [0]
    for positive, negative in zip(range(1, 16), range(-1,-16,-1)):
        orderlist += [positive, negative]

    # (degree, order) keys in the same sequence as the columns of the ION array.
    sh_keys = [(str(degree), str(order))
               for degree in range(16)
               for order in orderlist[:2*degree+1]]
    n_coef = len(sh_keys)

    # SH coef.: first half of each ION row.
    for idx, (degree, order) in enumerate(sh_keys):
        SWGIM_day[('CODE','SH coef','-','f2.8',degree,order)] = daily_ion_list[:,idx]

    # SH coef. RMSE: second half of each ION row. The offset is essential —
    # without it these columns would just repeat the coefficient values.
    for idx, (degree, order) in enumerate(sh_keys):
        SWGIM_day[('CODE','SH coef RMSE','-','f1.4',degree,order)] = daily_ion_list[:,n_coef + idx]

    SWGIM_day.columns = header_df.columns
    SWGIM_day.to_csv(SWGIM_filename)
    
# label_storm_df = pd.read_csv('./raw_data/23label_all_-10.csv', index_col=0, usecols=list(range(6)))
# headers df
# header_df = pd.read_csv('./raw_data/SWGIM_headers.csv', index_col=0, header=list(range(6)))    
# daily_data_2_df(2015, 'COD18256.ION.Z', './raw_data/SWGIM_day/2015/2015003.csv', label_storm_df, header_df)
# print(label_storm_df.head())

import re
import bs4
from bs4 import BeautifulSoup

def get_filenames(year):
    """List the compressed ION file names linked from a year's directory page.

    Parameters
    ----------
    year : int or str
        Sub-directory of the download server to list.

    Returns
    -------
    list[str]
        Every anchor text that starts with ``COD`` + five digits + ``.ION.Z``
        (only the matched part is returned), in page order.
    """
    baseurl = 'http://ftp.aiub.unibe.ch/CODE'
    listing = requests.get(f'{baseurl}/{year}')

    soup = BeautifulSoup(listing.text,'html.parser')

    # Raw string: '\d' and '\.' in a normal literal are invalid escape
    # sequences and trigger a warning on recent Python versions.
    ion_pattern = re.compile(r'COD\d{5}\.ION\.Z')

    matches = (ion_pattern.match(a.text) for a in soup.find_all('a'))
    filenames = [m.group() for m in matches if m is not None]

    return filenames
# get_filenames(2015)

def main():
    """Fill every daily SWGIM CSV for 2016-2021 with labels and ION data.

    Reads the storm-label table and the header template once, builds one
    ``(year, ion_filename, csv_path)`` task per ION file listed on the server
    and runs ``daily_data_2_df`` for each task in a worker thread, keeping at
    most ``thread_limit`` workers alive at a time.
    """
    # read storm df
    label_storm_df = pd.read_csv('./raw_data/23label_all_-10.csv', index_col=0, usecols=list(range(6))).reset_index(drop=True)

    # headers df
    header_df = pd.read_csv('./raw_data/SWGIM_headers.csv', index_col=0, header=list(range(6)))

    SWGIM_path = './raw_data/SWGIM_day'

    # NOTE(review): the CSV name is derived from the position of the ION file
    # in the server listing (idx+1 used as day of year); this presumes the
    # listing is complete and in chronological order — confirm.
    args_list = [(year, a, f'{SWGIM_path}/{year}/{year}{idx+1:03d}.csv')
        for year in range(2016,2022) for idx, a in enumerate(get_filenames(year))]

    threads = []
    thread_limit = 10

    for args in tqdm(args_list):
        # Throttle on our own workers only. threading.active_count() also
        # counts the main thread and unrelated background threads, which
        # lowers the real limit and can hit an empty `threads` list.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) >= thread_limit:
            threads.pop(0).join()

        worker = threading.Thread(target = daily_data_2_df,
            args = (*args, label_storm_df, header_df))
        worker.start()
        threads.append(worker)

    # Wait for the remaining workers.
    for thread in tqdm(threads):
        thread.join()

# Run the whole download-and-merge pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    main()

100%|██████████| 2192/2192 [1:05:14<00:00,  1.79s/it]
100%|██████████| 6/6 [00:04<00:00,  1.28it/s]


In [43]:
# Rebuild the yearly SWGIM CSVs (2018-2021) from the per-day files.
basepath = Path('./raw_data/SWGIM_day')

# Six-row header template. It is loaded but not applied below: the column
# re-labelling step that used it is currently disabled.
header_df = pd.read_csv('./raw_data/SWGIM_headers.csv', index_col=0, header=list(range(6)))

for year in range(2018,2022):
    print(year)

    # Daily files of this year in sorted name order.
    daily_paths = sorted((basepath / f'{year}').glob('*'))

    year_list = []
    for p in tqdm(daily_paths):
        df = pd.read_csv(p, index_col=0, header=list(range(6)))
        # Each daily file is written back to the same path after loading.
        df.to_csv(p)
        year_list.append(df)

    # Stack the days with a fresh 0..n-1 row index and store the yearly file.
    year_df = pd.concat(year_list, ignore_index=True)
    year_df.to_csv(f'./raw_data/SWGIM_year/{year}.csv')



2018


100%|██████████| 365/365 [06:15<00:00,  1.03s/it]


2019


100%|██████████| 365/365 [06:14<00:00,  1.03s/it]


2020


100%|██████████| 366/366 [06:16<00:00,  1.03s/it]


2021


100%|██████████| 365/365 [06:17<00:00,  1.03s/it]


In [28]:
# Quick look at the last yearly frame built above.
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() only appends a stray "None" line to the output.
year_df.info()
print(year_df.head())
# print(year_df.iloc[24:72:2])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Columns: 10234 entries, ('UTC', 'year', '-', 'I4', 'global', 'global') to ('CODE', 'GIM RMS', '10TEC', 'I3', '-87.5', '175')
dtypes: float64(10227), int64(7)
memory usage: 685.8 MB
None
Source                UTC                OMNIWeb                            \
Feature              year    DOY   hour Kp index           R Dst-index, nT   
unit                    -      -      -        - Sunspot No.            nT   
format                 I4     I3     I2       I2          I3            S3   
location latitude  global global global   global      global        global   
location longitude global global global   global      global        global   
0                    2000    238      0        0         112            -9   
1                    2000    238      1        0         112            -7   
2                    2000    238      2        0         112            -7   
3                    2000    238      