In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

## Solar cycle history
Solar Cycle 22: October 1986 to June 1996  
Solar Cycle 23: June 1996 to December 2008  
Solar Cycle 24: December 2008 to December 2019  
Solar Cycle 25: December 2019 to (projected) 2030  

In [None]:
import os
import csv
import json
import shutil
import requests
import pandas as pd
import urllib.request
from typing import Tuple

# Root directory under which the per-year folders and the per-year summary
# JSON files are written.
DATA_DIR = "data_from_sol_22_raw"
# NOTE(review): download_data_by_year() creates its own local dict with this
# name, so this module-level dict appears unused in the visible code - confirm
# before relying on it.
missing_infos = {}

def create_folder(year: int):
    """
    Make sure the directory `DATA_DIR/<year>` exists.
    Missing parent directories are created; an existing directory is left as is.
    """
    year_dir = f"{DATA_DIR}/{year}"
    os.makedirs(year_dir, exist_ok=True)

def download_file(year: int, url: str, file_name: str, timeout: float = 60.0) -> None:
    """
    Download `url` and store it as `DATA_DIR/<year>/<file_name>`.

    The payload is first streamed into a sibling `<file_name>.part` file and only
    renamed to its final name once the transfer finished, so an interrupted
    download never leaves a truncated file under the final name.

    year: sub-folder of DATA_DIR the file belongs to (created if missing)
    url: location to fetch
    file_name: name of the local file
    timeout: seconds to wait on blocking network operations before giving up

    Errors raised by urllib / the file system are propagated to the caller.
    """
    target_dir = f"{DATA_DIR}/{year}"
    destination = f"{target_dir}/{file_name}"
    partial = f"{destination}.part"

    os.makedirs(target_dir, exist_ok=True)
    try:
        # without a timeout a stalled connection would block forever
        with urllib.request.urlopen(url, timeout=timeout) as response, open(partial, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    except BaseException:
        # drop the incomplete temporary file, then let the original error surface
        try:
            os.remove(partial)
        except FileNotFoundError:
            pass
        raise
    # rename only after a complete transfer; replaces any older copy
    os.replace(partial, destination)

def get_continuous_ranges(nums: list) -> list[Tuple]:
    """
    Collapse a collection of integers into inclusive (start, end) runs.

    Duplicates are ignored and the input does not need to be sorted. Two values
    belong to the same run when they differ by at most 1.
    Example: [1, 2, 3, 7, 8, 10] -> [(1, 3), (7, 8), (10, 10)]
    An empty input yields an empty list.
    """
    ordered = sorted(set(nums))
    if not ordered:
        return []

    ranges = []
    start = previous = ordered[0]
    # single pass; avoids flattening nested lists with sum(), which is quadratic
    for value in ordered[1:]:
        if previous + 1 < value:
            # gap found: close the current run and open a new one
            ranges.append((start, previous))
            start = value
        previous = value
    ranges.append((start, previous))
    return ranges
    
def get_missing_info(file_path: str, found_data_key: str) -> Tuple[list[str], list[int]]:
    """
    Describe the runs of missing values in a pre-processed csv file.

    A row counts as missing when the checked column equals -99999. The checked
    column is "A_AVG" for the "starting_point_new" layout and "xl" for the
    "starting_point_old" / "starting_point_old_v2" layouts.

    file_path: csv file to inspect
    found_data_key: name of the column layout the file was written with

    Returns two parallel lists:
        - "<first> to <last>" labels built from the first column of the first
          and last row of each missing run
        - the number of rows in each missing run

    Raises ValueError when `found_data_key` is not a known layout.
    """
    column_by_key = {
        "starting_point_new": "A_AVG",
        "starting_point_old": "xl",
        "starting_point_old_v2": "xl",
    }
    if found_data_key not in column_by_key:
        # previously this fell through and crashed later on an unbound variable
        raise ValueError(f"Unknown data layout: {found_data_key!r}")
    col = column_by_key[found_data_key]

    df = pd.read_csv(file_path)
    missing_indexes = df.index[df[col] == -99999].tolist()

    list_missing_intervals = []
    list_missing_length = []
    for first, last in get_continuous_ranges(missing_indexes):
        # column 0 holds the value used to label the run boundaries
        st = df.iloc[first, 0]
        end = df.iloc[last, 0]
        list_missing_intervals.append(f"{st} to {end}")
        list_missing_length.append(last - first + 1)

    return list_missing_intervals, list_missing_length

def pre_process_data(missing_infos: dict, satelite: str, year: int, month: str, file_name: str) -> dict:
    """
    Extract the important rows and save them as csv
    Also get some information about missing values

    The file `DATA_DIR/<year>/<file_name>` is scanned for one of the known header
    rows; everything before that header is dropped and the rows after it are
    written back to the same path (the file is overwritten in place).

    missing_infos: summary dict to update; it is modified and also returned
    satelite, month: combined as "<satelite>_<month>" to form the summary key
    year, file_name: locate the file below DATA_DIR

    The summary entry holds the file name, the total number of rows, the number
    of missing rows, the length of each missing interval and the interval labels.

    Raises ValueError when none of the known header rows is present; in that
    case the file on disk is left untouched.
    """
    csv_filename_with_dir = f"{DATA_DIR}/{year}/{file_name}"
    original_data_dict = {
        "starting_point_new" : ['time_tag', 'A_QUAL_FLAG', 'A_NUM_PTS', 'A_AVG', 'B_QUAL_FLAG', 'B_NUM_PTS', 'B_AVG'],
        "starting_point_old" : ['time_tag', 'xl', 'xs'],
        "starting_point_old_v2" : ['time_tag', 'xs', 'xl'],
    }
    # reverse lookup: exact header row -> layout name
    header_to_key = {tuple(columns): key for key, columns in original_data_dict.items()}

    data_dict = {}
    found_data_key = ""
    with open(csv_filename_with_dir, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            matched_key = header_to_key.get(tuple(row))
            if matched_key is not None:
                # header row: (re)start collecting the columns of this layout
                found_data_key = matched_key
                for col in original_data_dict[found_data_key]:
                    data_dict[col] = []
                continue

            # skip the preamble before the header and blank lines
            # (csv.reader yields [] for them, which would break row[i])
            if not found_data_key or not row:
                continue

            for i, col in enumerate(original_data_dict[found_data_key]):
                data_dict[col].append(row[i])

    if not found_data_key:
        # fail before the downloaded file is overwritten with an empty table
        raise ValueError(f"No known header row found in {csv_filename_with_dir}")

    df = pd.DataFrame.from_dict(data_dict)
    df.to_csv(csv_filename_with_dir, index = False)

    list_missing_intervals, list_missing_length = get_missing_info(csv_filename_with_dir, found_data_key)
    missing_infos[f"{satelite}_{month}"] = {
        "file_name": f"{year}/{file_name}",
        "number of total rows": df.shape[0],
        "number of missing rows": sum(list_missing_length),
        "missing rows each interval": list_missing_length,
        "missing intervals": list_missing_intervals,
    }

    return missing_infos
    
def download_data_by_year(start_year: int = 1986, end_year: int = 2020, history_csv: str = "existing_data_history.csv"):
    """
    Need some steps to get the data from www.ncei.noaa.gov
        -> Need date range of the year by month
        -> Need the filename
        -> Need the URL for file in NOAA website
        -> Download the data

    For every year from `start_year` to `end_year` (both inclusive) the rows of
    `history_csv` whose "year" column matches are downloaded and pre-processed,
    and a `DATA_DIR/<year>_summary.json` file with the missing-value summary is
    written. A folder and a summary are produced even for years without rows.

    Each row of `history_csv` is unpacked positionally as
    (satelite, year, month, file_name, url); month must be an integer because
    it is zero-padded to two digits.
    """
    df = pd.read_csv(history_csv)

    for year in range(start_year, end_year + 1):
        print(f"Downloading data for {year}")
        missing_infos = {}
        #create folder to hold data
        create_folder(year)

        rows_as_list = df[df["year"] == year].values.tolist()
        # the row's own year equals `year` (that is the filter above); it gets a
        # separate name so the outer loop variable is not rebound
        for satelite, _row_year, month, file_name, url in rows_as_list:
            month = f"{month:02d}"
            download_file(year, url, file_name)
            print(f"Download completed of {file_name}, url: {url}")

            missing_infos = pre_process_data(missing_infos, satelite, year, month, file_name)
            print(f"Pre-processing completed of {file_name}")
            print("......")

        print(f"saving summary for year {year}")
        with open(f"{DATA_DIR}/{year}_summary.json", "w") as fp:
            json.dump(missing_infos, fp)

        print(f"--------End of {year}--------")

# Run the download and pre-processing for all years as soon as this code executes.
download_data_by_year()