In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that forces the ".container" element to 100% width,
# so wide outputs are not squeezed into the default page column.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import calendar
import csv
import json
import os
import shutil
import urllib.request
from typing import Tuple

import pandas as pd
import requests

# Empty placeholder; nothing in the visible code reads or writes it.
# NOTE(review): possibly used by cells outside this view - confirm before removing.
missing_infos = {}

# Launch year per satellite, keyed by "goesNN" (two-digit, zero-padded number).
# check_data_existance_and_save_in_report() looks a satellite up here and only
# probes the archive when the requested year is >= the stored launch year.
launch_years = {
    "goes01": 1975,
    "goes02": 1977,
    "goes03": 1978,
    "goes04": 1980,
    "goes05": 1981,
    "goes06": 1983,
    "goes07": 1987,
    "goes08": 1994,
    "goes09": 1995,
    "goes10": 1997,
    "goes11": 2000,
    "goes12": 2001,
    "goes13": 2006,
    "goes14": 2009,
    "goes15": 2010,
    "goes16": 2016,
    "goes17": 2018,
    "goes18": 2022,
}
def get_year_range(start_yr: int, end_yr: int) -> list[int]:
    """Return every year from ``start_yr`` through ``end_yr``, both inclusive.

    An empty list is returned when ``start_yr`` is greater than ``end_yr``.
    """
    last_exclusive = end_yr + 1
    return list(range(start_yr, last_exclusive))

def get_month_ranges_in_year(year: int) -> Tuple[list[tuple[str, str]], list[str]]:
    """Return the first/last day of every month in ``year`` and the month numbers.

    The month lengths come from the standard-library calendar instead of a
    pandas month-end frequency alias, because that alias was renamed between
    pandas releases ('M' -> 'ME') and the old spelling is deprecated.

    Args:
        year: Calendar year to expand (leap years are handled).

    Returns:
        A pair ``(dates_in_month, month_numbers)`` where

        * ``dates_in_month`` holds twelve ``(first_day, last_day)`` tuples
          formatted as ``YYYYMMDD`` strings, January first;
        * ``month_numbers`` holds the matching zero-padded month strings
          ``"01"`` .. ``"12"``.
    """
    month_numbers = []
    dates_in_month = []
    for month in range(1, 13):
        month_number = f"{month:02d}"
        # monthrange() returns (weekday of day 1, number of days in month).
        last_day = calendar.monthrange(year, month)[1]
        month_numbers.append(month_number)
        dates_in_month.append(
            (f"{year}{month_number}01", f"{year}{month_number}{last_day:02d}")
        )

    return dates_in_month, month_numbers
 
def build_file_names(st_name: str, start: str, end: str) -> list[str]:
    """Return the two candidate CSV file names for one satellite and date span.

    The first entry is the ``xrs_1m`` file and the second the ``xrs_1m_3s``
    file, both covering ``start`` to ``end``.
    """
    prefix = f"g{st_name}"
    span = f"{start}_{end}.csv"
    return [f"{prefix}_{product}_{span}" for product in ("xrs_1m", "xrs_1m_3s")]

def build_file_URL(st_name: str, year: int, month: str, file_name: str) -> str:
    """Return the archive URL of ``file_name`` for a satellite, year and month.

    The path layout is ``<base>/<year>/<month>/goes<st_name>/csv/<file_name>``.
    """
    base_url = "https://www.ncei.noaa.gov/data/goes-space-environment-monitor/access/avg"
    satellite_dir = f"goes{st_name}"
    return f"{base_url}/{year}/{month}/{satellite_dir}/csv/{file_name}"

def download_file(year: int, url: str, file_name: str) -> None:
    """Download ``url`` and store it as ``data/<year>/<file_name>``.

    The target directory is created on demand, so the function also works
    when the ``data/<year>`` folder has not been prepared beforehand.

    Args:
        year: Year used as the sub-folder name below ``data``.
        url: Location of the file to fetch.
        file_name: Name of the local file to write.
    """
    print(url)
    target_dir = os.path.join("data", str(year))
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, file_name)
    with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

def is_file_exist(file_url: str, timeout: float = 30.0) -> bool:
    """Return True when a HEAD request to ``file_url`` answers with HTTP 200.

    Args:
        file_url: URL to probe.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled connection cannot block the whole scan.

    Returns:
        ``True`` for a 200 response; ``False`` for any other status code,
        for connection failures and for timeouts.
    """
    try:
        response = requests.head(file_url, timeout=timeout)
    except (requests.ConnectionError, requests.Timeout):
        return False
    # Explicit boolean: non-200 answers previously fell through and gave None.
    return response.status_code == 200

    
# Accumulates one "satellite, year, month, url" record per file found; it is
# shared across calls so the report written below covers every year processed.
valid_file_urls = []


def check_data_existance_and_save_in_report(year: int) -> None:
    """Probe the archive for every monthly file of ``year`` and save the hits.

    For each month of the year and each satellite number 02-18 that was
    already launched in ``year``, both candidate file names are turned into
    URLs and checked with ``is_file_exist``. Every URL that exists is printed
    and appended to ``valid_file_urls``; the accumulated list is then written
    to ``file_urls.csv`` in a single ``file_urls`` column.
    """
    month_spans, month_codes = get_month_ranges_in_year(year)
    satellite_ids = [f"{number:02d}" for number in range(2, 19)]

    for month_code, (first_day, last_day) in zip(month_codes, month_spans):
        for sat_id in satellite_ids:
            satellite = f"goes{sat_id}"
            if year < launch_years[satellite]:
                # Satellite was not in orbit yet, nothing to look for.
                continue
            for candidate in build_file_names(sat_id, first_day, last_day):
                file_url = build_file_URL(sat_id, year, month_code, candidate)
                if not is_file_exist(file_url):
                    continue
                print(satellite, year, month_code, file_url)
                valid_file_urls.append(f"{satellite}, {year}, {month_code}, {file_url}")

    report = pd.DataFrame(valid_file_urls, columns=["file_urls"])
    report.to_csv("file_urls.csv", index=False)

def post_process_history() -> None:
    """Turn ``file_urls.csv`` into the tabular ``existing_data_history.csv``.

    Each entry of the ``file_urls`` column is a ``"satellite, year, month, url"``
    record. The record is split into its four fields, surrounding whitespace
    is removed (the records are joined with ", ", so a bare split would leave
    a leading space on year, month and URL), and the file name is taken from
    the last path segment of the URL. Records whose URL contains ``_1m_3s_``
    are skipped.

    Raises:
        ValueError: If a record does not contain four comma-separated fields.
    """
    df = pd.read_csv("file_urls.csv")
    dict_history = {
        "satelite": [],
        "year": [],
        "month": [],
        "file_name": [],
        "file_url": []
    }
    for record in df["file_urls"].tolist():
        # maxsplit=3 keeps the URL intact even if it contains a comma itself.
        satelite, year, month, file_url = (
            field.strip() for field in record.split(",", 3)
        )
        if "_1m_3s_" in file_url:
            continue
        dict_history["satelite"].append(satelite)
        dict_history["year"].append(year)
        dict_history["month"].append(month)
        dict_history["file_name"].append(file_url.split("/")[-1])
        dict_history["file_url"].append(file_url)

    df_existing_data_history = pd.DataFrame(dict_history)
    df_existing_data_history.to_csv("existing_data_history.csv", index=False)

def main() -> None:
    """Ask for a year range, scan every year and write the history report.

    The start and end year are read from standard input. For each year in the
    inclusive range ``check_data_existance_and_save_in_report`` is run, and
    ``post_process_history`` is called once at the end. The function returns
    early, after printing a message, when the input is not a whole number or
    when the start year lies after the end year.
    """
    try:
        start_yr = int(input("Enter start year:"))
        end_yr = int(input("Enter end year:"))
    except ValueError:
        # Report bad input instead of ending with a traceback.
        print("Start year and end year must be whole numbers")
        return

    if start_yr > end_yr:
        print("Start year can not be greater than end year")
        return

    for yr in get_year_range(start_yr, end_yr):
        print("Running for year:", yr)
        check_data_existance_and_save_in_report(yr)
        print(f"Processed year: {yr}")
    post_process_history()

# Start the interactive workflow only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()