In [None]:
#This program requires a stable Internet connection!
import requests #For html access
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup #For html parsing
! pip install tqdm
import tqdm

In [None]:
def download_data(month_time_list):
    '''
    Download neutron monitor data from nest.nmdb.eu for any number of specific months. For each month requested, every
    station in the hard-coded station list is queried, and the data for that month is written into a .txt file (CSV
    formatted, one column per station) in the current working directory.
    
    Arguments
    month_time_list: a list of shape (n,2) where n is the number of months of data you wish to get from nest.nmdb.eu. Syntax:
    [[month1,year1],[month2,year2],...,[monthN,yearN]]. Months and years should be of type int. Months index at 1.
    See the example below for the required structure of month_time_list.
    
    Returns
    Nothing is returned, but data is written to new files. Only data for stations with 44640 data points is written
    (44640 = minutes in a 31-day month). Stations with any other number of data points are omitted from the file. If no
    station qualifies for a month, no file is written for that month.
    
    Raises
    ValueError if any entry has a year before 1951 or a month outside 1-12. All entries are checked before the first
    download starts, so a bad entry never leaves a partially completed batch behind.
    
    ****WARNING!****
    This function will OVERWRITE existing files with the same name as the one it is trying to create!
    This function requires a stable Internet connection.
    
    Example
    >>>month_time_list = [[1,1951],[11,2019]]
    >>>download_data(month_time_list)
    This would create 2 separate files named "Jan_1951_NMDB.txt" and "Nov_2019_NMDB.txt" with the
    corresponding data for those months in each.
    
    '''
    #Names of all the stations for NMDB url access
    stations = ['AATA','BKSN','DOMC','INVK','JUNG','LMKS','MWSN','NEWK','PTFM','SOPB','TSMB','AATB','CALG','DRBS','IRK2','JUNG1',
            'MRCL','MXCO','NRLK','PWNK','SOPO','TXBY','APTY','CALM','ESOI','IRK3','KERG','MGDN','NAIN','NVBK','ROME','TERA',
            'YKTK','ARNM','DJON','FSMT','IRKT','KIEL','MOSC','NANM','OULU','SANB','THUL','ATHN','DOMB','HRMS','JBGO','KIEL2',
            'MRNY','NEU3','PSNM','SNAE','TIBT']
    months_dict = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}

    #NOTE(review): every request asks for days 1-31 and only stations returning exactly this many points are kept.
    #How the server answers for months shorter than 31 days is not visible from this code - confirm before changing.
    expected_measurements = 44640 #31 days * 24 hours * 60 minutes

    #Raise an error if any year or month will not be able to be accessed. This is done for the whole list up front
    #so that an invalid entry is reported before any (slow) download has started.
    for interval in month_time_list:
        this_month, this_year = interval[0], interval[1]
        if this_year < 1951 or this_month < 1 or this_month > 12:
            raise ValueError("Impossible month or a year before 1951")

    for interval in month_time_list:
        this_month, this_year = interval[0], interval[1]
        station_columns = {} #Maps station name -> list of counts/s for this month, in station-list order

        for station in tqdm.tqdm(stations):
            URL = ('http://www.nmdb.eu/nest/draw_graph.php?formchk=1&stations[]='+station
                   +'&tabchoice=revori&dtype=corr_for_efficiency&tresolution=0&force=1&yunits=0&date_choice=bydate'
                   +'&start_day=1&start_month='+str(this_month)+'&start_year='+str(this_year)
                   +'&start_hour=0&start_min=0'
                   +'&end_day=31&end_month='+str(this_month)+'&end_year='+str(this_year)
                   +'&end_hour=23&end_min=59&output=ascii&display_null=1')
            response = requests.get(url = URL) #Grab the url
            soup = BeautifulSoup(response.content,"lxml") #Parse the page contents
            if soup.code is None: #The data we want is in the <code></code> tag. No tag means no data for this station.
                continue

            this_station = _parse_nest_counts(soup.code.text)

            #Keep the data for this station only if it has the proper number of measurements
            if len(this_station) == expected_measurements:
                station_columns[station] = this_station

        #If any data was retrieved for this month, build the dataframe once and write it to a .txt file
        if station_columns:
            file_name = months_dict[this_month]+'_'+str(this_year)+'_NMDB.txt'
            pd.DataFrame(station_columns).to_csv(file_name)


def _parse_nest_counts(text):
    '''
    Extract the list of counts/s measurements from the text of one station's <code> block.
    
    Everything up to and including the header string 'start_date_time   RCORR_E' is ignored, since the header
    info before it is not always the same length. For each remaining non-blank line, the measurement is the last
    whitespace-separated token after the final ';' (or of the whole line if there is no ';').
    
    Arguments
    text: the string contents of the <code> tag.
    
    Returns
    A list of floats, one per non-blank data line. Values containing 'null' become NaN. If the header string is
    not found, an empty list is returned.
    '''
    header = 'start_date_time   RCORR_E' #This string is the last header info before the data begins
    start_position = text.find(header)
    if start_position == -1: #Without the header we cannot tell where the data begins
        return []

    counts = []
    for line in text[start_position+len(header):].splitlines():
        fields = line.rsplit(';', 1)[-1].split()
        if not fields: #Blank line (e.g. the remainder of the header line)
            continue
        value = fields[-1]
        if 'null' in value:
            counts.append(np.nan) #Rename null values to a parsable format
        else:
            counts.append(float(value)) #Ensures that the data point is a float
    return counts

In [None]:
#Use this cell to download data for a set of specific months, and have each month written to its own .txt file.
#Each entry is [month, year], with months indexed from 1.
download_data([[2,1956],[8,1956],[7,1959],[5,1960],[9,1960],[11,1960],[7,1961],[7,1966],[1,1967],[9,1968],[11,1968],[2,1969],
               [3,1969],[1,1971],[9,1971],[8,1972],[4,1973],[4,1976],[9,1977],[11,1977],[5,1978],[9,1978],[8,1979],[4,1981],
               [5,1981],[10,1981],[11,1982],[12,1982],[2,1984],[7,1989],[8,1989],[9,1989],[10,1989],[11,1989],[5,1990],[6,1991],
               [6,1992],[11,1992],[11,1997],[5,1998],[8,1998],[7,2000],[4,2001],[11,2001],[12,2001],[8,2002],[10,2003],[11,2003],
               [1,2005],[12,2006],[5,2012],[9,2017]])

In [None]:
def download_eps_flare_data(input_file_name='GOES_SEP_flares.txt', valid_flare_classes=['M','X'], min_longitude=60):
    '''
    Downloads all the months of data from nest.nmdb.eu that contain flare times in the dataframe created by the 
    read_EPS_data function in GOES_access_data.ipynb. Uses the download_data function in this notebook.
    
    The input file is read with its first column as the index. Each COLUMN of the resulting dataframe is treated
    as one event, and the rows labelled 'location', 'class', 'month' and 'year' are read for each event.
    
    Arguments
    input_file_name:
    The path of the input file. This file should have been created by the read_EPS_data function
    in GOES_access_data.ipynb.
    valid_flare_classes:
    List-like whose elements are strings corresponding to X-ray flare classes. These should be
    the classes you want to examine. For instance, if you only want to examine C-class flares, the only element of 
    this object should be 'C'. More energetic flares are more likely to be associated with neutron production. By
    default, this is equal to ['M','X']. This does not support subclasses, e.g. valid_flare_classes=['M5'] is not
    allowed, because only the first character of each event's class is compared.
    min_longitude: The minimum absolute value of longitude above which to consider events (strictly greater than).
    X-ray events on the solar limb (closer to min_longitude=90) may be more likely to be associated with neutron
    signals in NMDB. Can be of type float or int.
    
    Returns
    Does not return anything, but instead calls download_data once with every distinct [month, year] that has a
    qualifying event, which writes the relevant months of data to files.
    '''

    months_dict = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
    num_months_dict = {value: key for key, value in months_dict.items()} #Invert months_dict

    flare_df = pd.read_csv(input_file_name, index_col=0) #Read in the dataframe created by read_EPS_data

    flare_months = [] #Distinct [month, year] pairs. Will be input to download_data later
    for event in flare_df:
        event_fields = flare_df[event]
        location = event_fields['location']
        flare_class = event_fields['class']

        #Any location or class that is missing (NaN) or otherwise not text will not be considered
        if not isinstance(location, str) or not isinstance(flare_class, str) or not flare_class:
            continue

        #The last two characters of the location are taken to be the absolute longitude. Sometimes they are a
        #nonnumerical string because of bad parsing of flare_df's parent file; this ignores the location if so.
        #Zero-padded values such as '05' are accepted.
        longitude_text = location[-2:]
        if not (longitude_text.isdecimal() and int(longitude_text) <= 90):
            continue
        abs_longitude = int(longitude_text)

        if flare_class[0] in valid_flare_classes and abs_longitude > min_longitude:
            #The considered event has the requested flare class and longitude...
            month_year = [num_months_dict[event_fields['month']], int(event_fields['year'])]
            if month_year not in flare_months:
                #...and its month has not already been queued, so append its month and year for later downloading
                flare_months.append(month_year)

    download_data(flare_months)

In [None]:
#Use this cell to download data via download_eps_flare_data.
#Called with no arguments, it reads 'GOES_SEP_flares.txt' and uses the default flare classes ['M','X'] and min_longitude=60.
download_eps_flare_data()