## Datenanalyse: 01 Datenaufbereitung

### 01 Datenaufbereitung

#### Get data about solar flares (sf) for each year and create one CSV file with all data

documentation of columns: https://www.ngdc.noaa.gov/stp/space-weather/solar-data/solar-features/solar-flares/x-rays/goes/xrs/documentation/miscellaneous/software/xraydatareports.pro

Data source: https://www.ngdc.noaa.gov/stp/space-weather/solar-data/solar-features/solar-flares/x-rays/goes/xrs/

In [4]:
from pathlib import Path

import pandas as pd
import requests

# Download the yearly GOES XRS solar-flare reports, cache every raw report on
# disk, cut the fixed-width report lines into named columns and write all
# years into one semicolon-separated CSV file.

# Layout of one report line as (column name, slice start, slice stop).
# The positions are 0-based string slices into the raw line.
FIELD_SLICES = [
    ('Data code', 0, 2),
    ('Station code', 2, 5),
    ('Year', 5, 7),
    ('Month', 7, 9),
    ('Day', 9, 11),
    ('Astriks', 11, 13),
    ('Start time', 13, 17),
    ('End time', 18, 22),
    ('Max time', 23, 27),
    ('N or S', 28, 29),
    ('Latitude N/S', 29, 31),
    ('E or W', 31, 32),
    ('Central mer. distance', 32, 34),
    ('SXI', 34, 37),
    ('X-ray Class', 59, 60),
    ('X-ray intensity', 60, 63),
    ('Station Name', 67, 71),
    ('Integrated flux', 72, 80),
    ('Sunspot region nr', 80, 85),
    ('Year CMP', 86, 88),
    ('Month CMP', 88, 90),
    ('Day CMP', 90, 94),
    ('Total region area', 95, 102),
    ('Total intensity', 103, 110),
]

# Column order of the combined CSV: the sliced fields first, then the
# four-digit year taken from the download loop.
OUTPUT_COLUMNS = [name for name, _, _ in FIELD_SLICES] + ['Year_long']

url = 'https://www.ngdc.noaa.gov/stp/space-weather/solar-data/solar-features/solar-flares/x-rays/goes/xrs/goes-xrs-report_YEAR.txt'

# The raw reports are cached here; create the folder so the first run works.
raw_dir = Path('data_solar_flare_raw')
raw_dir.mkdir(parents=True, exist_ok=True)

yearly_frames = []
for year in range(1975, 2017):  # the range end is exclusive: 1975..2016
    txt = requests.get(url.replace("YEAR", str(year)), timeout=60)
    # Fail loudly instead of caching and parsing an HTTP error page as data.
    txt.raise_for_status()

    with open(raw_dir / ('solar_flare_raw_' + str(year) + '.csv'), 'w', encoding='utf-8') as f:
        f.write(txt.text)

    # The report is fixed-width text, not CSV: slice the raw lines directly so
    # that a comma or quote inside a line cannot split or merge records.
    report_lines = pd.Series(
        [line for line in txt.text.splitlines() if line.strip()],
        dtype=object,
    )
    df_solar_flare = pd.DataFrame(
        {name: report_lines.str[start:stop] for name, start, stop in FIELD_SLICES}
    )
    df_solar_flare['Year_long'] = year

    yearly_frames.append(df_solar_flare)

# Concatenate once at the end; growing a frame inside the loop copies all
# previously collected rows on every iteration.
df_sf_concat = pd.concat(yearly_frames, ignore_index=True)[OUTPUT_COLUMNS]

df_sf_concat.to_csv('solar_flare_new.csv', sep=';', index=False)

In [4]:
# Prepare the data so that it can be used for the calculations

import numpy as np
import pandas as pd
import re

# Load the combined flare table, build a date column, convert the X-ray class
# and intensity into one numeric value per flare and store the result.

# Only the date parts and the two X-ray columns are needed here.
df_sf = pd.read_csv('solar_flare_new.csv', sep=';', low_memory=False,
                    usecols=['Year_long', 'Month', 'Day', 'X-ray Class', 'X-ray intensity'])

# Assemble one timestamp per row and put it in front of the other columns.
date = pd.to_datetime(dict(year=df_sf.Year_long, month=df_sf.Month, day=df_sf.Day))
df_sf.insert(0, "date", date, True)
df_sf.drop(columns=['Year_long', 'Month', 'Day'], inplace=True)

# Parse the intensity as a real numeric column. Going through str + strip
# works whether read_csv delivered strings or numbers; anything that is not a
# number becomes NaN instead of staying behind as a string.
intensity = pd.to_numeric(df_sf['X-ray intensity'].astype(str).str.strip(), errors='coerce')
# Only whole-number readings were accepted before (digit-only check), so
# fractional values are still treated as invalid.
intensity = intensity.where(intensity % 1 == 0)

# NOTE(review): the division by 10 presumably means the report stores the
# intensity in tenths - confirm against the column documentation.
df_sf['X-ray intensity'] = intensity / 10

# Rows without a date, class or usable intensity cannot be converted.
df_sf = df_sf.dropna()

# Multiplier per X-ray class letter; unknown letters map to NaN and are
# removed by the "> 0" filter below.
class_factor = {'A': 10 ** -8, 'B': 10 ** -7, 'C': 10 ** -6, 'M': 10 ** -5, 'X': 10 ** -4}
df_sf['XRay_class_nr1'] = df_sf['X-ray intensity'] * df_sf['X-ray Class'].map(class_factor)

# Remove all rows whose converted value is not positive (zero intensity,
# unknown class), then index the table by date.
df_sf = df_sf[df_sf.XRay_class_nr1 > 0].set_index('date')

# df_test refers to the same (date-indexed) frame as df_sf.
df_test = df_sf

df_sf.to_csv('solar_flare_prepared.csv', sep=';')

df_sf.head()

0.0028

In [4]:
# Worked examples of the calculation that converts the classes into numbers.
# Open question: what to do with A0 and similar entries?
# -> Either delete them or switch them to A1. The current tendency is to delete them.

# One value per class letter in the order A, B, C, M, X: the exponents run
# from -8 up to -4.
a1, b1, c1, m1, x1 = (10 ** exponent for exponent in range(-8, -3))
a22, b22, c22, m22, x22 = (2.2 * 10 ** exponent for exponent in range(-8, -3))
a99, b99, c99, m99, x99 = (9.9 * 10 ** exponent for exponent in range(-8, -3))

# carrington like event
x450 = 45 * 10 ** -4

# Print every example on its own line as "<label> <value>".
print('\n'.join(
    f'{label} {value}'
    for label, value in (
        ('A1:', a1), ('B1:', b1), ('C1:', c1), ('M1:', m1), ('X1:', x1),
        ('A2.2:', a22), ('B2.2:', b22), ('C2.2:', c22), ('M2.2:', m22), ('X2.2:', x22),
        ('A9.9:', a99), ('B9.9:', b99), ('C9.9:', c99), ('M9.9:', m99), ('X9.9:', x99),
        ('X45:', x450),
    )
))

A1: 1e-08
B1: 1e-07
C1: 1e-06
M1: 1e-05
X1: 0.0001
A2.2: 2.2000000000000002e-08
B2.2: 2.2e-07
C2.2: 2.2e-06
M2.2: 2.2000000000000003e-05
X2.2: 0.00022000000000000003
A9.9: 9.9e-08
B9.9: 9.9e-07
C9.9: 9.9e-06
M9.9: 9.900000000000001e-05
X9.9: 0.00099
X45: 0.0045000000000000005
