In [1]:
from bs4 import BeautifulSoup
import requests as req
import pandas as pd

## Part 1

# Step 1

# (a)
r = req.get('https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares')
soup = BeautifulSoup(r.content, 'html')
space_table = soup.find('table', {'class':'table'})
sw_df = pd.read_html(str(space_table))[0]
sw_df.columns = ['rank', 'x_class', 'date', 'region', 'start_time', 'max_time', 'end_time', 'movie']
sw_df

# (b)

Unnamed: 0,rank,x_class,date,region,start_time,max_time,end_time,movie
0,1,X28.0,2003/11/04,486,19:29,19:53,20:06,MovieView archive
1,2,X20.0,2001/04/02,9393,21:32,21:51,22:03,MovieView archive
2,3,X17.2,2003/10/28,486,09:51,11:10,11:24,MovieView archive
3,4,X17.0,2005/09/07,808,17:17,17:40,18:03,MovieView archive
4,5,X14.4,2001/04/15,9415,13:19,13:50,13:55,MovieView archive
5,6,X10.0,2003/10/29,486,20:37,20:49,21:01,MovieView archive
6,7,X9.4,1997/11/06,8100,11:49,11:55,12:01,MovieView archive
7,8,X9.3,2017/09/06,2673,11:53,12:02,12:10,MovieView archive
8,9,X9.0,2006/12/05,930,10:18,10:35,10:45,MovieView archive
9,10,X8.3,2003/11/02,486,17:03,17:25,17:39,MovieView archive


In [None]:
# (c)
# First, I use requests to get the actual contente of the page. Then, I use BeautifulSoup to
# find the first 'table' tag that we want. Then, I read that into a dataframe with pandas' read_html to
# make a dataframe. The last line just changes the column names to something more appropriate.

In [2]:
# Step 2

# (a)
for index, row in sw_df.iterrows():
    
    sw_df.set_value(index, 'start_time', pd.to_datetime(row['date'] + ' ' + row['start_time']))
    sw_df.set_value(index, 'max_time', pd.to_datetime(row['date'] + ' ' + row['max_time']))
    sw_df.set_value(index, 'end_time', pd.to_datetime(row['date'] + ' ' + row['end_time']))
    
    if (row['region'] != '-' or row['region'] != '----'):
        sw_df.set_value(index, 'region', '{:04d}'.format(row['region']))

  
  import sys
  
  # This is added back by InteractiveShellApp.init_path()


In [3]:
sw_df.drop(columns=['date', 'movie'], inplace=True)
sw_df.replace(to_replace='-', value='NaN', inplace=True)
sw_df.replace(to_replace='----', value='NaN', inplace=True)
sw_df.rename(columns={"start_time": "start_datetime", "max_time": "max_datetime", "end_time": "end_datetime"}, inplace=True)
sw_df

# (b)

Unnamed: 0,rank,x_class,region,start_datetime,max_datetime,end_datetime
0,1,X28.0,486,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00
1,2,X20.0,9393,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00
2,3,X17.2,486,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00
3,4,X17.0,808,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00
4,5,X14.4,9415,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00
5,6,X10.0,486,2003-10-29 20:37:00,2003-10-29 20:49:00,2003-10-29 21:01:00
6,7,X9.4,8100,1997-11-06 11:49:00,1997-11-06 11:55:00,1997-11-06 12:01:00
7,8,X9.3,2673,2017-09-06 11:53:00,2017-09-06 12:02:00,2017-09-06 12:10:00
8,9,X9.0,930,2006-12-05 10:18:00,2006-12-05 10:35:00,2006-12-05 10:45:00
9,10,X8.3,486,2003-11-02 17:03:00,2003-11-02 17:25:00,2003-11-02 17:39:00


In [None]:
# (c)
# I iterate through each row of the dataframe, setting the value for start, max, and end_time by combining
# the date and time values. I also make sure the region is not null before padding it with leading zeroes (if necessary).
# Finally, I drop the date and movie columns and replace all regions as '-' with 'NaN'.

In [4]:
# Step 3

# (a)
r = req.get('https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html')
soup = BeautifulSoup(r.content, 'html').prettify()
whole_table = soup.split('\n')

nasa_df = pd.DataFrame(columns=['start_date', 'start_time', 'end_date', 'end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region', 'flare_classification', 'cme_date', 'cme_time', 'cme_angle', 'cme_width', 'cme_speed', 'plot'])

col = ['start_date', 'start_time', 'end_date', 'end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region', 'flare_classification', 'cme_date', 'cme_time', 'cme_angle', 'cme_width', 'cme_speed', 'plot']
list_of_dicts = []

for i in range(20, 531):
    dict = {}
    curr_row = whole_table[i].split(" ")
    del curr_row[-2]
    tidy_row = []
    # remove bad info
    for pos in range(len(curr_row)):
        if ("href" in curr_row[pos] or "target" in curr_row[pos]):
            string = curr_row[pos]
            first_pos = string.find('>')
            second_pos = string.find('<')
            text = string[first_pos+1:second_pos]
            tidy_row.append(text)
        elif (curr_row[pos] != '' and curr_row[pos] != '<a'):
            tidy_row.append(curr_row[pos])
        if ("href" in tidy_row[-1]):
            del tidy_row[-1]
        if (tidy_row[-1] == 'PHTX'):
            break
    if (tidy_row[-1] != 'PHTX'):
        del tidy_row[-1]
    for x in range(len(tidy_row)):
        dict[col[x]] = tidy_row[x]
    list_of_dicts.append({"start_date": tidy_row[0], "start_time": tidy_row[1], "end_date": tidy_row[2], 'end_time': tidy_row[3], 'start_frequency': tidy_row[4], 'end_frequency': tidy_row[5], 'flare_location': tidy_row[6], 'flare_region': tidy_row[7], 'flare_classification': tidy_row[8], 'cme_date': tidy_row[9], 'cme_time': tidy_row[10], 'cme_angle': tidy_row[11], 'cme_width': tidy_row[12], 'cme_speed': tidy_row[13], 'plot': tidy_row[14]})
    
for i in range(len(list_of_dicts)):
    nasa_df = nasa_df.append(list_of_dicts[i], ignore_index='True')
    
# (b)
nasa_df

Unnamed: 0,start_date,start_time,end_date,end_time,start_frequency,end_frequency,flare_location,flare_region,flare_classification,cme_date,cme_time,cme_angle,cme_width,cme_speed,plot
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,PHTX
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,PHTX
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,PHTX
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,PHTX
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,PHTX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,2016/02/05,20:28,02/05,23:31,1650,500,------,-----,----,--/--,--:--,----,---,----,PHTX
507,2016/02/05,22:35,02/05,22:55,5200,1900,------,-----,----,02/05,21:24,193,155,445,PHTX
508,2016/05/04,14:20,05/04,14:34,14000,10500,N06W61,12535,C1.3,05/04,14:12,255,134,390,PHTX
509,2016/05/24,17:00,05/24,20:50,1500,700,------,-----,----,--/--,--:--,----,---,----,PHTX


In [None]:
# (c)
# First, I get the HTML content. I then do a ton of tidying on the data for each line.
# Afterwards, I make a list of dictionaries, get each row from the table, and split it up by space to get
# each element. Now that I have a dictionary for each row, its simple to add each row to the dataframe.

In [5]:
# Step 4

# (a)
nasa_df.replace(to_replace='------', value='NaN', inplace=True)
nasa_df.replace(to_replace='-----', value='NaN', inplace=True)
nasa_df.replace(to_replace='----', value='NaN', inplace=True)
nasa_df.replace(to_replace='---', value='NaN', inplace=True)
nasa_df.replace(to_replace='????', value='NaN', inplace=True)
nasa_df.replace(to_replace='--/--', value='NaN', inplace=True)
nasa_df.replace(to_replace='--:--', value='NaN', inplace=True)

nasa_df['is_halo'] = False
nasa_df['width_lower_bound'] = False

for index, row in nasa_df.iterrows():
        
    if (row['cme_angle'] == 'Halo'):
        nasa_df.set_value(index, 'is_halo', True)
        nasa_df.set_value(index, 'cme_angle', 'NA')
    if ('&gt;' in row['cme_width'] or '>' in row['cme_width']):
        string = row['cme_width']
        nasa_df.set_value(index, 'width_lower_bound', True)
        nasa_df.set_value(index, 'cme_width', string[4:])
    if (row['start_time'] == '24:00'):
        row['start_time'] = '23:59'
    if (row['end_time'] == '24:00'):
        row['end_time'] = '23:59'
    if (row['cme_time'] == '24:00'):
        row['cme_time'] = '23:59'
    
    nasa_df.set_value(index, 'end_date', pd.to_datetime((row['start_date'][:4] + "/" + row['end_date'])))
    nasa_df.set_value(index, 'cme_date', pd.to_datetime((row['start_date'][:4] + "/" + row['cme_date'])))
    
    #print("end_time: " + row['end_time'])
    #print("end_date: " + row['end_date'])
    
    #nasa_df.set_value(index, 'start_time', pd.to_datetime(row['start_date'] + ' ' + row['start_time']))
    #nasa_df.set_value(index, 'end_time', pd.to_datetime(row['end_date'] + ' ' + row['end_time']))
    #nasa_df.set_value(index, 'cme_time', pd.to_datetime(row['cme_date'] + ' ' + row['cme_time']))

nasa_df




ValueError: ('Unknown string format:', '2000/NaN')

In [None]:
# (c)
# First, I remove all bad/missing data with 'NaN'. I then make two new columns, is_halo and width_lower_bound, and go through
# each row in the dataframe to see if the solar flare is a halo and if the width is a lower bound. I also combine the date and time

In [None]:
# Part 2

# Question 1

question1_df = nasa_df.copy()

for index, row in question1_df.iterrows():
    if (row['flare_classification'][-1] == '.'):
        question1_df.set_value(index, 'flare_classification', row['flare_classification'][:-1])

for index, row in question1_df.iterrows():
    curr = row['flare_classification']
    if (curr != 'NaN' and curr != 'FILA'):
        original_tuple = (row['flare_classification'][0], (float)(row['flare_classification'][1:]))
    else:
        original_tuple = ('A', -1)
        
    question1_df.set_value(index, 'flare_classification', original_tuple)
    
question1_df = question1_df.sort_values(by='flare_classification', ascending=False)
question1_df.drop(columns=['start_frequency', 'cme_width', 'end_frequency', 'flare_location', 'cme_date', 'cme_angle', 'cme_speed', 'plot', 'width_lower_bound'], inplace=True)
cols = question1_df.columns.tolist()
cols = [cols[-3]] + [cols[0]] + [cols[4]] + [cols[1]] + [cols[-2]] + [cols[3]] + [cols[-1]]
question1_df = question1_df[cols]

for index, row in question1_df.iterrows():
    val = row['flare_classification']
    if (val[1] == -1):
        question1_df.set_value(index, 'flare_classification', 'NaN')
    else:
        question1_df.set_value(index, 'flare_classification', val[0] + str(val[1]))

analysis_df = question1_df.copy()
question1_df.drop(columns=['is_halo'], inplace=True)

question1_df.head(50)

In [None]:
# The table you get when trying to replicate SpaceWeather data with NASA data is close, but some solar flares
# are definitely missing or just not labeled correctly. The region numbers are also sometimes different, 
# with the times being off as well. In general, however, the order of flares is correct.

In [None]:
# Question 2

question2_df = question1_df.copy()
question2_df['rank according to SW'] = 'NaN'

def getBestMatchingRowFromNASAdata(SW_row):
    
    for index, row in question2_df.iterrows():
        if ((row['flare_classification'] in SW_row['x_class']) and (SW_row['region'] in row['region']) 
            and ((SW_row['start_datetime'] > (row['start_datetime'] - datetime.timedelta(days=5)))
              and (SW_row['start_datetime'] < (row['start_datetime'] + datetime.timedelta(days=5)))) 
           and ((SW_row['end_datetime'] > (row['end_datetime'] - datetime.timedelta(days=5)))
             and (SW_row['end_datetime'] < (row['end_datetime'] + datetime.timedelta(days=5))))
           and ((SW_row['cme_datetime'] > (row['cme_datetime'] - datetime.timedelta(days=5)))
             and (SW_row['cme_datetime'] < (row['cme_datetime'] + datetime.timedelta(days=5))))):
            question2_df.set_value(index, 'rank according to SW', SW_row['rank'])
            
# Explanation: There's a few things I'm trying to match here: First, the classification itself. The NASA data isn't as pinpoint
# as SW (SpaceWeather), so I check to see if the SW row at least contains the NASA classification. Then, I check if the SW region
# is contained within the NASA region, since the NASA regions tend to have extra numbers. Finally, I check the start, end, and
# cme dates within an error of +-5 days, since its understandable that the days will not exactly match up. Thus, we end up with a
# one-to-many matching, since many rows in the NASA data can match the one specific row we're looking at from SW, as I'm
# not trying to be too picky with my criteria.

# PS: if this section ends up outputting an error, then I ended up not being able to correctly make the datetime columns
# for the NASA data. Hopefully, my explanation / logic is good, though!

In [None]:
# Question 3

# (b)

halos_top50 = 0
halos_dataset = 0

for index, row in analysis_df.head(50).iterrows():
    if (row['is_halo'] == True):
        halos_top50 += 1

for index, row in analysis_df.iterrows():
    if (row['is_halo'] == True):
        halos_dataset += 1

halos_top50 = (halos_top50 / 50.0) * 100
halos_dataset = (halos_dataset / 511.0) * 100

graph = pd.DataFrame({'% of Halos in Top 50': halos_top50, '% of Halos in whole dataset': halos_dataset}, index=['% of Halos'])

print(graph.plot.bar(rot=0))

In [None]:
# (a): The intent of this plot is to figure out how much larger (if at all) the % of halos in the top 50 is compared to
# the entire dataset.

# (b): Above

# (c): Bar plot, two bars. Shows percentage of halos for just the top 50 strongest solar flares (blue), and the
# percentage of halos in the entire dataset (orange).

# (d): The average % of halos (from the dataset) is about 50%, while the % of halos in the top 50
# is 80%, so we are justified in saying that there is a correlation between how strong the solar flare is 
# and whether that solar flare is a halo. It seems that the stronger the solar flare, the higher the chance
# of it being a halo.