### Import Required Libraries and Set Up Environment Variables

In [287]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [288]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
cme_url = f'{base_url}{CME}?StartDate={startDate}&EndDate={endDate}&api_key={NASA_API_KEY}'

In [289]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(cme_url)
cme_response

<Response [200]>

In [290]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()
cme_json

[{'activityID': '2024-10-26T06:48:00-CME-001',
  'catalog': 'M2M_CATALOG',
  'startTime': '2024-10-26T06:48Z',
  'instruments': [{'displayName': 'SOHO: LASCO/C2'},
   {'displayName': 'SOHO: LASCO/C3'},
   {'displayName': 'STEREO A: SECCHI/COR2'}],
  'sourceLocation': 'S16E60',
  'activeRegionNum': 13873,
  'note': 'This CME is visible as a partial halo directed towards the southeast in SOHO LASCO C2/C3 and STEREO A COR2 imagery. The source is an eruption which is associated with an M9.5 flare from Active Region 13873 (S14E56) and X1.8 flare from Active Region 13873 (S16E60) starting around 2024-10-26T06:30Z as seen in SDO AIA 131, 171, 193, 304 imagery. A wide opening of field lines is visible over the southeast limb in SDO AIA 171 and 193, with post eruptive arcades visible in SDO AIA 131, 171, 193, 304 starting around 07:50Z.',
  'submissionTime': '2024-10-30T14:51Z',
  'versionId': 3,
  'link': 'https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/34212/-1',
  'cmeAnalyses': [{'isMost

In [291]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json, indent=4))

[
    {
        "activityID": "2024-10-26T06:48:00-CME-001",
        "catalog": "M2M_CATALOG",
        "startTime": "2024-10-26T06:48Z",
        "instruments": [
            {
                "displayName": "SOHO: LASCO/C2"
            },
            {
                "displayName": "SOHO: LASCO/C3"
            },
            {
                "displayName": "STEREO A: SECCHI/COR2"
            }
        ],
        "sourceLocation": "S16E60",
        "activeRegionNum": 13873,
        "note": "This CME is visible as a partial halo directed towards the southeast in SOHO LASCO C2/C3 and STEREO A COR2 imagery. The source is an eruption which is associated with an M9.5 flare from Active Region 13873 (S14E56) and X1.8 flare from Active Region 13873 (S16E60) starting around 2024-10-26T06:30Z as seen in SDO AIA 131, 171, 193, 304 imagery. A wide opening of field lines is visible over the southeast limb in SDO AIA 171 and 193, with post eruptive arcades visible in SDO AIA 131, 171, 193, 304 star

In [292]:
# Convert cme_json to a Pandas DataFrame 
df = pd.DataFrame(cme_json)
df

# Keep only the columns: activityID, startTime, linkedEvents
df= df[['activityID', 'startTime', 'linkedEvents']]
df.head()

#call the data base [ then pick columns [

Unnamed: 0,activityID,startTime,linkedEvents
0,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,[{'activityID': '2024-10-26T05:57:00-FLR-001'}...
1,2024-10-26T08:09:00-CME-001,2024-10-26T08:09Z,
2,2024-10-26T08:12:00-CME-001,2024-10-26T08:12Z,
3,2024-10-26T12:23:00-CME-001,2024-10-26T12:23Z,
4,2024-10-26T17:38:00-CME-001,2024-10-26T17:38Z,


In [293]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
df_cleaned = df.dropna(subset="linkedEvents")
df_cleaned

Unnamed: 0,activityID,startTime,linkedEvents
0,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,[{'activityID': '2024-10-26T05:57:00-FLR-001'}...
12,2024-10-29T13:23:00-CME-001,2024-10-29T13:23Z,[{'activityID': '2024-10-31T19:28:00-IPS-001'}...
32,2024-10-31T13:48:00-CME-001,2024-10-31T13:48Z,[{'activityID': '2024-11-03T16:47:00-IPS-001'}...
34,2024-10-31T22:36:00-CME-001,2024-10-31T22:36Z,[{'activityID': '2024-10-31T21:12:00-FLR-001'}]
40,2024-11-03T15:27:00-CME-001,2024-11-03T15:27Z,[{'activityID': '2024-11-03T15:13:00-FLR-001'}]
42,2024-11-04T01:36:00-CME-001,2024-11-04T01:36Z,[{'activityID': '2024-11-04T01:05:00-FLR-001'}...
43,2024-11-04T13:23:00-CME-001,2024-11-04T13:23Z,[{'activityID': '2024-11-04T11:46:00-FLR-001'}]
44,2024-11-04T17:24:00-CME-001,2024-11-04T17:24Z,[{'activityID': '2024-11-04T15:26:00-FLR-001'}...
45,2024-11-05T00:09:00-CME-001,2024-11-05T00:09Z,[{'activityID': '2024-11-07T14:29:00-IPS-001'}]
46,2024-11-05T07:36:00-CME-001,2024-11-05T07:36Z,[{'activityID': '2024-11-05T06:35:00-FLR-001'}]


In [294]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 
# Initialize an empty list to store the expanded rows
# Iterate over each index in the DataFrame
# Iterate over each dictionary in the list
# # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
#Create a new DataFrame from the expanded rows
#ASK TA!!!!!!!!!!!!!!!!!!! - result is fine. Dont need function but it woks so its fine. 

expanded_rows = []
for index, row in df_cleaned.iterrows():
    activityID = row['activityID']
    startTime = row['startTime']
    linkedEvents = row['linkedEvents']
    for event in linkedEvents:
        expanded_rows.append({'activityID': activityID, 'startTime': startTime, 'linkedEvents': event})
    
df_expanded = pd.DataFrame(expanded_rows)
df_expanded.head()




Unnamed: 0,activityID,startTime,linkedEvents
0,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T05:57:00-FLR-001'}
1,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T06:32:00-FLR-001'}
2,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T16:45:00-SEP-001'}
3,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:05:00-SEP-001'}
4,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:57:00-SEP-001'}


In [295]:
# Create a function called extract_activityID_from_dict that 
# takes a dict as input such as in linkedEvents
# and verify below that it works as expected using 
# one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
# Log the error or print it for debugging


# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents and extracts the value of that dict
def extract_activityID_from_dict(input_dict):
        try:
                activityID = input_dict.get('activityID', None)
                return activityID
        except (ValueError, TypeError) as e:
                print(f"Error processing input dictionary: {input_dict}. Error: {e}")
                return None

extract_activityID_from_dict(df_expanded.loc[0,'linkedEvents'])


'2024-10-26T05:57:00-FLR-001'

In [296]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
# Apply the function to the 'linkedEvents' column
# Verify the new column
# Define the function

df_expanded.loc[:, 'GST_ActivityID']= df_expanded['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
df_expanded.head()


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T05:57:00-FLR-001'},2024-10-26T05:57:00-FLR-001
1,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T06:32:00-FLR-001'},2024-10-26T06:32:00-FLR-001
2,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T16:45:00-SEP-001'},2024-10-26T16:45:00-SEP-001
3,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:05:00-SEP-001'},2024-10-26T18:05:00-SEP-001
4,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:57:00-SEP-001'},2024-10-26T18:57:00-SEP-001


In [297]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
df_expanded = df_expanded.dropna(subset="GST_ActivityID")
df_expanded.head()

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T05:57:00-FLR-001'},2024-10-26T05:57:00-FLR-001
1,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T06:32:00-FLR-001'},2024-10-26T06:32:00-FLR-001
2,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T16:45:00-SEP-001'},2024-10-26T16:45:00-SEP-001
3,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:05:00-SEP-001'},2024-10-26T18:05:00-SEP-001
4,2024-10-26T06:48:00-CME-001,2024-10-26T06:48Z,{'activityID': '2024-10-26T18:57:00-SEP-001'},2024-10-26T18:57:00-SEP-001


In [298]:
# print out the datatype of each column in this DataFrame:
df_expanded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      51 non-null     object
 1   startTime       51 non-null     object
 2   linkedEvents    51 non-null     object
 3   GST_ActivityID  51 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


In [299]:
#ASK!!!!!!!!!!!!!!!!!!!!!! #I only have 50 rows of Data...

# Convert the 'GST_ActivityID' column to string format 
df_expanded['GST_ActivityID'] = df_expanded['GST_ActivityID'].astype(str)

# Convert startTime to datetime format  
df_expanded["startTime"] = pd.to_datetime(df_expanded['startTime'])

# Rename startTime to startTime_CME and activityID to cmeID
df_expanded = df_expanded.rename(columns={
    'startTime': 'startTime_CME',
    'activityID': 'cmeID'
})

# Drop linkedEvents
df_expanded = df_expanded.drop(columns =['linkedEvents'])

# Verify that all steps were executed correctly
df_expanded



Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
0,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T05:57:00-FLR-001
1,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T06:32:00-FLR-001
2,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T16:45:00-SEP-001
3,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T18:05:00-SEP-001
4,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T18:57:00-SEP-001
5,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T19:02:00-SEP-001
6,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T19:50:00-SEP-001
7,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T20:06:00-SEP-001
8,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-26T21:43:00-SEP-001
9,2024-10-26T06:48:00-CME-001,2024-10-26 06:48:00+00:00,2024-10-28T04:13:00-IPS-001


In [300]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
#nameofcolumn.str.contains("whatever you're trying to match on")
#ASKKKKKKKKKKK

df_expanded[df_expanded["GST_ActivityID"].str.contains("GST")] 


Unnamed: 0,cmeID,startTime_CME,GST_ActivityID


### GST Data

In [301]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for GSTs:
GST = "GST"

# Search for GSTs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
gst_url = f'{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}'
gst_url

'https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=xd6laDXZahJaqfGeG0teFyg5EjIWGziP7XsdnUjm'

In [302]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)
gst_response.url

'https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=xd6laDXZahJaqfGeG0teFyg5EjIWGziP7XsdnUjm'

In [303]:
# Convert the response variable to json and store it as a variable named gst_json

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data

#why is the data not printing

gst_json = gst_response.json()
print(json.dumps(gst_json, indent=4))

[
    {
        "gstID": "2013-06-01T01:00:00-GST-001",
        "startTime": "2013-06-01T01:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-01T01:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
        "linkedEvents": [
            {
                "activityID": "2013-05-31T15:45:00-HSS-001"
            }
        ],
        "submissionTime": "2013-07-15T19:26Z",
        "versionId": 1
    },
    {
        "gstID": "2013-06-07T03:00:00-GST-001",
        "startTime": "2013-06-07T03:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-07T03:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/330/-1",
        "linkedEvents": [
            {
                "activityID": "2013-06-02T20:24:

In [304]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: gstID, startTime, linkedEvents

df = pd.DataFrame(gst_json)
df

df= df[['gstID', 'startTime', 'linkedEvents']]
df.head()



Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
2,2013-06-29T03:00:00-GST-001,2013-06-29T03:00Z,
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...


In [305]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
df = df.dropna(subset=["linkedEvents"])
df.head()


Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...
5,2014-02-19T03:00:00-GST-001,2014-02-19T03:00Z,[{'activityID': '2014-02-16T14:15:00-CME-001'}...


In [306]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
df_explode = df.explode('linkedEvents', ignore_index=True).dropna()
df_explode.head()

Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,{'activityID': '2013-05-31T15:45:00-HSS-001'}
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'}
2,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'}
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T01:54:00-IPS-001'}
4,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T02:47:00-MPC-001'}


In [307]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [308]:
# Convert the 'CME_ActivityID' column to string format 
df['CME_ActivityID'] = df['CME_ActivityID'].astype(str)

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


KeyError: 'CME_ActivityID'

In [62]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [63]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [64]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [65]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
time_diff = startTime_GST - startTime_CME 


In [66]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [85]:
# Export data to CSV without the index
df.to_csv('collected_data.csv', index=False)