### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
cme_url = f'{base_url}{CME}?StartDate={startDate}&EndDate={endDate}&api_key={NASA_API_KEY}'

In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(cme_url)
cme_response

<Response [200]>

In [4]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()
cme_json

[{'activityID': '2025-02-13T01:36:00-CME-001',
  'catalog': 'M2M_CATALOG',
  'startTime': '2025-02-13T01:36Z',
  'instruments': [{'displayName': 'SOHO: LASCO/C2'},
   {'displayName': 'SOHO: LASCO/C3'}],
  'sourceLocation': '',
  'activeRegionNum': None,
  'note': 'Faint, narrow CME first seen in the NE by SOHO LASCO C2 beginning at 2025-02-13T01:36Z, as well as by SOHO LASCO C3 in later frames. This event is not visible in STEREO A COR2 due to a data gap from 2025-02-12T10:38Z to 2025-02-13T03:23Z. There is no clear source seen for this event aside from general field line movement along the NE limb seen in SDO AIA 193.',
  'submissionTime': '2025-02-13T14:09Z',
  'versionId': 1,
  'link': 'https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/37004/-1',
  'cmeAnalyses': [{'isMostAccurate': True,
    'time21_5': '2025-02-13T08:23Z',
    'latitude': 21.0,
    'longitude': None,
    'halfAngle': 11.0,
    'speed': 515.0,
    'type': 'C',
    'featureCode': 'LE',
    'imageType': 'running dif

In [5]:
# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json, indent=4))

[
    {
        "activityID": "2025-02-13T01:36:00-CME-001",
        "catalog": "M2M_CATALOG",
        "startTime": "2025-02-13T01:36Z",
        "instruments": [
            {
                "displayName": "SOHO: LASCO/C2"
            },
            {
                "displayName": "SOHO: LASCO/C3"
            }
        ],
        "sourceLocation": "",
        "activeRegionNum": null,
        "note": "Faint, narrow CME first seen in the NE by SOHO LASCO C2 beginning at 2025-02-13T01:36Z, as well as by SOHO LASCO C3 in later frames. This event is not visible in STEREO A COR2 due to a data gap from 2025-02-12T10:38Z to 2025-02-13T03:23Z. There is no clear source seen for this event aside from general field line movement along the NE limb seen in SDO AIA 193.",
        "submissionTime": "2025-02-13T14:09Z",
        "versionId": 1,
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/37004/-1",
        "cmeAnalyses": [
            {
                "isMostAccurate": true,
 

In [6]:
# Convert cme_json to a Pandas DataFrame 
df = pd.DataFrame(cme_json)
df

# Keep only the columns: activityID, startTime, linkedEvents
df= df[['activityID', 'startTime', 'linkedEvents']]
df.head()

#call the data base [ then pick columns [

Unnamed: 0,activityID,startTime,linkedEvents
0,2025-02-13T01:36:00-CME-001,2025-02-13T01:36Z,
1,2025-02-13T15:23:00-CME-001,2025-02-13T15:23Z,
2,2025-02-13T17:36:00-CME-001,2025-02-13T17:36Z,
3,2025-02-13T21:12:00-CME-001,2025-02-13T21:12Z,
4,2025-02-14T08:48:00-CME-001,2025-02-14T08:48Z,[{'activityID': '2025-02-14T07:31:00-FLR-001'}]


In [7]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
df_cleaned = df.dropna(subset="linkedEvents")
df_cleaned

Unnamed: 0,activityID,startTime,linkedEvents
4,2025-02-14T08:48:00-CME-001,2025-02-14T08:48Z,[{'activityID': '2025-02-14T07:31:00-FLR-001'}]
10,2025-02-15T01:36:00-CME-001,2025-02-15T01:36Z,[{'activityID': '2025-02-15T00:50:00-FLR-001'}]
13,2025-02-15T20:36:00-CME-001,2025-02-15T20:36Z,[{'activityID': '2025-02-15T19:22:00-FLR-001'}]
27,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,[{'activityID': '2025-02-19T23:23:00-FLR-001'}...
32,2025-02-20T10:36:00-CME-001,2025-02-20T10:36Z,[{'activityID': '2025-02-23T21:20:00-IPS-001'}]
44,2025-02-22T21:36:00-CME-001,2025-02-22T21:36Z,[{'activityID': '2025-02-22T20:55:00-FLR-001'}]
46,2025-02-23T02:36:00-CME-001,2025-02-23T02:36Z,[{'activityID': '2025-02-23T02:00:00-FLR-001'}]
50,2025-02-23T19:36:00-CME-001,2025-02-23T19:36Z,[{'activityID': '2025-02-23T19:22:00-FLR-001'}]
53,2025-02-24T07:00:00-CME-001,2025-02-24T07:00Z,[{'activityID': '2025-02-24T06:53:00-FLR-001'}...
55,2025-02-24T21:24:00-CME-001,2025-02-24T21:24Z,[{'activityID': '2025-02-24T21:50:00-FLR-001'}...


In [8]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 
# Initialize an empty list to store the expanded rows
# Iterate over each index in the DataFrame
# Iterate over each dictionary in the list
# # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
#Create a new DataFrame from the expanded rows
#ASK TA!!!!!!!!!!!!!!!!!!! - result is fine. Dont need function but it woks so its fine. 

expanded_rows = []
for index, row in df_cleaned.iterrows():
    activityID = row['activityID']
    startTime = row['startTime']
    linkedEvents = row['linkedEvents']
    for event in linkedEvents:
        expanded_rows.append({'activityID': activityID, 'startTime': startTime, 'linkedEvents': event})
    
df_expanded = pd.DataFrame(expanded_rows)
df_expanded.head()




Unnamed: 0,activityID,startTime,linkedEvents
0,2025-02-14T08:48:00-CME-001,2025-02-14T08:48Z,{'activityID': '2025-02-14T07:31:00-FLR-001'}
1,2025-02-15T01:36:00-CME-001,2025-02-15T01:36Z,{'activityID': '2025-02-15T00:50:00-FLR-001'}
2,2025-02-15T20:36:00-CME-001,2025-02-15T20:36Z,{'activityID': '2025-02-15T19:22:00-FLR-001'}
3,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-19T23:23:00-FLR-001'}
4,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-23T14:08:00-IPS-001'}


In [9]:
# Create a function called extract_activityID_from_dict that 
# takes a dict as input such as in linkedEvents
# and verify below that it works as expected using 
# one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
# Log the error or print it for debugging


# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents and extracts the value of that dict
def extract_activityID_from_dict(input_dict):
        try:
                activityID = input_dict.get('activityID', None)
                return activityID
        except (ValueError, TypeError) as e:
                print(f"Error processing input dictionary: {input_dict}. Error: {e}")
                return None

extract_activityID_from_dict(df_expanded.loc[0,'linkedEvents'])


'2025-02-14T07:31:00-FLR-001'

In [10]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
# Apply the function to the 'linkedEvents' column
# Verify the new column
# Define the function

df_expanded.loc[:, 'GST_ActivityID']= df_expanded['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
df_expanded.head()


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2025-02-14T08:48:00-CME-001,2025-02-14T08:48Z,{'activityID': '2025-02-14T07:31:00-FLR-001'},2025-02-14T07:31:00-FLR-001
1,2025-02-15T01:36:00-CME-001,2025-02-15T01:36Z,{'activityID': '2025-02-15T00:50:00-FLR-001'},2025-02-15T00:50:00-FLR-001
2,2025-02-15T20:36:00-CME-001,2025-02-15T20:36Z,{'activityID': '2025-02-15T19:22:00-FLR-001'},2025-02-15T19:22:00-FLR-001
3,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-19T23:23:00-FLR-001'},2025-02-19T23:23:00-FLR-001
4,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-23T14:08:00-IPS-001'},2025-02-23T14:08:00-IPS-001


In [11]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
df_expanded = df_expanded.dropna(subset="GST_ActivityID")
df_expanded.head()

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2025-02-14T08:48:00-CME-001,2025-02-14T08:48Z,{'activityID': '2025-02-14T07:31:00-FLR-001'},2025-02-14T07:31:00-FLR-001
1,2025-02-15T01:36:00-CME-001,2025-02-15T01:36Z,{'activityID': '2025-02-15T00:50:00-FLR-001'},2025-02-15T00:50:00-FLR-001
2,2025-02-15T20:36:00-CME-001,2025-02-15T20:36Z,{'activityID': '2025-02-15T19:22:00-FLR-001'},2025-02-15T19:22:00-FLR-001
3,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-19T23:23:00-FLR-001'},2025-02-19T23:23:00-FLR-001
4,2025-02-20T00:00:00-CME-001,2025-02-20T00:00Z,{'activityID': '2025-02-23T14:08:00-IPS-001'},2025-02-23T14:08:00-IPS-001


In [12]:
# print out the datatype of each column in this DataFrame:
df_expanded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      35 non-null     object
 1   startTime       35 non-null     object
 2   linkedEvents    35 non-null     object
 3   GST_ActivityID  35 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [13]:

# Convert the 'GST_ActivityID' column to string format 
df_expanded['GST_ActivityID'] = df_expanded['GST_ActivityID'].astype(str)

# Convert startTime to datetime format  
df_expanded["startTime"] = pd.to_datetime(df_expanded['startTime'])

# Rename startTime to startTime_CME and activityID to cmeID
df_expanded = df_expanded.rename(columns={
    'startTime': 'startTime_CME',
    'activityID': 'cmeID'
})

# Drop linkedEvents
df_expanded = df_expanded.drop(columns =['linkedEvents'])

# Verify that all steps were executed correctly
df_expanded



Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
0,2025-02-14T08:48:00-CME-001,2025-02-14 08:48:00+00:00,2025-02-14T07:31:00-FLR-001
1,2025-02-15T01:36:00-CME-001,2025-02-15 01:36:00+00:00,2025-02-15T00:50:00-FLR-001
2,2025-02-15T20:36:00-CME-001,2025-02-15 20:36:00+00:00,2025-02-15T19:22:00-FLR-001
3,2025-02-20T00:00:00-CME-001,2025-02-20 00:00:00+00:00,2025-02-19T23:23:00-FLR-001
4,2025-02-20T00:00:00-CME-001,2025-02-20 00:00:00+00:00,2025-02-23T14:08:00-IPS-001
5,2025-02-20T10:36:00-CME-001,2025-02-20 10:36:00+00:00,2025-02-23T21:20:00-IPS-001
6,2025-02-22T21:36:00-CME-001,2025-02-22 21:36:00+00:00,2025-02-22T20:55:00-FLR-001
7,2025-02-23T02:36:00-CME-001,2025-02-23 02:36:00+00:00,2025-02-23T02:00:00-FLR-001
8,2025-02-23T19:36:00-CME-001,2025-02-23 19:36:00+00:00,2025-02-23T19:22:00-FLR-001
9,2025-02-24T07:00:00-CME-001,2025-02-24 07:00:00+00:00,2025-02-24T06:53:00-FLR-001


In [14]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
#nameofcolumn.str.contains("whatever you're trying to match on")


df_new = df_expanded[df_expanded["GST_ActivityID"].str.contains("GST")] 
df_new

Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
11,2025-02-24T07:00:00-CME-001,2025-02-24 07:00:00+00:00,2025-02-27T09:00:00-GST-001


### GST Data

In [72]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for GSTs:
GST = "GST"

# Search for GSTs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
gst_url = f'{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}'
gst_url

'https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=xd6laDXZahJaqfGeG0teFyg5EjIWGziP7XsdnUjm'

In [73]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(gst_url)
gst_response.url

'https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=xd6laDXZahJaqfGeG0teFyg5EjIWGziP7XsdnUjm'

In [74]:
# Convert the response variable to json and store it as a variable named gst_json

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data

#why is the data not printing

gst_json = gst_response.json()
print(json.dumps(gst_json, indent=4))

[
    {
        "gstID": "2013-06-01T01:00:00-GST-001",
        "startTime": "2013-06-01T01:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-01T01:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
        "linkedEvents": [
            {
                "activityID": "2013-05-31T15:45:00-HSS-001"
            }
        ],
        "submissionTime": "2013-07-15T19:26Z",
        "versionId": 1
    },
    {
        "gstID": "2013-06-07T03:00:00-GST-001",
        "startTime": "2013-06-07T03:00Z",
        "allKpIndex": [
            {
                "observedTime": "2013-06-07T03:00Z",
                "kpIndex": 6.0,
                "source": "NOAA"
            }
        ],
        "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/330/-1",
        "linkedEvents": [
            {
                "activityID": "2013-06-02T20:24:

In [75]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: gstID, startTime, linkedEvents

df_gst = pd.DataFrame(gst_json)


df_gst= df_gst[['gstID', 'startTime', 'linkedEvents']]
df_gst.head()



Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
2,2013-06-29T03:00:00-GST-001,2013-06-29T03:00Z,
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...


In [76]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
df_gst = df_gst.dropna(subset=["linkedEvents"])
df_gst.head()


Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...
5,2014-02-19T03:00:00-GST-001,2014-02-19T03:00Z,[{'activityID': '2014-02-16T14:15:00-CME-001'}...


In [77]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
df_gst = df_gst.explode('linkedEvents', ignore_index=True).dropna()
df_gst.head()

Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,{'activityID': '2013-05-31T15:45:00-HSS-001'}
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'}
2,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'}
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T01:54:00-IPS-001'}
4,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T02:47:00-MPC-001'}


In [79]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:

df_gst.loc[:,'CME_ActivityID'] = df_gst['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
df_gst = df_gst.dropna(subset="CME_ActivityID")
df_gst.head()



Unnamed: 0,gstID,startTime,linkedEvents,CME_ActivityID


In [80]:
# Convert the 'CME_ActivityID' column to string format 
if 'CME_ActivityID' in df.columns:
    df['CME_ActivityID'] = df['CME_ActivityID'].astype(str)
else:
    print("Error: 'CME_ActivityID' column not found. Ensure it was created correctly.")

# Convert the 'gstID' column to string format 
if 'GST_ActivityID' in df.columns:
    df['GST_ActivityID'] = df['GST_ActivityID'].astype(str)
else:
    print("Error: 'GST_ActivityID' column not found. Ensure it was loaded correctly.")

# Convert startTime to datetime format  
if 'startTime' in df.columns:
    df['startTime'] = pd.to_datetime(df['startTime'], errors='coerce')
else:
    print("Error: 'startTime' column not found. Ensure it was loaded correctly.")

# Rename startTime to startTime_GST 
df.rename(columns={'startTime': 'startTime_GST'}, inplace=True)

# Drop linkedEvents
df.rename(columns={'startTime': 'startTime_GST'}, inplace=True)

# Verify that all steps were executed correctly
print(df.info()) 
print(df.head())

Error: 'GST_ActivityID' column not found. Ensure it was loaded correctly.
Error: 'startTime' column not found. Ensure it was loaded correctly.
<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 0 to 116
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   gstID           105 non-null    object             
 1   startTime_GST   105 non-null    datetime64[ns, UTC]
 2   linkedEvents    105 non-null    object             
 3   CME_ActivityID  105 non-null    object             
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 4.1+ KB
None
                         gstID             startTime_GST  \
0  2013-06-01T01:00:00-GST-001 2013-06-01 01:00:00+00:00   
1  2013-06-07T03:00:00-GST-001 2013-06-07 03:00:00+00:00   
3  2013-10-02T03:00:00-GST-001 2013-10-02 03:00:00+00:00   
4  2013-12-08T00:00:00-GST-001 2013-12-08 00:00:00+00:00   
5  2014-02-19T03:00:00-GST-001 2014-02-19 03:00

In [23]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [24]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [25]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [26]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
time_diff = startTime_GST - startTime_CME 


NameError: name 'startTime_GST' is not defined

In [66]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [85]:
# Export data to CSV without the index
df.to_csv('collected_data.csv', index=False)