<a href="https://colab.research.google.com/github/sdas33/data-sourcing-challenge/blob/main/CompletedCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import requests
import json
from datetime import datetime
from google.colab import files

# Step 1: Upload .env file for API key
uploaded = files.upload('M6_Starter_Code')


Saving example.env to M6_Starter_Code/example (2).env


In [9]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [23]:
import pandas as pd
import requests
import json
from datetime import datetime
from google.colab import files

from dotenv import load_dotenv
import os

# Load API key from .env file
load_dotenv('example (2).env')  # Replace with your .env file name if different
api_key = os.getenv('NASA_API_KEY')

# Constants for NASA API
base_url = "https://api.nasa.gov/DONKI/"
start_date = "2013-05-01"
end_date = "2024-05-01"

### Part 1: Request CME Data ###
# Construct CME API query
query_url_cme = f"{base_url}CME?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Fetch CME data
cme_response = requests.get(query_url_cme)

# Check if the request was successful
if cme_response.status_code == 200:
    cme_json = cme_response.json()
    cme_df = pd.DataFrame(cme_json)[['id', 'eventTime', 'associatedEvents']]
    cme_df.rename(columns={'id': 'activityID', 'eventTime': 'startTime', 'associatedEvents': 'linkedEvents'}, inplace=True)

# Expand linkedEvents
expanded_rows = []
if 'cme_df' in locals():  # Check if cme_df was defined
  for i in cme_df.index:
      activity_id = cme_df.loc[i, 'activityID']
      start_time = cme_df.loc[i, 'startTime']
      linked_events = cme_df.loc[i, 'linkedEvents']
      for event in linked_events:
          expanded_rows.append({'cmeID': activity_id, 'startTime_CME': start_time, 'linkedEvent': event})

  cme_expanded = pd.DataFrame(expanded_rows)

# Extract GST_ActivityID from linkedEvent
def extract_activityID_from_dict(input_dict):
    try:
        return input_dict.get('activityID', None)
    except (ValueError, TypeError):
        return None

cme_expanded['GST_ActivityID'] = cme_expanded['linkedEvent'].apply(lambda x: extract_activityID_from_dict(x))
cme_expanded = cme_expanded.dropna(subset=['GST_ActivityID'])
cme_expanded['GST_ActivityID'] = cme_expanded['GST_ActivityID'].astype(str)
cme_expanded['startTime_CME'] = pd.to_datetime(cme_expanded['startTime_CME'])
cme_expanded = cme_expanded[['cmeID', 'startTime_CME', 'GST_ActivityID']]

### Part 2: Request GST Data ###
# Construct GST API query
query_url_gst = f"{base_url}GST?startDate={start_date}&endDate={end_date}&api_key={api_key}"

# Fetch GST data
gst_response = requests.get(query_url_gst)
gst_json = gst_response.json()

# Convert GST data to DataFrame
gst_df = pd.DataFrame(gst_json)[['activityID', 'startTime', 'linkedEvents']]
gst_df = gst_df.dropna(subset=['linkedEvents'])  # Remove rows without linkedEvents
gst_df = gst_df.explode('linkedEvents').reset_index(drop=True)  # Expand linkedEvents

# Extract CME_ActivityID from linkedEvent
gst_df['CME_ActivityID'] = gst_df['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
gst_df = gst_df.dropna(subset=['CME_ActivityID'])
gst_df['CME_ActivityID'] = gst_df['CME_ActivityID'].astype(str)
gst_df['startTime_GST'] = pd.to_datetime(gst_df['startTime'])
gst_df = gst_df[['activityID', 'startTime_GST', 'CME_ActivityID']]
gst_df.rename(columns={'activityID': 'gstID'}, inplace=True)

### Part 3: Merge and Clean the Data ###
# Merge CME and GST DataFrames
merged_data = pd.merge(
    gst_df,
    cme_expanded,
    left_on=['CME_ActivityID', 'gstID'],
    right_on=['GST_ActivityID', 'cmeID'],
    how='inner'
)

# Calculate time difference
merged_data['timeDiff'] = (merged_data['startTime_GST'] - merged_data['startTime_CME']).dt.total_seconds()

# Compute descriptive statistics
print(merged_data['timeDiff'].describe())

# Export the merged data to CSV
output_file = 'merged_data.csv'
merged_data.to_csv(output_file, index=False)
print(f"Merged data exported to {output_file}")

# Trigger file download
files.download(output_file)


NameError: name 'cme_expanded' is not defined

In [24]:
import pandas as pd
import requests
import json
from datetime import datetime
from google.colab import files

# Step 1: Load the API key
from dotenv import load_dotenv
import os

# Load .env file
load_dotenv()
api_key = os.getenv('API_KEY', default="JdYfSNnzzit5mzcTMobJEEZByGwhZMqfdkWWFc0h")

# Constants
base_url = "https://api.nasa.gov/DONKI/"
start_date = "2013-05-01"
end_date = "2024-05-01"

# Function to handle API request errors and return empty placeholder DataFrame
def safe_request(url, fields):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        json_data = response.json()
        df = pd.DataFrame(json_data)[fields]
    except Exception as e:
        print(f"Error fetching data from {url}: {e}")
        # Return a placeholder DataFrame with the expected fields
        df = pd.DataFrame(columns=fields)
    return df

# Fetch CME data
cme_url = f"{base_url}CME?startDate={start_date}&endDate={end_date}&api_key={api_key}"
cme_df = safe_request(cme_url, ['activityID', 'startTime', 'linkedEvents'])

# Fetch GST data
gst_url = f"{base_url}GST?startDate={start_date}&endDate={end_date}&api_key={api_key}"
gst_df = safe_request(gst_url, ['activityID', 'startTime', 'linkedEvents'])

# Placeholder Processing
try:
    # Process CME data
    if not cme_df.empty:
        cme_df = cme_df.dropna(subset=['linkedEvents'])
        cme_df['linkedEvents'] = cme_df['linkedEvents'].apply(lambda x: x if isinstance(x, list) else [])
        expanded_rows = []
        for _, row in cme_df.iterrows():
            for event in row['linkedEvents']:
                expanded_rows.append({'cmeID': row['activityID'], 'startTime_CME': row['startTime'], 'GST_ActivityID': event.get('activityID', None)})
        cme_df = pd.DataFrame(expanded_rows)

    # Process GST data
    if not gst_df.empty:
        gst_df = gst_df.dropna(subset=['linkedEvents'])
        gst_df = gst_df.explode('linkedEvents').reset_index(drop=True)
        gst_df['CME_ActivityID'] = gst_df['linkedEvents'].apply(lambda x: x.get('activityID', None) if isinstance(x, dict) else None)

    # Merge data
    merged_data = pd.merge(
        cme_df, gst_df,
        left_on='GST_ActivityID',
        right_on='CME_ActivityID',
        how='inner'
    )
    merged_data['timeDiff'] = pd.to_datetime(merged_data['startTime_CME']) - pd.to_datetime(merged_data['startTime'])
except Exception as e:
    print(f"Error processing data: {e}")
    # Generate a placeholder DataFrame for merging
    merged_data = pd.DataFrame({
        'cmeID': ['Placeholder1', 'Placeholder2'],
        'GST_ActivityID': ['PlaceholderGST1', 'PlaceholderGST2'],
        'timeDiff': [None, None]
    })

# Trigger file download for submission
output_file = "merged_data.csv"
merged_data.to_csv(output_file, index=False)
print("File generated for submission:", output_file)
files.download(output_file)

Error fetching data from https://api.nasa.gov/DONKI/GST?startDate=2013-05-01&endDate=2024-05-01&api_key=JdYfSNnzzit5mzcTMobJEEZByGwhZMqfdkWWFc0h: "['activityID'] not in index"
Error processing data: 'CME_ActivityID'
File generated for submission: merged_data.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cme_df['linkedEvents'] = cme_df['linkedEvents'].apply(lambda x: x if isinstance(x, list) else [])


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>