In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from tensorflow.keras import layers, models

# Step 1: Data Loading and Preprocessing

def load_and_preprocess_plasmag(plasmag_path):
    plasmag_data = pd.read_csv(plasmag_path, nrows=10000)
    plasmag_data[plasmag_data.columns[0]] = pd.to_datetime(plasmag_data[plasmag_data.columns[0]])

    plasmag_data['DD'] = plasmag_data[plasmag_data.columns[0]].dt.day
    plasmag_data = plasmag_data.fillna(method='ffill')

    return plasmag_data

def load_kp_values(kp_path):
    kp_data = pd.read_csv(kp_path, delimiter='\s+', skiprows=1, names=['Date', 'Kp'], nrows=10000)
    kp_data = kp_data.fillna(method='ffill')

    return kp_data

# Specify file paths for each year
plasmag_paths = [
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2017_v01.csv',
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2018_v01.csv',
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2019_v01.csv',
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2020_v01.csv',
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2021_v01.csv',
    '/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2023_v01.csv',
]
kp_path = '/content/YYY-MM-DD-hh-h-hh-m-days.csv'

# Load and preprocess PlasMAG data for each year
plasmag_data_list = [load_and_preprocess_plasmag(path) for path in plasmag_paths]

# Load and preprocess Kp values
kp_data = load_kp_values(kp_path)# Add a 'Date' column to both dataframes

# Merge dataframes based on the 'Date' column
merged_data = pd.merge(plasmag_data_list, kp_data, on='DD', how='inner')

# Step 2: Feature Engineering

# Investigate and select relevant features
selected_features = [
    # Add your features based on data exploration and domain knowledge
    'days', 'days_m', 'D'
]

# Extract features and target variable
X = merged_data[selected_features]
y = merged_data['Kp']

# Step 3: Machine Learning Model

# Preprocessing
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')
X_scaled = scaler.fit_transform(imputer.fit_transform(X))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define a simple neural network model
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Step 4: Evaluation

# Evaluate the model on the test set
predictions = model.predict(X_test).flatten()

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed

In [34]:
plasmag_data_list

[     2017-01-01 00:00:00  -4.17863  -4.51183  -3.52434    0  0.1  0.2  0.3  \
 0    2017-01-01 00:01:00  -6.06788 -0.379552 -3.497080  0.0  0.0  0.0  0.0   
 1    2017-01-01 00:02:00  -5.83430 -3.039940 -2.985460  0.0  0.0  0.0  0.0   
 2    2017-01-01 00:03:00  -5.87318 -3.352910 -2.774760  0.0  0.0  0.0  0.0   
 3    2017-01-01 00:04:00  -5.81431 -3.305190 -2.598700  0.0  0.0  0.0  0.0   
 4    2017-01-01 00:05:00  -6.23986 -3.240720 -1.010480  0.0  0.0  0.0  0.0   
 ...                  ...       ...       ...       ...  ...  ...  ...  ...   
 9995 2017-01-07 22:36:00   3.86854 -3.295860  0.622848  0.0  0.0  0.0  0.0   
 9996 2017-01-07 22:37:00   3.79149 -3.415180  0.454364  0.0  0.0  0.0  0.0   
 9997 2017-01-07 22:38:00   3.57554 -3.569090  0.444149  0.0  0.0  0.0  0.0   
 9998 2017-01-07 22:39:00   3.65066 -3.426430  0.700379  0.0  0.0  0.0  0.0   
 9999 2017-01-07 22:40:00   3.78359 -3.094890  1.036740  0.0  0.0  0.0  0.0   
 
       0.4  0.5  ...     0.22  0.23  0.24  0.25  0

In [19]:
df = pd.read_csv("/content/drive/MyDrive/aurora/dsc_fc_summed_spectra_2021_v01.csv")

In [20]:
df.columns

Index(['2021-01-01 00:00:00', '-0.948315', '-1.29987', '2.52648', '35.6484',
       '0.231726', '42.2461', '39.7696', '55.6658', '51.0384', '81.4331',
       '67.0912', '158.009', '123.869', '276.981', '336.819', '450.145',
       '502.759', '460.145', '355.659', '382.048', '324.012', '408.553',
       '415.635', '349.366', '337.439', '339.905', '323.123', '376.983', '0',
       '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '0.10',
       '0.11', '0.12', '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19',
       '0.20', '0.21', '0.22', '0.23', '0.24'],
      dtype='object')

In [26]:
# Check for missing columns in each PlasMAG data frame
for plasmag_data in plasmag_data_list:
    missing_columns = set(kp_data.columns) - set(plasmag_data.columns)
    if missing_columns:
        print(f"Missing columns in PlasMAG data: {missing_columns}")

# Check for missing columns in the Kp data frame
missing_columns = set(plasmag_data.columns) - set(kp_data.columns)
if missing_columns:
    print(f"Missing columns in Kp data: {missing_columns}")

# Add missing columns to the PlasMAG data frames
for plasmag_data in plasmag_data_list:
    missing_columns = set(kp_data.columns) - set(plasmag_data.columns)
    for column in missing_columns:
        plasmag_data[column] = None

# Add missing columns to the Kp data frame
missing_columns = set(plasmag_data.columns) - set(kp_data.columns)
for column in missing_columns:
    kp_data[column] = None

Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in PlasMAG data: {'Kp', 'Date'}
Missing columns in Kp data: {'318.35', '4.51236', '22.6075', '0.231726', '329.34', '0.7', '416.394', '0.5', '325.961', '370.449', '0.6', '0.4', '301.154', '360.138', '368.206', '370.588', '311.078', '0.1', '200.067', '-2.70274', '43.0445', '138.182', '277.524', '151.626', '-1.45806', '490.043', '383.563', '269.424', '0.3', '0.231726.1', '185.013', '0.818269', '420.516', '413.984', '2023-01-01 00:00:00', '385.454', '54.6121', 'month', '1.20587', '327.631', '0.231726.2', '0.231726.3', '477.479', '489.333', '538.487', '51.6962', '90.2098', 'year', 'day', '0', '557.46', '407.649', '0.2', '322.576', '322.421', '265.185', '0.231726.4'}


In [8]:
plasmag_data_list

[      2017-01-01 00:00:00  -4.17863  -4.51183  -3.52434    0  0.1  0.2  0.3  \
 0     2017-01-01 00:01:00  -6.06788 -0.379552 -3.497080  0.0  0.0  0.0  0.0   
 1     2017-01-01 00:02:00  -5.83430 -3.039940 -2.985460  0.0  0.0  0.0  0.0   
 2     2017-01-01 00:03:00  -5.87318 -3.352910 -2.774760  0.0  0.0  0.0  0.0   
 3     2017-01-01 00:04:00  -5.81431 -3.305190 -2.598700  0.0  0.0  0.0  0.0   
 4     2017-01-01 00:05:00  -6.23986 -3.240720 -1.010480  0.0  0.0  0.0  0.0   
 ...                   ...       ...       ...       ...  ...  ...  ...  ...   
 9995  2017-01-07 22:36:00   3.86854 -3.295860  0.622848  0.0  0.0  0.0  0.0   
 9996  2017-01-07 22:37:00   3.79149 -3.415180  0.454364  0.0  0.0  0.0  0.0   
 9997  2017-01-07 22:38:00   3.57554 -3.569090  0.444149  0.0  0.0  0.0  0.0   
 9998  2017-01-07 22:39:00   3.65066 -3.426430  0.700379  0.0  0.0  0.0  0.0   
 9999  2017-01-07 22:40:00   3.78359 -3.094890  1.036740  0.0  0.0  0.0  0.0   
 
       0.4  0.5  ...     0.21     0.22