In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data paths used below live under this mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Path to the training metadata Parquet file on the mounted Drive
file_path = "/content/drive/MyDrive/2nd sem/Big Data/data/train_meta.parquet"

# Open a handle on the Parquet file (the handle is not used again in this cell)
parquet_file = pq.ParquetFile(file_path)

# Load the whole metadata table into memory.
# NOTE(review): `filtered_meta` is reassigned from filtered_meta.csv in the next cell — confirm this load is still needed
filtered_meta = pd.read_parquet(file_path)


In [None]:
# Load the metadata table stored as filtered_meta.csv; this rebinds `filtered_meta`
filtered_meta = pd.read_csv('/content/drive/MyDrive/2nd sem/Big Data/data/filtered_meta.csv')

In [None]:
# Location of the first pulse batch on the mounted Drive
file_path = "/content/drive/MyDrive/2nd sem/Big Data/data/batch_1.parquet"

# Handle on the Parquet file (not used again in this cell)
parquet_file = pq.ParquetFile(file_path)

# Load the batch, copy the index values into an `event_id` column
# (presumably the index holds the event id — verify against the data source),
# then swap the index for a plain 0..n-1 range.
df = pd.read_parquet(file_path)
df = df.assign(event_id=df.index).reset_index(drop=True)

# Preview the first rows
print(df.head())

   sensor_id  time  charge  auxiliary  event_id
0       3918  5928   1.325       True        24
1       4157  6115   1.175       True        24
2       3520  6492   0.925       True        24
3       5041  6665   0.225       True        24
4       2948  8054   1.575       True        24


In [None]:
# Load the sensor geometry table; later cells read its sensor_id, x, y and z columns
file_path = "/content/drive/MyDrive/2nd sem/Big Data/data/sensor_geometry.csv"
sens_df = pd.read_csv(file_path)

In [None]:
# Attach the sensor coordinates to every pulse (pulses without a matching sensor are dropped)
merged_df = df.merge(sens_df, on='sensor_id', how='inner')

# Order pulses chronologically inside each event and renumber the rows
merged_df.sort_values(by=['event_id', 'time'], inplace=True)
merged_df.reset_index(drop=True, inplace=True)

# Per-event grouping, used to find each event's earliest pulse time
grouped_df = merged_df.groupby('event_id')
min_time_per_event = grouped_df['time'].transform('min')

# Express every pulse time relative to the first pulse of its event
merged_df['time_diff'] = merged_df['time'].sub(min_time_per_event)

merged_df.sort_values(by=['event_id', 'time_diff'], inplace=True)

# The absolute time is no longer needed once the relative time exists
merged_df.drop(columns=['time'], inplace=True)

In [None]:
# Work on a random subset of 3000 metadata rows. The fixed seed makes the
# subset — and therefore every metric computed downstream — reproducible
# after a kernel restart.
filtered_meta_mark = filtered_meta.sample(n=3000, random_state=42)


# Define a function to filter merged_df based on event_id from a smaller filtered_meta subset
def filter_merged_df_by_event_id(merged_df, filtered_meta_subset):
    """Return the rows of ``merged_df`` that belong to the given events.

    A row is kept when its ``event_id`` appears in
    ``filtered_meta_subset["event_id"]`` and its ``auxiliary`` value equals
    ``True``. The input frames are not modified.

    NOTE(review): this keeps the ``auxiliary == True`` rows and discards the
    rest — confirm that this is the intended subset.
    """
    wanted_ids = filtered_meta_subset["event_id"].tolist()
    in_subset = merged_df["event_id"].isin(wanted_ids)
    is_auxiliary = merged_df["auxiliary"].eq(True)
    return merged_df[in_subset & is_auxiliary]

# Keep only the pulses of the sampled events. The helper selects rows with a
# boolean mask, which already yields a new frame, so the full `merged_df` does
# not have to be deep-copied first.
merged_df_mark = (
    filter_merged_df_by_event_id(merged_df, filtered_meta_mark)
    .drop(columns='auxiliary')
    .reset_index(drop=True)
)



In [None]:
# Sanity check: (rows, columns) of the sampled metadata
filtered_meta_mark.shape

(3000, 6)

In [None]:
def create_3d_volumes(df, sens_df, voxel_size=40):
    """Rasterise every event's hits onto a coarse 3-D grid of ``time_diff`` values.

    The grid is derived from the sensor geometry. Along each axis a coordinate
    is truncated to an integer, shifted by the (truncated) smallest sensor
    coordinate so that it maps to 0, and floor-divided by ``voxel_size``.
    When several rows of one event fall into the same voxel, the row that
    comes last in ``df`` wins. Voxels without any hit stay at 0.

    Args:
        df: Hit table with columns ``event_id``, ``x``, ``y``, ``z`` and
            ``time_diff``. Coordinates are expected to lie inside the range
            spanned by ``sens_df``.
        sens_df: Sensor geometry table with columns ``x``, ``y`` and ``z``.
        voxel_size: Positive integer edge length of one voxel, in coordinate
            units. Defaults to 40.

    Returns:
        A tuple ``(volumes, unique_event_ids, max_x, max_y, max_z)``:
        ``volumes`` maps each event id to a float64 array of shape
        ``(max_x, max_y, max_z)``; ``unique_event_ids`` lists the event ids in
        order of first appearance in ``df`` (the same order as the dict keys);
        ``max_x``, ``max_y``, ``max_z`` are the grid dimensions.

    Raises:
        ValueError: If ``voxel_size`` is not positive or a hit coordinate is NaN.
    """
    if voxel_size <= 0:
        raise ValueError("voxel_size must be a positive integer")

    axes = ('x', 'y', 'z')
    if df[list(axes)].isna().to_numpy().any():
        raise ValueError("hit coordinates must not contain NaN")

    # Truncated minimum per axis; subtracting it moves the grid origin to 0.
    origin = {axis: int(sens_df[axis].min()) for axis in axes}
    max_x, max_y, max_z = (
        (int(sens_df[axis].max()) - origin[axis]) // voxel_size + 1
        for axis in axes
    )

    # One zero-filled volume per event, keyed in order of first appearance.
    unique_event_ids = df['event_id'].unique()
    volumes = {
        event_id: np.zeros((max_x, max_y, max_z), dtype=np.float64)
        for event_id in unique_event_ids
    }

    # Rows without an event id belong to no volume.
    has_event = df['event_id'].notna().to_numpy()
    event_keys = df['event_id'].to_numpy()[has_event]

    # Voxel indices for all rows at once; the integer cast truncates toward
    # zero, exactly like int() on a single value.
    ix, iy, iz = (
        (df[axis].to_numpy()[has_event].astype(np.int64) - origin[axis]) // voxel_size
        for axis in axes
    )
    values = df['time_diff'].to_numpy()[has_event]

    # A single ordered pass keeps the "last row wins" rule for colliding hits;
    # a fancy-indexed assignment would not guarantee that order.
    for event_id, i, j, k, value in zip(event_keys, ix, iy, iz, values):
        volumes[event_id][i, j, k] = value

    return volumes, unique_event_ids, max_x, max_y, max_z

In [None]:
# Build one 3-D volume per sampled event; `event_ids` gives the order of the volumes
volumes, event_ids, max_x, max_y, max_z = create_3d_volumes(merged_df_mark, sens_df)

In [None]:
# Put the sampled metadata in the same order as `event_ids`, so that row i of
# the metadata corresponds to the i-th volume.
filtered_meta_mark_sorted = (
    filtered_meta_mark
    .set_index('event_id')
    .reindex(event_ids)
    .reset_index()
)

# Regression targets, one value per event, in `event_ids` order
azimuth_values = filtered_meta_mark_sorted['azimuth'].values
zenith_values = filtered_meta_mark_sorted['zenith'].values

In [None]:
# Hold out 20% of the events for testing; the split is seeded for reproducibility.
# `volumes` and `azimuth_values` share the same event order.
X_train, X_test, y_train, y_test = train_test_split(list(volumes.values()), azimuth_values, test_size=0.2, random_state=42)

# Small 3-D CNN regressor: one conv + pooling stage, then a dense head with a
# single linear output.
model = models.Sequential([
    layers.Conv3D(16, kernel_size=(3, 3, 3), activation='relu', input_shape=(max_x, max_y, max_z, 1)),
    layers.MaxPooling3D(pool_size=(2, 2, 2)),
    layers.Flatten(),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1)
])

# NOTE(review): azimuth is an angle; plain squared/absolute error does not
# account for wrap-around — confirm this loss/metric choice is intended.
model.compile(optimizer='adam', loss='mean_squared_error')

# Stack the volumes into one array and add the trailing channel axis Conv3D expects
X_train = np.array(X_train).reshape(-1, max_x, max_y, max_z, 1)
X_test = np.array(X_test).reshape(-1, max_x, max_y, max_z, 1)

# Very small batches
batch_size = 2

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Number of epochs to wait before stopping
    min_delta=0.0001,    # Minimum change in the monitored quantity
    verbose=1            # Print messages
)

# 10% of the training data is held back as the validation set that drives early stopping
with tf.device('/gpu:0'):
    model.fit(
        X_train,
        y_train,
        epochs=40,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping_callback]  # Pass the EarlyStopping callback
    )

# The single-unit output layer yields predictions of shape (n_samples, 1),
# while y_test is 1-D. Flatten the predictions so the element-wise error below
# pairs sample i with prediction i; otherwise `y_test - y_pred` broadcasts to
# an (n, n) matrix and the "MAE" becomes an average over all pairs.
y_pred = model.predict(X_test).ravel()

# Root mean squared error and mean absolute error on the held-out events
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = np.mean(np.abs(y_test - y_pred))
print("AZIMUTH")
print("RMSE:", rmse)
print("MAE:", mae)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 17: early stopping
AZIMUTH
RMSE: 1.7673143745043134
MAE: 1.5268199025603153


In [None]:
# Hold out 20% of the events for testing; the split is seeded for reproducibility.
# `volumes` and `zenith_values` share the same event order.
X_train, X_test, y_train, y_test = train_test_split(list(volumes.values()), zenith_values, test_size=0.2, random_state=42)

# Small 3-D CNN regressor: one conv + pooling stage, then a dense head with a
# single linear output.
model = models.Sequential([
    layers.Conv3D(16, kernel_size=(3, 3, 3), activation='relu', input_shape=(max_x, max_y, max_z, 1)),
    layers.MaxPooling3D(pool_size=(2, 2, 2)),
    layers.Flatten(),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1)
])

# Plain mean squared error on the zenith target
model.compile(optimizer='adam', loss='mean_squared_error')

# Stack the volumes into one array and add the trailing channel axis Conv3D expects
X_train = np.array(X_train).reshape(-1, max_x, max_y, max_z, 1)
X_test = np.array(X_test).reshape(-1, max_x, max_y, max_z, 1)

# Very small batches
batch_size = 2

early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=3,          # Number of epochs to wait before stopping
    min_delta=0.0001,    # Minimum change in the monitored quantity
    verbose=1            # Print messages
)

# 10% of the training data is held back as the validation set that drives early stopping
with tf.device('/gpu:0'):
    model.fit(
        X_train,
        y_train,
        epochs=40,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[early_stopping_callback]  # Pass the EarlyStopping callback
    )

# The single-unit output layer yields predictions of shape (n_samples, 1),
# while y_test is 1-D. Flatten the predictions so the element-wise error below
# pairs sample i with prediction i; otherwise `y_test - y_pred` broadcasts to
# an (n, n) matrix and the "MAE" becomes an average over all pairs.
y_pred = model.predict(X_test).ravel()

# Root mean squared error and mean absolute error on the held-out events
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = np.mean(np.abs(y_test - y_pred))
print("ZENITH")
print("RMSE:", rmse)
print("MAE:", mae)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 16: early stopping
ZENITH
RMSE: 0.7083075421690939
MAE: 0.5843521935633311
