In [None]:
import geodatasets
import geopandas as gpd
import folium
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns


from datetime import datetime
from pathlib import Path

# 1. Exploring Metadata

Loading the train and test metadata as DataFrames

In [None]:
# The data folder is addressed relative to the parent of the working directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Locations of the metadata CSV files, relative to that parent directory.
relative_path_train = "data/train_data/metadata.csv"
relative_path_test = "data/test_data/metadata.csv"

# Full paths of the train and test metadata files.
csv_path_train = os.path.join(parent_dir, relative_path_train)
csv_path_test = os.path.join(parent_dir, relative_path_test)

# Read both metadata tables.
meta = pd.read_csv(csv_path_train)
meta_test = pd.read_csv(csv_path_test)

In [None]:
# First rows of the train metadata, followed by the dtype of every column.
print(meta.head(), meta.dtypes, sep="\n")

In [None]:
# Parse the YYYYMMDD 'date' column of both tables into datetimes;
# values that do not match the format become NaT (errors='coerce').
for frame in (meta, meta_test):
    frame['date'] = pd.to_datetime(frame['date'], format="%Y%m%d", errors='coerce')

In [None]:
# Descriptive statistics of the train and test metadata.
summary_stats, summary_stats_test = meta.describe(), meta_test.describe()

print(summary_stats, summary_stats_test)

In [None]:
# Show the test metadata frame as this cell's output.
meta_test

## Geographic Location of our Images

In [None]:
# Rows whose 'plume' column equals "yes" form the plume subset;
# every other row goes to the no-plume subset.
is_plume = meta['plume'] == "yes"
plume_data = meta[is_plume]
no_plume_data = meta[~is_plume]

## Static Image of Map

In [None]:
# Metadata columns carried into the GeoDataFrames (the test set has no
# 'plume' / 'set' columns).
train_cols = ['id_coord', 'lon', 'plume', 'lat', 'set', 'coord_x', 'coord_y']
test_cols = ['id_coord', 'lon', 'lat', 'coord_x', 'coord_y']

# The points are drawn directly on top of the Natural Earth basemap, which is
# in EPSG:4326, so the lon/lat columns are tagged with that CRS explicitly
# instead of leaving the frames without one.
# NOTE(review): assumes lon/lat are WGS84 degrees - confirm against the data source.
plume = gpd.GeoDataFrame(plume_data[train_cols],
                         geometry=gpd.points_from_xy(plume_data.lon, plume_data.lat),
                         crs="EPSG:4326")
no_plume = gpd.GeoDataFrame(no_plume_data[train_cols],
                            geometry=gpd.points_from_xy(no_plume_data.lon, no_plume_data.lat),
                            crs="EPSG:4326")

test = gpd.GeoDataFrame(meta_test[test_cols],
                        geometry=gpd.points_from_xy(meta_test.lon, meta_test.lat),
                        crs="EPSG:4326")

# `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in 1.0; the
# basemap now comes from the `geodatasets` package that is already imported.
# 'naturalearth.land' contains land polygons (no country borders) and is
# downloaded and cached on first use.
world = gpd.read_file(geodatasets.get_path('naturalearth.land'))

# Create the axes explicitly rather than plotting the world twice.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_facecolor('lightblue')
world.plot(ax=ax, color='grey', edgecolor='white')
plume.plot(ax=ax, color='red', markersize=8, label='plume')
no_plume.plot(ax=ax, color='green', markersize=8, label='no plume')
test.plot(ax=ax, color='yellow', markersize=8, label='test data')

# Hide all ticks and tick labels: the map needs no coordinate axes.
ax.tick_params(left=False,
               right=False,
               top=False,
               bottom=False,
               labelleft=False,
               labeltop=False,
               labelright=False,
               labelbottom=False)

ax.legend()

plt.show()

## Interactive Map

In [None]:
# Base layer: the world polygons.
m = world.explore(legend=True, name="world")

# Add each point set to the same map as its own named, coloured layer.
for layer, colour, layer_name in (
    (plume, "red", "plume"),
    (no_plume, "green", "no_plume"),
    (test, "yellow", "test"),
):
    layer.explore(m=m, color=colour, name=layer_name)

# use folium to add an alternative map (tile layer, hidden by default)
folium.TileLayer("CartoDB positron", show=False).add_to(m)
# use folium to add the layer control
folium.LayerControl().add_to(m)

m

# 2. Exploring Image Data

In [None]:
def read_tiff(file_path):
    """Read a TIFF file from disk and return its pixel data.

    Args:
        file_path (str): path of the image file to read

    Returns:
        The array produced by ``imageio.imread`` for that file
        (the images in this project are presumably 2D - TODO confirm).
    """
    return imageio.imread(file_path)

In [None]:
class CustomToTensor:
    """Callable that rescales an image array by 1/65535 as float32."""

    def __call__(self, img):
        """Cast ``img`` to float32 and divide every value by 65535.0.

        Args:
            img: array-like object exposing ``astype`` (e.g. a NumPy array).

        Returns:
            The float32 copy of ``img`` divided by 65535.0.
        """
        as_float = img.astype(np.float32)
        return as_float / 65535.0

In [None]:
# transform = transforms.Compose([
#     transforms.ToPILImage(),
#     transforms.ToTensor()
# ])

In [None]:
def create_dataframe(directory, label):
    """Load every ``.tif`` image in a directory into a labelled DataFrame.

    Args:
        directory (str): folder that is scanned (non-recursively) for files
            whose name ends with ".tif".
        label: value stored in the 'label' column for every image found.

    Returns:
        pandas.DataFrame: one row per image with columns 'image' (the pixel
        array cast to float32 and divided by 65535.0) and 'label'. Rows are
        ordered by file name.
    """
    data = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".tif"):
            continue
        file_path = os.path.join(directory, filename)
        # Read the tif and rescale it.
        # NOTE(review): 65535 presumably is the uint16 maximum - confirm the images are 16-bit.
        image_data = read_tiff(file_path).astype(np.float32) / 65535.0
        data.append(image_data)
        labels.append(label)

    return pd.DataFrame({'image': data, 'label': labels})

In [None]:
# relative path to train plume images
relative_path_train_p = "data/train_data/images/plume"
# relative path to train no plume images
relative_path_train_np = "data/train_data/images/no_plume"

# full path to train plume images
train_image_p = os.path.join(parent_dir, relative_path_train_p)
# full path to train no-plume images
train_image_np = os.path.join(parent_dir, relative_path_train_np)

In [None]:
# Load both image folders; plume images get label 1, no-plume images label 0.
df_p = create_dataframe(directory=train_image_p, label=1)
df_np = create_dataframe(directory=train_image_np, label=0)

In [None]:
# Calculate summary statistics for the 'plume' class
plume_means = [np.mean(img) for img in df_p['image']]
plume_stds = [np.std(img) for img in df_p['image']]
plume_min = [np.min(img) for img in df_p['image']]
plume_max = [np.max(img) for img in df_p['image']]

# Calculate summary statistics for the 'plume' class
no_plume_means = [np.mean(img) for img in df_np['image']]
no_plume_stds = [np.std(img) for img in df_np['image']]
no_plume_min = [np.min(img) for img in df_np['image']]
no_plume_max = [np.max(img) for img in df_np['image']]

# Create plume summary dataframe
plume_summary = pd.DataFrame({'Mean': plume_means,
                              'Std': plume_stds,
                              'Min': plume_min,
                              'Max': plume_max})

# Create no plume summary dataframe
no_plume_summary = pd.DataFrame({'Mean': no_plume_means,
                              'Std': no_plume_stds,
                              'Min': no_plume_min,
                              'Max': no_plume_max})

print("Summary Statistics for 'Plume' Class:")
print(plume_summary.describe())

print("\nSummary Statistics for 'No Plume' Class:")
print(no_plume_summary.describe())

In [None]:
def _plot_pixel_histogram(ax, pixels, title, color, label):
    """Draw a normalised histogram of all pixel values on the given axes.

    Args:
        ax: matplotlib axes to draw on.
        pixels: array of pixel values; it is flattened before plotting.
        title (str): axes title.
        color (str): bar colour.
        label (str): legend label.
    """
    ax.hist(pixels.ravel(),
            bins=256,
            range=(0.0, 1.0),
            density=True,
            color=color,
            alpha=0.7,
            label=label)
    ax.set_title(title)
    ax.set_xlabel('Pixel Value')
    # density=True normalises the histogram to integrate to 1, so the y-axis
    # is a probability density rather than a count/frequency.
    ax.set_ylabel('Density')
    ax.legend()


# Stack the pixel data of every image of a class into one array.
plume_images = np.concatenate(df_p['image'].values)
no_plume_images = np.concatenate(df_np['image'].values)

fig, (ax_plume, ax_no_plume) = plt.subplots(1, 2, figsize=(12, 6))
_plot_pixel_histogram(ax_plume, plume_images,
                      'Plume Image Histogram', 'blue', 'plume')
_plot_pixel_histogram(ax_no_plume, no_plume_images,
                      'No Plume Image Histogram', 'red', 'no plume')

plt.show()


In [None]:
# Flatten every image and gather the pixels of each class in long form:
# one row per pixel, with the class stored as a string label.
plume_1d = [img.ravel() for img in df_p['image'].values]
plume_1d_data = pd.DataFrame({'label': '1',
                              'pixel_value': np.concatenate(plume_1d)})
no_plume_1d = [img.ravel() for img in df_np['image'].values]
no_plume_1d_data = pd.DataFrame({'label': '0',
                                 'pixel_value': np.concatenate(no_plume_1d)})

# Both frames carry their own 0..n-1 index; ignore_index=True gives the
# combined frame unique row labels instead of duplicated ones.
combined_data = pd.concat([plume_1d_data, no_plume_1d_data], ignore_index=True)

# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` - revisit once the seaborn version of the environment is pinned.
plt.figure(figsize=(10, 6))
sns.violinplot(data=combined_data, x='label', y='pixel_value', palette=["blue", "red"])
plt.title('Violin Plot of Pixel Values for Plume and No Plume Classes')
plt.xlabel('Class')
plt.ylabel('Pixel Value')
plt.show()