In [None]:
import os
import sys

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from dask.distributed import Client
import dask.dataframe as dd

matplotlib.style.use("dark_background")

In [None]:
def plot_hist(df, feature_name, kind='hist', bins=100):
    """
    Plot the distribution of a single column and display the figure.

    df: DataFrame (or any mapping from column name to a plottable Series).
    feature_name: Name of the column to plot; also used in the figure title.
    kind: Plot kind forwarded to the column's ``plot`` method (histogram by default).
    bins: Number of histogram bins; only forwarded when ``kind`` is 'hist'.
    """
    plot_kwargs = {
        "kind": kind,
        "figsize": (15, 5),
        "title": f'Distribution of {feature_name}',
    }
    # `bins` is a histogram-only option, so it is forwarded only for histograms;
    # other plot kinds would otherwise receive an argument they do not understand.
    if kind == 'hist':
        plot_kwargs["bins"] = bins
    df[feature_name].plot(**plot_kwargs)
    plt.show()


def plot_ts(series, figsize=(20, 6), title=None, xlabel="", ylabel=""):
    """
    Draw a time series and display the resulting figure.

    series: Series object to be plotted; it should have date or time as index.
    figsize: Figure size forwarded to ``series.plot``.
    title: Optional title of the figure.
    xlabel: Label for the x axis.
    ylabel: Label for the y axis.
    """
    series.plot(title=title, figsize=figsize)
    # Label the x axis first and then the y axis of the figure just drawn.
    for set_axis_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_axis_label(text)
    plt.show()

# Goal

In this notebook, I am going to show **how Dask can be used to explore the train and test files related to various data segments.**

As per the documentation, each of these files contains ten minutes of logs from ten different sensors arrayed around a volcano.

There are 4431 data files under the train directory and 4520 files under the test directory. As we will observe shortly, each of these files consists of about 60K lines. On disk, the files under the train and test directories take up about 30 GB.

There are mostly two problems we encounter as a Data Scientist when dealing with such a large volume of data:

1. **Limited Memory:** For a normal laptop or desktop, memory (RAM) is often limited to 16 or 32 GB. So, it's kind of impossible to load all the data files together.
2. **Limited CPU:** Libraries like pandas or numpy can utilize only 1 CPU at any point of time. As a result, even though the laptop/desktop/VM has multiple cores, we can't use them.

**Dask** is a framework designed to overcome these limitations:

1. **Parallel Computing:** Dask enables parallel computation using multi-core CPUs.
2. **Out of Core Computing:** When the data is larger than the memory (RAM), Dask doesn't load all of it into memory at once. Instead, it streams the data from the disk as and when needed.

Dask can scale on thousand-machine clusters to handle hundreds of terabytes of data. At the same time, it works efficiently on a single machine as well, enabling analysis of moderately large datasets (100GB+) on relatively low power laptops.

In this notebook, we are going to focus on using Dask in a single machine.

#### **Note: Since the number of CPUs available in a Kaggle Kernel is just 4, this notebook takes a lot of time to execute. If you have a VM or a laptop with a higher number of CPUs, this notebook will take much less time and help you to fully understand the power of Dask.**

# Start a Dask Client

By initiating a Client here, we make sure that Dask can spread any computation across the cores available to this Kernel.

In [None]:
# Start a Dask client with 3 workers
client = Client(n_workers=3)
# Show the client summary as the cell output
client

This has created a **local** Dask cluster with 3 workers on the 4 cores available to this Kernel.

# Read train & submission data

In [None]:
# Location of the parent directory for data
DATA_DIR = "/kaggle/input/predict-volcanic-eruptions-ingv-oe"

# Read train and submission data
# (both CSV files are read eagerly with pandas into in-memory DataFrames)
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

### Check the number of segments present in the training and submission files

In [None]:
# Check the number of segments present in the training and submission files
# (.shape is (rows, columns) for each DataFrame)
train_df.shape, submission_df.shape

There are 4431 segments for training and 4520 segments for testing

# Explore train and test data files related to different segments

In [None]:
# Read every sensor column (sensor_1 ... sensor_10) as a 32-bit float
data_types = {f"sensor_{sensor_no}": np.float32 for sensor_no in range(1, 11)}

### Read data across different training segments

To read the CSV files we are going to use Dask. 

#### Dask doesn't read all the files into memory immediately. Instead, it creates a reference and reads the CSV files only when there is a need (for any computation)

In [None]:
%%time
# Build a Dask DataFrame over every training CSV file matched by the glob pattern.
dd_train_seg = dd.read_csv(
    urlpath=f"{DATA_DIR}/train/*.csv",
    blocksize=None,  # do not split files into blocks (presumably one partition per file -- confirm against the Dask docs)
    dtype=data_types,  # sensor columns are read as float32
    include_path_column="segment_id")  # adds a "segment_id" column holding each row's source file path

# Peek at the first 2 rows
dd_train_seg.head(2)

This process took just 4.24 seconds.

If we wanted to do the same with pandas, we would have to loop through all the files and read and load them one by one. That may take multiple minutes.

However, the column defined as `segment_id` holds the absolute path of individual CSV files. For our purpose, we just need the segment_id. Hence, we will retain just the segment_id in the next step.

In [None]:
# Just keep the segment ID: strip the directory prefix and the ".csv" extension.
# regex=False makes both replacements literal, so the "." in ".csv" is never
# treated as a regex wildcard (the default for `regex` differs across pandas versions).
# The prefix is built from DATA_DIR so it cannot drift from the path used to read the files.
dd_train_seg.segment_id = dd_train_seg.segment_id.str.replace(f"{DATA_DIR}/train/", "", regex=False)
dd_train_seg.segment_id = dd_train_seg.segment_id.str.replace(".csv", "", regex=False)
# Convert the segment ID column from String to int64
dd_train_seg.segment_id = dd_train_seg.segment_id.astype("int64")

dd_train_seg.head()

This Dask DataFrame is actually a concatenation of 4431 pandas DataFrames. Each pandas DataFrame represents the data for one of the segments and is identified by the value of `segment_id`.

#### Let's verify. The resulting number of segments present in `dd_train_seg` should match with the segments present in the `train.csv`

### Count the number of segments present in the Dask DataFrame.

In [None]:
%time
# How many segments are there in training data?
print("Number of segments in dd_train_seg: ", dd_train_seg.segment_id.nunique().compute())

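The timing reported for this cell is only meaningful when the whole cell is timed with the `%%time` cell magic. A bare `%time` line on its own times an empty statement, which is where a figure as small as 10 microseconds comes from; the computation itself takes far longer because every CSV file has to be read.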

This is the first time Dask actually reads all the train CSV files from the disk (using the 3 workers started above) and checks the number of unique segments across the files.

As we can see, the number of segments present in this Dask DataFrame (4431) matches the number of segments present in `train.csv`.

### Shape of the Dask DataFrame

#### How many columns are there in the Dask DF?

In [None]:
%%time
# Count the columns of the Dask DataFrame
len(dd_train_seg.columns)

#### How many rows are there in the Dask DF?

In [None]:
%%time
# Total number of rows across the whole Dask DataFrame
len(dd_train_seg)

**There are approximately 265 million rows. This is huge.**

Here, the compute() function has been invoked implicitly by Dask. As expected, Dask needs to read all the CSV files from the disk, calculate the number of rows in each file and then add those values to compute the final result. It took around 8 min 47 seconds. If we can increase the number of cores, this value will come down drastically.

### How big is the Dask DF if loaded in memory?

In [None]:
%%time
# Memory usage per column, computed eagerly
train_memory_usage = dd_train_seg.memory_usage(deep=True).compute()

# Sum all entries and divide by 10**9 to express the total in (decimal) gigabytes
print("Size (in GB) of the training segment DD (including index) : ", train_memory_usage.sum()/(10**9))

Size of the Dask DF is 12.76 GB. This is less than the size on the disk (15 GB) because while loading the CSV files the data types have been changed.

### Compute number of observations for every segment in the Dask DataFrame

In [None]:
%%time
# Number of rows for each segment_id, computed eagerly
_size = dd_train_seg.groupby("segment_id").size().compute()
# The distinct row counts show whether every segment has the same length
print(f"In training DD, Length of the individual DataFrames for each segment {_size.unique()}")

### Check if there are missing values for different sensors across different training segments

In [None]:
# Helper function to make necessary computation on pandas dataframe related to individual segments. 
def get_missing_sensors(df, sensor_names):
    """
    Summarise the missing sensor readings of one segment as a single-row DataFrame.

    df: pandas DataFrame holding the sensor columns and a `segment_id` column;
        the first unique `segment_id` value is used to label the result.
    sensor_names: A list consisting of column names related to sensors

    Returns a one-row DataFrame with, for every sensor, the fraction (0 to 1)
    of missing values stored as float16, plus a `segment_id` column.
    """
    # Label for the result row: the first segment id found in this frame
    first_segment_id = df.segment_id.unique()[0]

    # Fraction of NaN values per sensor, laid out as a single row
    missing_fraction = df[sensor_names].isna().mean().to_frame().T
    missing_fraction[sensor_names] = missing_fraction[sensor_names].astype(np.float16)

    missing_fraction["segment_id"] = first_segment_id
    return missing_fraction

In [None]:
%%time
# Get the column names related to sensors
sensor_names = [name for name in dd_train_seg.columns if "sensor" in name]
# Run get_missing_sensors on every partition and compute the combined result.
# NOTE(review): get_missing_sensors labels its output with the first segment_id it finds,
# so this assumes each partition holds exactly one segment -- confirm.
df_train_seg_missing = dd_train_seg.map_partitions(get_missing_sensors, sensor_names=sensor_names).compute()

df_train_seg_missing.head()

Columns named `sensor_x` hold the fraction (between 0 and 1) of missing values for that particular sensor in a segment. A value of `1` means that the sensor's data is entirely absent for the segment.

### Check if a sensor is completely missing for a particular segment?

In [None]:
# For each sensor, count the segments whose missing fraction equals 1
# (i.e. the sensor has no data at all in that segment)
df_missingness_across_sensors = df_train_seg_missing[sensor_names].eq(1).sum()

# Plot the missingness
df_missingness_across_sensors.plot.bar(figsize=(10, 6), title="Missing Sensors Across Training Segments")
plt.ylabel("Number of Segments with Missing Sensors")
plt.xlabel("Name of the Sensor")
plt.show()

### Get the groups of sensors which are missing across segments

In [None]:
def get_missing_groups(row):
    """
    Build a single label from the sensor names held in `row`.

    row: Object exposing `.values` with the sensor names (e.g. a pandas Index).

    Returns the names joined by "_", or None when there are no names.
    """
    sensor_labels = row.values.tolist()
    if not sensor_labels:
        return None
    return "_".join(sensor_labels)

# For every segment, collect the names of the sensors whose missing fraction equals 1
missing_sensor_groups = df_train_seg_missing[sensor_names].apply(lambda row: row[row == 1].index, axis=1)
# Turn each collection into a "_"-joined label, drop segments without any fully
# missing sensor (label is None) and count how often each label occurs
missing_sensor_groups_count = missing_sensor_groups.apply(get_missing_groups).dropna().value_counts()

# The title names sensor *groups* so this figure is distinguishable from the per-sensor chart
missing_sensor_groups_count.plot.bar(figsize=(10, 6), title="Missing Sensor Groups Across Training Segments")
plt.ylabel("Number of Segments with Missing Sensor Groups")
plt.xlabel("Name of the Sensor Groups")
plt.show()