In [1]:
import pandas as pd
import numpy as np
from datetime import date

In [3]:
# Data source (Kaggle):
# https://www.kaggle.com/martincontreras/volcanic-eruptions-dataset-all-to-2020
# header=1 takes the column labels from the second row of the sheet (0-based row 1).
df = pd.read_excel(io='INPUT/volcanic_dataset.xls', header=1)

# Cleaning the dataframe
In this exercise we will focus on eruptions that started in 2015 or later, since that is the period covered by the satellite imagery available in our API.

In [4]:
# Keep only the eruptions whose start year is 2015 or later. reset_index()
# renumbers the rows from 0 and stores the previous row labels in a new
# 'index' column.
started_2015_or_later = df['Start Year'] >= 2015
df = df.loc[started_2015_or_later].reset_index()
df

Unnamed: 0,index,Volcano Number,Volcano Name,Eruption Number,Eruption Category,Area of Activity,VEI,VEI Modifier,Start Year Modifier,Start Year,...,Evidence Method (dating),End Year Modifier,End Year,End Year Uncertainty,End Month,End Day Modifier,End Day,End Day Uncertainty,Latitude,Longitude
0,0,233020,"Fournaise, Piton de la",22343,Confirmed Eruption,,,,,2020.0,...,Historical Observations,,2020.0,,2.0,,16.0,,-21.244,55.708
1,1,345020,Rincon de la Vieja,22346,Confirmed Eruption,,,,,2020.0,...,Historical Observations,,2020.0,,1.0,,31.0,,10.830,-85.324
2,2,353010,Fernandina,22347,Confirmed Eruption,,,,,2020.0,...,Historical Observations,,2020.0,,1.0,,12.0,,-0.370,-91.550
3,3,273070,Taal,22344,Confirmed Eruption,,,,,2020.0,...,Historical Observations,,2020.0,,1.0,,22.0,,14.002,120.993
4,4,282050,Kuchinoerabujima,22345,Confirmed Eruption,,,,,2020.0,...,Historical Observations,>,2020.0,,2.0,,13.0,,30.443,130.217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,178,263340,Raung,21070,Confirmed Eruption,,3.0,,,2015.0,...,Historical Observations,,2015.0,,8.0,,22.0,,-8.119,114.056
179,179,343100,San Miguel,20994,Confirmed Eruption,,1.0,,,2015.0,...,Historical Observations,,2015.0,,4.0,,11.0,,13.434,-88.269
180,180,352090,Sangay,22236,Confirmed Eruption,Summit crater,2.0,,,2015.0,...,Historical Observations,,2015.0,,4.0,,7.0,,-2.005,-78.341
181,181,266030,Soputan,20988,Confirmed Eruption,,3.0,,,2015.0,...,Historical Observations,,2015.0,,3.0,,7.0,,1.112,124.737


The distributor of this dataset provides the following information about the dataset's columns.

### Eruption list contains:

    Volcano Number = id of the volcano
    Volcano Name
    Eruption Number = eruption id
    Eruption Category = Confirmed Eruption or Uncertain Eruption
    Area of Activity = where on the volcano the eruption occurs (crater, side walls, a certain area)
    VEI = Volcanic Explosivity Index, a logarithmic scale of the eruption's magnitude (from 0 to 8)
    VEI Modifier = I suppose it is a later modification to the VEI, but it is almost entirely NaN values
    Start Year Modifier = same as above
    Start Year = Beginning of the eruption
    Start Year Uncertainty = the uncertainty associated with the age dating
    Start Month = the month in which the eruption started
    Start Day Modifier = same as all modifiers
    Start Day = the day of the month on which the eruption started
    Start Day Uncertainty = confidence interval associated with the dating method
    Evidence Method (dating) = the method used to determine the date of the eruption
    End Year Modifier =
    End Year = when the eruption finished
    End Day Modifier =
    End Day = the day of the month on which the eruption ended
    End Day Uncertainty = related to the dating method
    Latitude = y-axis coordinate
    Longitude = x-axis coordinate


In [5]:
# Columns removed from the frame: the 'index' column created by reset_index()
# plus the modifier, uncertainty, area and evidence-method columns.
unused_columns = [
    'index',
    'VEI Modifier',
    'Start Year Modifier',
    'Start Day Modifier',
    'End Year Modifier',
    'End Day Modifier',
    'Start Year Uncertainty',
    'End Year Uncertainty',
    'End Day Uncertainty',
    'Area of Activity',
    'Start Day Uncertainty',
    'Evidence Method (dating)',
]
df = df.drop(columns=unused_columns)
df.columns

Index(['Volcano Number', 'Volcano Name', 'Eruption Number',
       'Eruption Category', 'VEI', 'Start Year', 'Start Month', 'Start Day',
       'End Year', 'End Month', 'End Day', 'Latitude', 'Longitude'],
      dtype='object')

In [6]:
# Rename by explicit old -> new mapping rather than by position, so a change
# in column order can never silently attach a short name to the wrong data.
short_names = {
    'Volcano Number': 'v_num',
    'Volcano Name': 'v_name',
    'Eruption Number': 'erup_num',
    'Eruption Category': 'erup_cat',
    'VEI': 'vei',
    'Start Year': 'start_y',
    'Start Month': 'start_m',
    'Start Day': 'start_d',
    'End Year': 'end_y',
    'End Month': 'end_m',
    'End Day': 'end_d',
    'Latitude': 'lat',
    'Longitude': 'long',
}
df = df.rename(columns=short_names)

# rename() skips labels it cannot find (which also makes re-running this cell
# harmless), so check explicitly that every expected short name is present.
missing = set(short_names.values()) - set(df.columns)
assert not missing, f"expected columns missing after rename: {sorted(missing)}"

# Unify date into one column

In [7]:
# The day / month / year columns are loaded as floats (shown as 2020.0, 2.0,
# 16.0 in the table above). Cast all six to integers in a single vectorised
# call so they render as '2020', '2', '16' when converted to strings.
# astype raises if any of these columns contains a missing value (NaN).
date_part_columns = ['start_d', 'start_m', 'start_y', 'end_d', 'end_m', 'end_y']
df = df.astype({column: 'int64' for column in date_part_columns})

In [11]:
# Assemble each date directly from its numeric year / month / day columns.
# pd.to_datetime accepts a mapping with 'year', 'month' and 'day' keys, which
# avoids building and re-parsing a 'd m Y' string for every row.
# An impossible date (e.g. day 31 in a 30-day month) raises ValueError.
df['start'] = pd.to_datetime(
    {'year': df['start_y'], 'month': df['start_m'], 'day': df['start_d']}
)
df['end'] = pd.to_datetime(
    {'year': df['end_y'], 'month': df['end_m'], 'day': df['end_d']}
)

# Calculate duration of eruption
df['delta'] = df['end'] - df['start']

# Longest eruptions first.
df[['v_name', 'start', 'end', 'delta']].sort_values('delta', ascending=False)

Unnamed: 0,v_name,start,end,delta
166,Pacaya,2015-06-07,2020-02-19,1718 days
174,Turrialba,2015-03-08,2019-10-28,1695 days
149,Masaya,2015-10-03,2020-02-13,1594 days
141,"Chillan, Nevados de",2016-01-08,2020-02-19,1503 days
114,Ebeko,2016-10-20,2020-02-19,1217 days
...,...,...,...,...
93,Karangetang,2017-05-10,2017-05-10,0 days
43,Ketoi,2018-09-21,2018-09-21,0 days
156,San Miguel,2015-08-13,2015-08-13,0 days
131,San Cristobal,2016-04-22,2016-04-22,0 days


In [None]:
# A leftover `0/0` used to live in this cell. It raised ZeroDivisionError on
# every run, so "Restart & Run All" stopped here and never reached the cells
# that follow. It has been removed so the notebook runs top to bottom.

In [None]:
# Write the cleaned frame to disk. index=True (the default) also writes the
# row index as the first, unnamed column of the CSV.
df.to_csv('OUTPUT/volcanic-explosions.csv', index=True)

In [12]:
# Count how many eruptions share each start date (most frequent dates first).
df['start'].value_counts()

2017-06-04    3
2015-06-16    2
2018-09-08    2
2019-07-23    2
2018-09-21    2
             ..
2016-04-22    1
2019-04-16    1
2015-11-12    1
2015-05-01    1
2018-05-11    1
Name: start, Length: 174, dtype: int64