In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one path per line.
kaggle_input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#lets import libraries that we'll need 
import geopandas as gpd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Read the volcanic eruptions CSV into a DataFrame.
path = '../input/volcanic-eruptions/database.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows as the cell output.
df.head(n=5)
#describe the data
# df.describe

In [None]:
# Print pandas' summary of the DataFrame (DataFrame.info()) so the column
# dtypes and non-null counts can be inspected before any analysis.
df.info()
#we see that there are a few null entries.
# df.isnull().sum()

In [None]:
# Ten countries with the most rows in the dataset: count the rows per
# country, order the counts from largest to smallest and keep the first ten.
rows_per_country = df.groupby('Country')['Country'].count()
Top_countries = rows_per_country.sort_values(ascending=False).head(10)
Top_countries

In [None]:
# Ten regions with the most rows in the dataset: same count / sort / truncate
# recipe as for countries, printed as plain text.
rows_per_region = df.groupby(['Region'])['Region'].count()
top_ten_regions = rows_per_region.sort_values(ascending=False).head(10)
print(top_ten_regions)

In [None]:
# Bar chart of how many rows fall under each 'Activity Evidence' category.
# The column is selected through the explicit data=/x= keywords: passing the
# Series positionally is deprecated in seaborn 0.11 and, from 0.12 on, the
# only positional parameter of countplot is `data`.
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=df, x='Activity Evidence', ax=ax)
ax.set_title('Counts of Activity Evidence types')
fig.tight_layout()
plt.show()


In [None]:
# 'Last Known Eruption' is text, so the frame cannot be sorted chronologically
# on it. Derive a signed integer 'year' column instead:
#   > 0  era token starts with 'c' (CE)
#   < 0  era token starts with 'b' (BCE)
#   = 0  label could not be parsed (missing value, "Unknown", other era, ...)
# NOTE(review): assumes parsable labels look like "<digits> <era>", e.g.
# "1950 CE" / "8300 BCE" -- confirm against the raw column.
#
# The column is parsed once, vectorised, and written with a single column
# assignment. The previous chained form `df['year'][i] = ...` triggers
# SettingWithCopyWarning and does not modify `df` at all when pandas
# Copy-on-Write is enabled.
eruption_parts = df['Last Known Eruption'].str.extract(
    r'^\s*(?P<magnitude>\d+)\s+(?P<era>[BbCc])'
)

# +1 for CE, -1 for BCE, NaN where the label did not match the pattern.
era_sign = eruption_parts['era'].str.lower().map({'c': 1, 'b': -1})
year_magnitude = pd.to_numeric(eruption_parts['magnitude'])

# Unparsable rows produce NaN in the product and fall back to year 0.
df['year'] = (year_magnitude * era_sign).fillna(0).astype('int64')

# Index labels per category; the names are kept so that any code still
# referring to them keeps working. Rows that could not be parsed at all are
# reported in `Unknown`.
year_ce = era_sign[era_sign == 1].index.tolist()
year_bce = era_sign[era_sign == -1].index.tolist()
Unknown = era_sign[era_sign.isna()].index.tolist()

# Sort chronologically. Rebinding instead of inplace=True keeps this cell
# safe to re-run.
df = df.sort_values('year')


In [None]:
# Rows whose last eruption year is unknown have been assigned year = 0.
# df[df['year']==0]

In [None]:
# Turn the year-sorted frame into a GeoDataFrame with one point per row,
# built from the 'Longitude' (x) and 'Latitude' (y) columns.
# The coordinate reference system is passed to the constructor as the
# authority string 'EPSG:4326'; the old {'init': 'epsg:4326'} dict is the
# deprecated PROJ "init" syntax and emits warnings with current
# pyproj/geopandas.
gpd_df = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
    crs='EPSG:4326',
)

In [None]:
# Load the low-resolution Natural Earth dataset bundled with geopandas and
# draw it as a white basemap with black borders.
# NOTE(review): gpd.datasets appears to have been removed in geopandas 1.0 --
# confirm the installed version still ships it before upgrading.
world_dataset_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(world_dataset_path)
basemap_style = dict(color='white', edgecolor='k', linewidth=1, linestyle='-')
ax_world = world.plot(figsize=(16, 16), **basemap_style)


In [None]:
# Map of the volcanoes that have a dated last eruption, drawn on a fresh
# world basemap: positive years (CE) in red, negative years (BCE) in green.
ax_world = world.plot(figsize=(15, 12), color='white', edgecolor='k', linewidth=1, linestyle='-')

dated_layers = [
    # (row filter, marker colour, legend label)
    (gpd_df['year'] > 0, 'r', 'Year:After CE'),
    (gpd_df['year'] < 0, 'g', 'Year:Before CE'),
]
for era_rows, marker_colour, legend_label in dated_layers:
    gpd_df[era_rows].plot(color=marker_colour, marker='^', ax=ax_world, label=legend_label)

# Title, axis labels and legend.
ax_world.set_title('Distribution of Volcanoes')
ax_world.set_xlabel('Longitude')
ax_world.set_ylabel('Latitude')
ax_world.legend()

In [None]:
# Map of the volcanoes whose last eruption year is unknown (year == 0),
# drawn in orange on a fresh world basemap.
ax_world = world.plot(figsize=(16, 12), color='white', edgecolor='k', linewidth=1, linestyle='-')

undated_volcanoes = gpd_df[gpd_df['year'] == 0]
undated_volcanoes.plot(ax=ax_world, color='orange', marker='^', label='Year:Unknown')

ax_world.set_title('Distribution of Volcanoes')
ax_world.set_xlabel('Longitude')
ax_world.set_ylabel('Latitude')
ax_world.legend()

In [None]:
# Map of the full dataset on a fresh world basemap, one layer per category:
# CE (red), BCE (green) and unknown year (orange).
ax_world = world.plot(figsize=(15, 12), color='white', edgecolor='k', linewidth=1, linestyle='-')

year_column = gpd_df['year']
volcano_layers = [
    # (row filter, marker colour, legend label)
    (year_column > 0, 'r', "Year:AFTER CE"),
    (year_column < 0, 'g', 'Year:Before CE'),
    (year_column == 0, 'orange', 'Year:Unknown'),
]
for layer_rows, marker_colour, legend_label in volcano_layers:
    gpd_df[layer_rows].plot(ax=ax_world, color=marker_colour, marker='^', label=legend_label)

ax_world.set_title('Distribution of Volcanoes')
ax_world.set_xlabel('Longitude')
ax_world.set_ylabel('Latitude')
ax_world.legend()


In [None]:
# Kernel density estimate of volcano elevation, one curve per category
# (CE, BCE, unknown year), drawn on the same axes.
# sns.distplot is deprecated; with hist=False it only drew a KDE curve, so
# sns.kdeplot is the direct replacement.
elevation_column = 'Elevation (Meters)'
elevation_groups = [
    # (row filter, legend label, line colour)
    (df['year'] > 0, 'AD_volcanoes', 'g'),
    (df['year'] < 0, 'BC volcanoes', 'c'),
    (df['year'] == 0, 'Unknown', 'k'),
]
for group_rows, legend_label, line_colour in elevation_groups:
    sns.kdeplot(data=df[group_rows], x=elevation_column, label=legend_label, color=line_colour)

# The labels are only rendered once a legend is requested explicitly.
plt.legend()
plt.title('Distribution of hieght of volcanoes')


In [None]:
# Row(s) with the highest elevation in the dataset (all ties are kept).
highest_elevation = df['Elevation (Meters)'].max()
tallest = df[df['Elevation (Meters)'] == highest_elevation]
tallest


In [None]:
# Row(s) with the lowest elevation in the dataset (all ties are kept).
lowest_elevation = df['Elevation (Meters)'].min()
deepest = df[df['Elevation (Meters)'] == lowest_elevation]
deepest