In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DISPLAYING THE DATASET USING PANDAS

In [None]:
# Load the volcanic eruptions CSV into a DataFrame and print it
DATASET_PATH = "/kaggle/input/volcanic-eruptions/database.csv"
data = pd.read_csv(DATASET_PATH)
print(data)

# DROPPING THE NUMBER COLUMN

In [None]:
# Remove the "Number" column; every other column is kept
data = data.drop(["Number"], axis=1)
print(data)

# CHECKING THE PRESENCE OF NULL VALUES

In [None]:
# Per-column count of missing values (shown as the cell's output)
data.isna().sum()

# DROPPING THE ROWS WHICH CONTAIN NULL OR PLACEHOLDER VALUES

In [None]:
# Drop rows with any missing value, then drop rows whose Type, Last Known
# Eruption or Dominant Rock Type holds a placeholder string instead of data.
data = data.dropna()
has_real_values = (
    (data["Type"] != 'Unknown')
    & (data["Last Known Eruption"] != 'Unknown')
    & (data["Dominant Rock Type"] != 'No Data')
)
data = data[has_real_values]
print(data)

# DATA PREPARATION - "TYPE" COLUMN

In [None]:
# Data Preparation - "Type" Column
# Step 1: strip parentheses, commas and question marks from the type names.
# regex=True is passed explicitly: the pattern is a character class, and newer
# pandas versions treat the pattern as a literal string unless told otherwise,
# which would leave the column untouched.
type_cleaned = data["Type"].str.replace(r"[(),?]", "", regex=True)

# Step 2: map the plural spellings left over after step 1 to their singular form.
plural_to_singular = {
    "Calderas": "Caldera",
    "Complexes": "Complex",
    "Fissure vents": "Fissure vent",
    "Lava cones": "Lava cone",
    "Maars": "Maar",
    "Lava domes": "Lava dome",
    "Pyroclastic cones": "Pyroclastic cone",
    "Shields": "Shield",
    "Stratovolcanoes": "Stratovolcano",
    "Submarinees": "Submarine",
    "Tuff cones": "Tuff cone",
    "Volcanic fields": "Volcanic field",
}
type_cleaned = type_cleaned.replace(plural_to_singular)

# assign() builds a new DataFrame rather than writing into `data`, which was
# produced by row filtering; this avoids pandas' chained-assignment warning.
data = data.assign(Type=type_cleaned)
print(data["Type"])

# COUNTING THE DUPLICATES IN COUNTRY COLUMN

In [None]:
# Count how many rows each Country value has (aggfunc='size' counts rows per
# group), then turn the result into a two-column table.
dup = data.pivot_table(index=['Country'], aggfunc='size').reset_index()
dup.columns = ["Country", "Counts"]
print(dup)

# DISPLAYING COUNTRIES BY NUMBER OF VOLCANOES, FROM HIGHEST TO LOWEST

In [None]:
# Rank countries by volcano count, largest first; at most 80 rows are kept
lar = dup.nlargest(n=80, columns=["Counts"])
print(lar)

# DISPLAYING TOP 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES

In [None]:
# First 20 rows of the ranking (positions 0-19)
a = lar.iloc[:20]
print(a)

# VISUALIZATION OF TOP 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES USING PIE CHART

In [None]:
# Matplotlib's pyplot interface (the later plotting cells also use `plt`)
import matplotlib.pyplot as plt

# Pie chart: one wedge per country, labelled with the country name;
# the legend entries show the corresponding counts.
fig, ax = plt.subplots(figsize=(15, 70))
ax.pie(a["Counts"], labels=a["Country"])
ax.legend(a["Counts"], fontsize=10)
ax.set_title("TOP 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES", fontsize=25)

# Render the figure
plt.show()

# DISPLAYING SECOND SET OF 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES

In [None]:
# Rows at positions 20-39 of the ranking, directly after the first set
b = lar.iloc[20:40]
print(b)

# VISUALIZATION OF SECOND SET OF 20 COUNTRIES USING STACK PLOT

In [None]:
# Creating Stack Plot
# stackplot(x, *ys) stacks numeric series over x, so the volcano counts are the
# stacked series and each country gets an integer x position that is then
# labelled with its name. The fill colour is passed via `colors`, which is the
# stackplot argument for the per-series palette.
fig = plt.figure(figsize=(20, 10))
positions = list(range(len(b)))
plt.stackplot(positions, b["Counts"], colors=["red"])
plt.xticks(positions, b["Country"].tolist(), rotation=90)
plt.xlabel("COUNTRY", fontsize=20)
plt.ylabel("COUNTS", fontsize=20)
plt.title("SECOND SET OF 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES", fontsize=25)

# Displaying Stack Plot
plt.show()

# DISPLAYING THIRD SET OF 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES

In [None]:
# Rows at positions 40-59 of the ranking, directly after the second set
c = lar.iloc[40:60]
print(c)

# VISUALIZATION OF THIRD SET OF 20 COUNTRIES USING LINEAR PLOT WITH CUSTOMIZATIONS

In [None]:
# Line plot of counts (x) against country names (y): dashed orange line with
# large blue-filled circular markers.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(
    c["Counts"],
    c["Country"],
    color='orange',
    linestyle='dashed',
    linewidth=3,
    marker='o',
    markerfacecolor='blue',
    markersize=12,
)
ax.set_xlabel("COUNTS", fontsize=20)
plt.xticks(fontsize=15)
ax.set_ylabel("COUNTRY", fontsize=20)
plt.yticks(fontsize=15)
ax.set_title("THIRD SET OF 20 COUNTRIES HAVING HIGHER NUMBER OF VOLCANOES", fontsize=25)

# Render the figure
plt.show()

# DISPLAYING SET OF 20 COUNTRIES HAVING LEAST NUMBER OF VOLCANOES

In [None]:
# Everything from position 60 to the end of the ranking, i.e. the rows with
# the smallest counts among those kept in `lar`
d = lar.iloc[60:]
print(d)

# VISUALIZING A SET OF 20 COUNTRIES HAVING LEAST NUMBER OF VOLCANOES USING DONUT PLOT 

In [None]:
# Donut plot: a pie chart with a white disc drawn over its centre.
# Wedges are labelled with country names; the legend lists the counts.
fig, ax = plt.subplots(figsize=(15, 15))
circle = plt.Circle((0, 0), 0.5, color='white')
ax.pie(d["Counts"], labels=d["Country"])
ax.add_artist(circle)
ax.legend(d["Counts"])
ax.set_title("COUNTRIES HAVING LEAST NUMBER OF VOLCANOES", fontsize=25)

# Render the figure
plt.show()

# COUNTING THE DUPLICATES IN REGION COLUMN

In [None]:
# Count how many rows each Region value has, as a two-column table
reg = data.pivot_table(index=['Region'], aggfunc='size').reset_index()
reg.columns = ["Region", "Counts"]
print(reg)

# DISPLAYING REGIONS BY NUMBER OF VOLCANOES, FROM HIGHEST TO LOWEST

In [None]:
# Rank regions by volcano count, largest first; at most 19 rows are kept
lac = reg.nlargest(n=19, columns=["Counts"])
print(lac)

# VISUALIZATION OF REGIONS BY NUMBER OF VOLCANOES, FROM HIGHEST TO LOWEST, USING DONUT PLOT

In [None]:
# Donut plot: a pie chart with a white disc drawn over its centre.
# Wedges are labelled with region names; the legend lists the counts.
fig, ax = plt.subplots(figsize=(15, 15))
circle = plt.Circle((0, 0), 0.5, color='white')
ax.pie(lac["Counts"], labels=lac["Region"])
ax.add_artist(circle)
ax.legend(lac["Counts"], fontsize=8)
ax.set_title("REGIONS WITH VOLCANOES RANGING FROM HIGHEST TO LOWEST", fontsize=25)

# Render the figure
plt.show()

# COUNTING THE DUPLICATES IN TYPE COLUMN

In [None]:
# Count how many rows each Type value has, as a two-column table
tpe = data.pivot_table(index=["Type"], aggfunc='size').reset_index()
tpe.columns = ["Type", "Counts"]
print(tpe)

# DISPLAYING DIFFERENT TYPES OF VOLCANOES AND THEIR COUNTS

In [None]:
# Rank volcano types by count, largest first; at most 18 rows are kept
tpes = tpe.nlargest(n=18, columns=["Counts"])
print(tpes)

# VISUALIZING DIFFERENT TYPES OF VOLCANOES AND THEIR COUNTS USING BAR CHART

In [None]:
# Vertical bar chart: one bar per volcano type, bar height = count
fig, ax = plt.subplots(figsize=(40, 15))
ax.bar(tpes["Type"], tpes["Counts"])
ax.set_xlabel("VOLCANO TYPE", fontsize=25)
plt.xticks(fontsize=15)
ax.set_ylabel("COUNTS", fontsize=25)
plt.yticks(fontsize=15)
ax.set_title("DIFFERENT TYPES OF VOLCANOES AND THEIR COUNTS", fontsize=30)

# Render the figure
plt.show()

# VISUALIZING DIFFERENT TYPES OF VOLCANOES IN DIFFERENT COUNTRIES

In [None]:
# Scatter plot with volcano type on x and country on y: a red circle marks
# each (type, country) pair that occurs in the data.
fig, ax = plt.subplots(figsize=(45, 45))
ax.scatter(data["Type"], data["Country"], marker="o", s=100, color="red")
ax.set_xlabel("VOLCANO TYPE", fontsize=50)
plt.xticks(fontsize=17)
ax.set_ylabel("COUNTRY", fontsize=75)
plt.yticks(fontsize=20)
ax.set_title("DIFFERENT TYPES OF VOLCANOES IN DIFFERENT COUNTRIES", fontsize=55)

# Render the figure
plt.show()

# VISUALIZING DIFFERENT TYPES OF VOLCANOES IN DIFFERENT REGIONS

In [None]:
# Scatter plot with volcano type on x and region on y: a triangle marks each
# (type, region) pair that occurs in the data.
fig, ax = plt.subplots(figsize=(40, 20))
ax.scatter(data["Type"], data["Region"], marker="^", s=100)
ax.set_xlabel("VOLCANO TYPE", fontsize=35)
plt.xticks(fontsize=15)
ax.set_ylabel("REGIONS", fontsize=35)
plt.yticks(fontsize=20)
ax.set_title("DIFFERENT TYPES OF VOLCANOES IN DIFFERENT REGIONS", fontsize=45)

# Render the figure
plt.show()

# VISUALIZING DIFFERENT TYPES OF VOLCANOES IN DIFFERENT REGIONS AND COUNTRIES

In [None]:
# Interactive Plotly scatter: region on x, country on y, points coloured by
# volcano type.
import plotly.express as px

fig = px.scatter(
    data,
    x='Region',
    y='Country',
    color='Type',
    title="DIFFERENT TYPES OF VOLCANOES IN DIFFERENT COUNTRIES AND REGIONS",
)

# Render the figure
fig.show()

# COUNTING THE DUPLICATES IN ACTIVITY EVIDENCE COLUMN

In [None]:
# Count how many rows each Activity Evidence value has, as a two-column table
ae = data.pivot_table(index=["Activity Evidence"], aggfunc='size').reset_index()
ae.columns = ["Activity Evidence", "Counts"]
print(ae)

In [None]:
# Donut plot: a pie chart with a white disc drawn over its centre.
# Here the wedges are labelled with the counts and the legend lists the
# activity-evidence categories.
fig, ax = plt.subplots(figsize=(15, 15))
circle = plt.Circle((0, 0), 0.5, color='white')
ax.pie(ae["Counts"], labels=ae["Counts"])
ax.add_artist(circle)
ax.legend(ae["Activity Evidence"])
ax.set_title("ACTIVITY EVIDENCE OF VOLCANOES", fontsize=25)

# Render the figure
plt.show()

# VISUALIZATION OF VOLCANOES WITH RESPECT TO COUNTRIES, REGIONS, LATITUDE AND LONGITUDE

In [None]:
# World map: one point per row at its latitude/longitude, coloured by region,
# with the volcano name shown on hover.
import geopandas as gpd  # NOTE(review): `gpd` is not referenced in this cell
import plotly.express as px

fig = px.scatter_geo(
    data,
    lat=data["Latitude"],
    lon=data["Longitude"],
    hover_name="Name",
    color="Region",
)
fig.show()

# COUNTING THE DUPLICATES IN DOMINANT ROCK TYPE COLUMN

In [None]:
# Count how many rows each Dominant Rock Type value has, as a two-column table
drt = data.pivot_table(index=["Dominant Rock Type"], aggfunc='size').reset_index()
drt.columns = ["Dominant Rock Type", "Counts"]
print(drt)

# VISUALIZING DIFFERENT DOMINANT ROCK TYPES AND THEIR COUNTS USING HORIZONTAL BAR CHART

In [None]:
# Horizontal bar chart: one bar per dominant rock type, bar length = count
fig, ax = plt.subplots(figsize=(50, 15))
ax.barh(drt["Dominant Rock Type"], drt["Counts"])
ax.set_ylabel("DOMINANT ROCK TYPE", fontsize=25)
plt.xticks(fontsize=15)
ax.set_xlabel("COUNTS", fontsize=25)
plt.yticks(fontsize=15)
ax.set_title("DOMINANT ROCK TYPES AND THEIR COUNTS", fontsize=30)

# Render the figure
plt.show()

# COUNTING THE DUPLICATES IN TECTONIC SETTING COLUMN

In [None]:
# Count how many rows each Tectonic Setting value has, as a two-column table
ts = data.pivot_table(index=["Tectonic Setting"], aggfunc='size').reset_index()
ts.columns = ["Tectonic Setting", "Counts"]
print(ts)

# VISUALIZATION OF TECTONIC SETTING BY USING DONUT PLOT

In [None]:
# Donut plot: a pie chart with a white disc drawn over its centre.
# Wedges are labelled with the tectonic setting; the legend lists the counts.
fig, ax = plt.subplots(figsize=(15, 15))
circle = plt.Circle((0, 0), 0.5, color='white')
ax.pie(ts["Counts"], labels=ts["Tectonic Setting"])
ax.add_artist(circle)
ax.legend(ts["Counts"], fontsize=20)
ax.set_title("TECTONIC SETTINGS AND THEIR COUNTS", fontsize=25)

# Render the figure
plt.show()

# SPLITTING INPUT AND TARGET FEATURES

In [None]:
# Input features: every column except the volcano name, the target ("Type")
# and "Last Known Eruption"
x = data.drop(["Name", "Type", "Last Known Eruption"], axis=1)
print(x)

In [None]:
# Target: the "Type" column, kept as a one-column DataFrame
y = data.loc[:, ["Type"]]
print(y)

# LABEL ENCODING

In [None]:
# Input Features - Label Encoding
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted on each column in turn, so every
# column of x is replaced by integer codes; after this cell the encoder only
# holds the classes of the last column it saw.
le = LabelEncoder()
x = x.apply(lambda column: le.fit_transform(column))
print(x)

In [None]:
# Output Features - Label Encoding
# LabelEncoder works on a one-dimensional sequence of labels. np.ravel
# flattens y first, so this works whether y is a one-column table or already
# one-dimensional, and scikit-learn does not have to reshape a column vector
# (which it reports with a DataConversionWarning).
y = le.fit_transform(np.ravel(y))
print(y)

# TRAIN-TEST SPLIT

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, random_state=42, test_size=0.1)
x_train, x_test, y_train, y_test = split_parts

# TRAINING ML MODEL AND FINDING ITS ACCURACY

In [None]:
# Fit a 100-tree random forest (Gini criterion, fixed seed) on the training
# split, predict the held-out split and print the accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rfc = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0)
y_rfc = rfc.fit(x_train, y_train).predict(x_test)
print(accuracy_score(y_test, y_rfc))