In [None]:
import pandas as pd
import streamlit as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff

import streamlit as st
## user_agent
import user_agents
# conda install -c conda-forge user-agents
from geopy.distance import great_circle

from ip2geotools.databases.noncommercial import DbIpCity as ip2geo

from geopy.geocoders import Nominatim


# Load the raw earthquake records; the path is relative to the notebook's working directory.
df = pd.read_csv("earthquake_1995-2023.csv")
# Bare last expression: Jupyter renders the frame as the cell output.
df

## 1) Understand Data

* a) Understand Columns

* b) Datatype check and Convert

* c) .describe() -> to see all values

* d) check unique values of categorical and their count

In [None]:
# Lower-case every column label; the cells below refer to columns by lower-case name.
df.columns = df.columns.str.lower()
df.columns

In [None]:
# Print each column's dtype and non-null count.
df.info()


In [None]:
# Parse the timestamp strings so the .dt accessor can be used on this column.
# NOTE(review): no explicit `format=` / `dayfirst=` is passed, so pandas infers the layout;
# confirm the inferred format matches the CSV (day-first dates could be misread).
df['date_time'] = pd.to_datetime(df['date_time'])

In [None]:
# Summary statistics using describe()'s default column selection.
df.describe()

In [None]:
# Summary restricted to object-dtype columns.
df.describe(include='O')

In [None]:
# Profile every object-dtype column: cardinality, distinct values and their frequencies.
categorical = df.select_dtypes(include='O').columns
divider = f'\n{"-"*50}\n'

for name in categorical:
    values = df[name]
    print(f"{name} column has number of uniques : {values.nunique()}")
    print(f"{name} column has uniques : {values.unique()}")
    print(divider)
    print(f"{name} column each unique repeated by:")
    print(values.value_counts())
    print(divider)

In [None]:
# Same profile for every column whose dtype is not object.
numerical = df.select_dtypes(exclude='object').columns
rule = f'\n{"-"*50}\n'

for label in numerical:
    series = df[label]
    print(f"{label} column has number of uniques : {series.nunique()}")
    print(f"{label} column has uniques : {series.unique()}")
    print(rule)
    print(f"{label} column each unique repeated by:")
    print(series.value_counts())
    print(rule)

## 2) Data Cleaning

* NaNs check and Impute

* Duplicate check and remove

* Outliers check and handle

* Data Inconsistency

In [None]:
# Percentage of missing values in each column.
df.isnull().mean() * 100


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing alert levels with the most frequent level.
imputer = SimpleImputer(strategy='most_frequent')
# fit_transform returns an (n_rows, 1) array; flatten it so a 1-D column is assigned.
df['alert'] = imputer.fit_transform(df[['alert']]).ravel()

# Last expression of the cell, so the post-imputation null percentages are actually displayed.
df.isnull().mean() * 100

In [None]:
# Fill missing locations with the most frequent location, reusing the `imputer` object.
# NOTE(review): replacing an unknown place with the modal place may be misleading; confirm this is intended.
df['location'] = imputer.fit_transform(df[['location']]).ravel()

# Remove the two columns. errors='ignore' keeps the cell re-runnable once they are gone.
df = df.drop(columns=['country', 'continent'], errors='ignore')

# Remaining percentage of missing values per column.
df.isnull().mean() * 100

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

##  EDA

In [None]:
# Derive calendar features from the parsed timestamp, then discard the raw column.
timestamps = df['date_time']
df['Year'] = timestamps.dt.year
df['Month_Name'] = timestamps.dt.month_name()
df['Week'] = timestamps.dt.isocalendar().week
df['Day_Name'] = timestamps.dt.day_name()
# Hour is stored as float.
df['Hour'] = timestamps.dt.hour.astype(float)
df.drop(columns=['date_time'], inplace=True)

# `dff` is a second name bound to the same DataFrame object (no copy is made).
dff = df
dff.info()
dff.isnull().mean() * 100


In [None]:
def format_hours(row):
    """Label a row's hour of day as 'AM' or 'PM'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an 'Hour' entry holding the hour of day.

    Returns
    -------
    str
        'AM' when 0 <= Hour < 12, otherwise 'PM'. Values that fail the
        comparison (for example NaN) also fall through to 'PM'.
    """
    # A numeric comparison, unlike membership in range(0, 12), also
    # classifies fractional hours such as 11.5 as morning.
    if 0 <= row['Hour'] < 12:
        return 'AM'
    return 'PM'
    
# Coerce Hour to a numeric dtype; entries that cannot be parsed become NaN (errors='coerce').
dff['Hour'] = pd.to_numeric(df['Hour'], errors='coerce')

# Apply format_hours to every row and store the label in a new 'AM_BM' column.
dff['AM_BM'] = df.apply(format_hours, axis=1)
dff

In [None]:
def format_tsunami(row):
    """Translate a row's tsunami flag into a text label.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'tsunami' entry.

    Returns
    -------
    str
        'residence' when the flag equals 0, 'oceanic' for any other value.
        A value that is already one of the two labels is returned unchanged.
    """
    value = row['tsunami']
    # Pass existing labels through so that re-applying the function (for example
    # re-running the notebook cell) does not turn every row into 'oceanic'.
    if isinstance(value, str) and value in ('residence', 'oceanic'):
        return value
    return 'residence' if value == 0 else 'oceanic'
# Overwrite the tsunami column with the text labels returned by format_tsunami (applied row by row).
dff['tsunami'] = df.apply(format_tsunami, axis=1)

dff 

In [None]:
# Ten rows with the highest `sig`, showing where each event occurred.
top_sig_columns = ['title', 'magnitude', 'location', 'sig']

dff['sig'].max()
dff.sort_values(by='sig', ascending=False).loc[:, top_sig_columns].head(10)


In [None]:
# Top 10 earthquakes of the most recent years, and where they occurred.
# Rows are ordered by year first, then by magnitude and sig, so that within a year
# the strongest and most significant events come first (sorting by Year alone left
# the order inside a year arbitrary).
(
    dff.sort_values(by=['Year', 'magnitude', 'sig'], ascending=False)
    .loc[:, ['title', 'sig', 'magnitude', 'location', 'Year']]
    .head(10)
)

In [None]:
# Most frequent (title, location) pairs: the ten largest groups by row count.
pair_counts = dff.groupby(['title', 'location'])[['Year']].size()
pair_counts.sort_values(ascending=False).reset_index().head(10)



In [None]:
# Where and when were tsunamis recorded, and how many per location?
# Only rows labelled 'oceanic' (non-zero tsunami flag) are counted; counting the
# column on the whole frame would count every earthquake, not tsunamis.
tsunami_events = dff[dff['tsunami'] == 'oceanic']
(
    tsunami_events
    .groupby(['Month_Name', 'location', 'Year'])[['tsunami']]
    .count()
    .sort_values(by='tsunami', ascending=False)
    .head(10)
)


## Univariate Analysis

Visualization for categorical columns

In [None]:
# For each object-dtype column, show a box plot followed by a bar chart.
categorical = dff.select_dtypes(include='O').columns

for feature in categorical:
    box_fig = px.box(dff, x=feature)
    bar_fig = px.bar(dff, x=feature)
    box_fig.show()
    bar_fig.show()
    

Visualization for numerical columns

In [None]:
# For each non-object column, show a box plot followed by a histogram.
numerical = dff.select_dtypes(exclude='object').columns

for feature in numerical:
    box_fig = px.box(dff, x=feature)
    hist_fig = px.histogram(dff, x=feature)
    box_fig.show()
    hist_fig.show()


## Bivariate Analysis

In [None]:
# The most common alert and its impact on the event.
# Tsunami labels: 'residence' = flag 0, 'oceanic' = any other flag value.
# The title now names the plotted columns (alert vs sig); it previously said "cdi vs mmi".
figure = px.scatter(
    dff,
    x='alert',
    y='sig',
    color='tsunami',
    title='Alert level vs significance (sig), coloured by tsunami label',
)
figure = figure.update_layout(width=600, height=450)
figure



In [None]:
# Which months see the most seismic activity?
# All months are kept: truncating to the top 10 would drop two months and
# inflate every remaining slice of the pie.
mon_highest = (
    dff.groupby('Month_Name')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='earthquakes')
)
px.pie(mon_highest, names='Month_Name', values='earthquakes', color='Month_Name')


In [None]:
# Is there a change in earthquake intensity over time, and does the tsunami label matter?
# Sum `sig` per (Year, tsunami) and order by Year so each line is drawn left to right in time.
df_time = (
    dff.groupby(['Year', 'tsunami'])['sig']
    .sum()
    .reset_index()
    .sort_values(by=['tsunami', 'Year'])
)

# Time belongs on the x axis; the summed significance is the measured quantity.
fig_1 = px.line(
    df_time,
    x='Year',
    y='sig',
    color='tsunami',
    title='Total significance (sig) per year by tsunami label (1995-2023)',
)
fig_1

In [None]:
# Row count per (net, tsunami) pair, largest first; the count lands in the 'sig' column.
occu_avg = (
    dff.groupby(['net', 'tsunami'])['sig']
    .size()
    .sort_values(ascending=False)
    .reset_index()
)
px.bar(occu_avg, x='net', y='sig', color='tsunami')


In [None]:
# Row count per (location, nst) pair, keeping the 500 largest groups;
# the count is stored in the 'sig' column.
impact = (
    dff.groupby(['location', 'nst'])['sig']
    .size()
    .sort_values(ascending=False)
    .reset_index()
    .head(500)
)

px.bar(impact, x='location', y='sig', color='nst')



In [None]:
# Mean `sig` for every (magtype, tsunami) pair, highest mean first.
algorithms = (
    dff.groupby(['magtype', 'tsunami'])['sig']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

px.bar(algorithms, x='magtype', y='sig', color='tsunami')



In [None]:
# Number of non-null titles per (location, magnitude) pair, keeping the 100 largest groups.
earthquakes = (
    dff.groupby(['location', 'magnitude'])['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(100)
)
px.bar(earthquakes, x='location', y='title', color='magnitude')



In [None]:
# Number of non-null `sig` values per (alert, mmi) pair, largest first.
affected = (
    dff.groupby(['alert', 'mmi'])['sig']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

px.bar(affected, x='alert', y='sig', color='mmi')

In [None]:
# Correlation heatmap across the numeric columns.
# Only numeric columns are passed to corr(): the frame also holds string columns
# (title, location, the relabelled tsunami, ...), and pandas >= 2.0 raises on them
# instead of silently skipping them.
px.imshow(dff.select_dtypes(include='number').corr(), text_auto=True, width=1000, height=900)


## Streamlit


In [None]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
dff.to_csv('cleaned_dff.csv', index = False)

In [None]:
%%writefile earthquake_1995.py

import plotly.express as px
import pandas as pd
import streamlit as st

# Load the frame exported by the notebook and pre-compute the tables used by the charts.
dff = pd.read_csv('cleaned_dff.csv')

# Rows per month (the count is stored in the 'Year' column). Every month is kept so
# the pie chart's shares add up over the whole year.
mon_highest = (
    dff.groupby('Month_Name')['Year']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Summed sig per (Year, tsunami).
df_time = dff.groupby(['Year', 'tsunami'])['sig'].sum().reset_index().sort_values(by='sig')

# Row count per (net, tsunami); the count is stored in the 'sig' column.
occu_avg = dff.groupby(['net', 'tsunami'])['sig'].size().sort_values(ascending=False).reset_index()

# Row count per (location, nst), 500 largest groups.
impact = dff.groupby(['location', 'nst'])['sig'].size().sort_values(ascending=False).reset_index().head(500)

# Mean sig per (magtype, tsunami).
algorithms = dff.groupby(['magtype', 'tsunami'])['sig'].mean().sort_values(ascending=False).reset_index()

# Title count per (location, magnitude), 100 largest groups.
earthquakes_avg = (
    dff.groupby(['location', 'magnitude'])['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(100)
)

# Non-null sig count per (alert, mmi).
affected = dff.groupby(['alert', 'mmi'])['sig'].count().sort_values(ascending=False).reset_index()



st.title('earthquake from 1995 to 2023')

# Sidebar navigation. Exactly one page is rendered per script run: the page functions
# are dispatched through a single mapping, so choosing 'About' no longer falls through
# to the Multivariate page as it did with the separate if / if-elif-else chains.
page = st.sidebar.radio('Select page', ['About', 'Univariate Analysis', 'Bivariate Analysis', 'Multivariate Analysis'])


def show_about():
    """Render the About page: a heading followed by the cleaned dataset."""
    st.title('About Earthquake Project')
    st.write(dff)


def show_univariate():
    """Render one histogram per column, split into numerical and categorical tabs."""
    tab1, tab2 = st.tabs(['Numerical Features', 'Categorical Features'])

    # Numerical features: every column whose dtype is not object.
    for col in dff.select_dtypes(exclude='object').columns:
        tab1.plotly_chart(px.histogram(dff, x=col))

    # Categorical features: every object-dtype column.
    for col in dff.select_dtypes(include='object').columns:
        tab2.plotly_chart(px.histogram(dff, x=col))


def show_bivariate():
    """Plot a user-selected feature against the event title with a user-selected chart type."""
    st.write('# Numerical features VS Target Variable')

    select_col = st.selectbox('Select Feature', ['magnitude', 'mmi', 'tsunami', 'sig', 'nst',
                                                 'latitude', 'longitude', 'Year'])
    select_plot = st.selectbox('Select Plot Type', ['Box plot', 'Violin Plot', 'Bar Chart Plot'])

    # Each menu entry maps to the plotly express function that draws it.
    plotters = {'Box plot': px.box, 'Violin Plot': px.violin, 'Bar Chart Plot': px.bar}
    st.plotly_chart(plotters[select_plot](dff, x='title', y=select_col))


def show_multivariate():
    """Render the fixed set of question-driven charts built from the pre-computed tables."""
    st.header('1- The most common alert and its impact on the event?')
    # The chart title names the plotted columns (alert vs sig).
    st.plotly_chart(px.scatter(dff, x='alert', y='sig', color='tsunami',
                               title='Alert level vs significance (sig), coloured by tsunami label'))

    st.header('Which months witness the highest seismic activity during the year?')
    st.plotly_chart(px.pie(mon_highest, names='Month_Name', values='Year', color='Month_Name'))

    st.header('Is there a change in the intensity of earthquakes over time? Is there any effect of a tsunami?')
    # Time goes on the x axis; rows are ordered by Year so each line is drawn chronologically.
    st.plotly_chart(px.line(df_time.sort_values(by=['tsunami', 'Year']), x='Year', y='sig', color='tsunami',
                            title='Total significance (sig) per year by tsunami label (1995-2023)'))

    st.header(' What is the most media coverage of the event?')
    st.plotly_chart(px.bar(occu_avg, x='net', y='sig', color='tsunami'))

    st.header('What is the impact of the presence of stations inside residential areas?')
    st.plotly_chart(px.bar(impact, x='location', y='sig', color='nst'))

    st.header('Average impact of algorithms on event importance')
    st.plotly_chart(px.bar(algorithms, x='magtype', y='sig', color='tsunami'))

    st.header('How many earthquakes have occurred in certain places and what is the magnitude of the earthquake’s intensity?')
    st.plotly_chart(px.bar(earthquakes_avg, x='location', y='title', color='magnitude'))

    st.header('What is the alert rate affected by the event?')
    st.plotly_chart(px.bar(affected, x='alert', y='sig', color='mmi'))


# One renderer per sidebar option; the keys match the radio labels exactly.
PAGES = {
    'About': show_about,
    'Univariate Analysis': show_univariate,
    'Bivariate Analysis': show_bivariate,
    'Multivariate Analysis': show_multivariate,
}

if __name__ == '__main__':
    PAGES[page]()

In [None]:
# Launch the dashboard script written by the %%writefile cell.
# NOTE(review): `streamlit run` starts a server process; this cell presumably stays busy until it is interrupted.
! streamlit run earthquake_1995.py

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
