# Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import Data

In [None]:
# Load the disaster-events dataset from a CSV file hosted on GitHub
df = pd.read_csv('https://raw.githubusercontent.com/dataenergy/Exploratory-data-analysis-of-climate-change-and-natural-disasters/master/datasets/number-of-natural-disaster-events.csv')
# head() shows the first rows so the structure of the data can be checked
df.head()

In [None]:
# Example of selecting a column when its full original name has to be spelled out
df[['Number of reported natural disasters (reported disasters)']]

In [None]:
# type() reports what kind of object a value is; here the value is df[['Entity']]
type(df[['Entity']])

In [None]:
# rename() changes column names.
# Syntax: <dataframe>.rename(columns={<old name>: <new name>}, inplace = True)
df.rename(columns={'Number of reported natural disasters (reported disasters)': 'Disaster Count'}, inplace = True)
df.head()

# Measure of Central Tendency

## Calculate Mean

In [None]:
# mean() computes the average; numeric_only=True restricts it to the numeric columns

df.mean(axis=0,numeric_only=True)

In [None]:
# Apply mean() to one column only; single brackets select the column as a Series.
# A Series has a single axis and a single dtype, so axis/numeric_only add nothing
# here, and some older pandas releases raise NotImplementedError when
# numeric_only is passed to a Series reduction.
my_var = df['Disaster Count'].mean()
my_var

In [None]:
# The mean stored in my_var can be reused in later arithmetic
times_two = 2*my_var
times_two

In [None]:
# Apply mean() to one column only; double brackets select the column as a DataFrame
df[['Disaster Count']].mean(axis=0, numeric_only = True)

## Calculate Median

In [None]:
# median() computes the middle value of the data
df.median(axis=0, numeric_only = True)

In [None]:
# Apply median() to one column only; single brackets select the column as a Series.
# A Series has a single axis and a single dtype, so axis/numeric_only add nothing
# here, and some older pandas releases raise NotImplementedError when
# numeric_only is passed to a Series reduction.
df['Disaster Count'].median()

## Calculate Mode

In [None]:
# mode() finds the value(s) that occur most often; axis='index' evaluates each column
df.mode(axis = 'index', numeric_only = True)

In [None]:
# axis='columns' takes the mode across the numeric values of each row instead
df.mode(axis = 'columns', numeric_only = True)

# Measure of Spread

## Calculate Range

In [None]:
# There is no built-in function for the range, so compute it by hand:
# the largest value (max) minus the smallest value (min)
disaster_counts = df['Disaster Count']
largest, smallest = disaster_counts.max(), disaster_counts.min()
df_range = largest - smallest
print(df_range)

In [None]:
# `def` introduces a new reusable function
def calculate_range(column):
    """Return the range (max - min) of a numeric column, or None otherwise.

    Args:
        column: A pandas Series, e.g. one column or one row handed over by
            ``DataFrame.apply``.

    Returns:
        ``column.max() - column.min()`` when the Series has a numeric,
        non-boolean dtype; ``None`` for every other dtype.
    """
    # Comparing ``column.dtype`` with the builtins ``int``/``float`` only
    # matches the platform-default integer/float dtypes, so int32, float32 or
    # nullable numeric columns were silently skipped. The pandas dtype helpers
    # recognise every numeric width.
    is_numeric = pd.api.types.is_numeric_dtype(column)
    # pandas counts booleans as numeric, but they cannot be subtracted.
    is_boolean = pd.api.types.is_bool_dtype(column)
    if is_numeric and not is_boolean:
        return column.max() - column.min()
    # Non-numeric data has no meaningful range.
    return None

# Compute the range of every column with the calculate_range helper
df.apply(calculate_range, axis=0)

In [None]:
# axis=1 hands each row (instead of each column) to calculate_range
# NOTE(review): a row mixing text and numbers presumably has object dtype,
# so this likely yields None for every row — confirm this is the intended demo
df.apply(calculate_range, axis = 1)

## Calculate Quartile

In [None]:
# quantile() can be used to find quartiles, deciles, the median, and so on
# The list [0.25, 0.5, 0.75] asks for the quartiles
# The list [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] asks for the deciles
df['Disaster Count'].quantile([0.25, 0.5, 0.75])

In [None]:
# Quartiles of 'Disaster Count' (the 0.25, 0.5 and 0.75 quantiles)
df['Disaster Count'].quantile([0.25, 0.5, 0.75])

In [None]:
# The 0.5 quantile is the median
df['Disaster Count'].quantile([0.5])

In [None]:
# Deciles: the 0.1 through 0.9 quantiles
df['Disaster Count'].quantile([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

## Calculate Standard Deviation

In [None]:
# std() computes the standard deviation; here the column is selected as a DataFrame
df[['Disaster Count']].std()

In [None]:
# Same calculation with the column selected as a Series
df['Disaster Count'].std()

In [None]:
# describe() returns a table of summary statistics for the DataFrame
df.describe()

# Correlation

## Positive Correlation

In [None]:
# df_1 is a second name for the same disaster-count DataFrame (plain assignment, not a copy)
df_1 = df
df_1.head()

In [None]:
# Load the economic-damage dataset from GitHub
df_2 = pd.read_csv('https://raw.githubusercontent.com/dataenergy/Exploratory-data-analysis-of-climate-change-and-natural-disasters/master/datasets/economic-damage-from-natural-disasters.csv')
# Shorten the long damage column name to 'Economic Damage'
df_2.rename(columns={'Total economic damage from natural disasters (US$)': 'Economic Damage'}, inplace = True)
df_2.head()

In [None]:
# Join df_1 and df_2, matching rows that share the same Entity, Code and Year in both DataFrames
merged_df = pd.merge(df_1, df_2, on = ['Entity','Code','Year'])
merged_df.head()

In [None]:
# Pearson correlation between the disaster count and the economic damage
merged_df['Disaster Count'].corr(merged_df['Economic Damage'])

In [None]:
# Draw the scatter plot on an explicit Axes object; the matplotlib
# documentation lists every parameter scatter() accepts
fig, ax = plt.subplots()
ax.scatter(
    merged_df['Disaster Count'],
    merged_df['Economic Damage'],
    color='red',
    alpha=0.1,
    marker='o',
)
ax.set_title('Scatter Plot of Disaster Count vs Economic Damage')
ax.set_xlabel('Disaster Count')
ax.set_ylabel('Economic Damage')
ax.grid(True)

plt.show()

## Negative Correlation

In [None]:
import pandas as pd

# Build the dummy data set: three years of monthly observations
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
years = [2020, 2021, 2022]

data = {
    # Each year repeated once per month
    'Year': [year for year in years for _ in month_names],
    # The twelve month names repeated once per year
    'Month': month_names * len(years),
    'Rain Rate (inches)': [
        3.2, 2.8, 2.5, 1.5, 1.0, 0.5, 0.2, 0.4, 0.6, 1.0, 2.0, 2.8,
        3.5, 3.0, 2.7, 1.8, 1.2, 0.6, 0.3, 0.5, 0.8, 1.2, 2.2, 2.7,
        3.0, 2.6, 2.4, 1.6, 1.1, 0.7, 0.3, 0.6, 0.9, 1.3, 2.4, 2.9,
    ],
    'Ice Cream Sales (Cup)': [
        200, 280, 280, 300, 350, 340, 350, 320, 400, 340, 280, 280,
        220, 230, 280, 400, 350, 300, 320, 300, 410, 330, 250, 200,
        200, 200, 250, 350, 310, 310, 350, 350, 400, 330, 240, 210,
    ],
}

# The pandas methods covered so far do not work on a plain dictionary,
# so the data is converted to a DataFrame first
ice_cream_df = pd.DataFrame(data)

# corr() computes the Pearson correlation between the two columns
rain_rate = ice_cream_df['Rain Rate (inches)']
cup_sales = ice_cream_df['Ice Cream Sales (Cup)']
rain_rate.corr(cup_sales)


In [None]:
# Scatter plot of rain rate against ice cream sales, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(
    ice_cream_df['Rain Rate (inches)'],
    ice_cream_df['Ice Cream Sales (Cup)'],
    alpha=0.7,
)
ax.set_title('Scatter Plot of Rain Rate vs Ice Cream Sales')
ax.set_xlabel('Rain Rate (inches)')
ax.set_ylabel('Ice Cream Sales (Cup)')
ax.grid(True)

plt.show()

In [None]:
# Create a scatter plot of the raw observations
plt.scatter(ice_cream_df['Rain Rate (inches)'], ice_cream_df['Ice Cream Sales (Cup)'], alpha=0.7, label='Data Points')

# Calculate the line of best fit (degree-1 polynomial: slope and intercept)
slope, intercept = np.polyfit(ice_cream_df['Rain Rate (inches)'], ice_cream_df['Ice Cream Sales (Cup)'], 1)
x = np.array(ice_cream_df['Rain Rate (inches)'])
y = slope * x + intercept

# Plot the regression line; its label carries the fitted equation
plt.plot(x, y, color='red', label=f'Regression Line (y = {slope:.2f}x + {intercept:.2f})')

plt.title('Scatter Plot of Rain Rate vs Ice Cream Sales')
plt.xlabel('Rain Rate (inches)')
plt.ylabel('Ice Cream Sales (Cup)')
plt.grid(True)
# Without a legend the label= texts above (including the equation) are never drawn
plt.legend()

plt.show()

In [None]:
import plotly.express as px

# An easy way to get a scatter plot with a regression line is the scatter function from the plotly package
# NOTE(review): the "ols" trendline presumably needs the statsmodels package installed — confirm
fig = px.scatter(ice_cream_df, x='Rain Rate (inches)', y='Ice Cream Sales (Cup)', trendline="ols")
fig.show()

In [None]:
# A correlation matrix also comes from corr(), but the call looks different
# from the single-value case: it is made on a DataFrame of the chosen columns
numeric_columns = ['Rain Rate (inches)', 'Ice Cream Sales (Cup)']
correlation = ice_cream_df[numeric_columns].corr()
print("\nCorrelation Matrix:", correlation, sep="\n")

In [None]:
# Keep only the rows recorded in 2020
ice_cream_2020_df = ice_cream_df.loc[ice_cream_df['Year'] == 2020]
months_2020 = ice_cream_2020_df['Month']

# Figure whose left-hand y-axis carries the rain rate
fig, ax1 = plt.subplots(figsize=(10, 6))
ax1.plot(
    months_2020,
    ice_cream_2020_df['Rain Rate (inches)'],
    color='b',
    marker='o',
    label='Rain Rate',
)
ax1.set_xlabel('Month (2020)')
ax1.set_ylabel('Rain Rate (inches)', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Second y-axis sharing the same x-axis, used for the ice cream sales
ax2 = ax1.twinx()
ax2.plot(
    months_2020,
    ice_cream_2020_df['Ice Cream Sales (Cup)'],
    color='r',
    marker='s',
    label='Ice Cream Sales',
)
ax2.set_ylabel('Ice Cream Sales (Cup)', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Gather the legend entries of both axes so a single legend lists both lines
lines, labels = [], []
for axis in (ax1, ax2):
    axis_lines, axis_labels = axis.get_legend_handles_labels()
    lines.extend(axis_lines)
    labels.extend(axis_labels)
ax1.legend(lines, labels, loc='upper left')

# Title, then render
plt.title('Rain Rate vs. Ice Cream Sales in 2020')
plt.show()
