# Libraries

In [None]:
# numerical analysis
import numpy as np
# storing and processing in dataframes
import pandas as pd

# basic plotting
import matplotlib.pyplot as plt
# advanced plotting
import seaborn as sns

# Data

In [None]:
# load the forest-fire records; the CSV is decoded as Latin-1
df = pd.read_csv(
    "../input/forest-fires-in-brazil/amazon.csv",
    encoding="latin1",
)

# preview 5 randomly chosen rows (a different draw on every run)
df.sample(n=5)

### Columns

In [None]:
# column labels of the dataframe (returned as a pandas Index)
df.columns

> * **year** - Year when fire happened  
> * **state** - Brazilian State   
> * **month** - Month when fire happened  
> * **number** - Number of fires reported    
> * **date** - Date when fires were reported  

### Dataset Properties

In [None]:
# dataframe dimensions as a (rows, columns) tuple
df.shape

In [None]:
# concise summary of the dataframe: per-column dtype and non-null count,
# plus overall memory usage (printed, not returned)
df.info()

In [None]:
# descriptive statistics; include='all' also covers non-numeric columns
# (statistics that do not apply to a column's dtype show up as NaN)
df.describe(include='all')

### Missing and Unique values

In [None]:
# number of missing (null) values in each column
df.isnull().sum()

In [None]:
# number of distinct values in each column (nulls are not counted by default)
df.nunique()

In [None]:
# distinct values of 'year', in order of first appearance
# (alternative: df['year'].value_counts() to also see the row count per year)
df['year'].unique()

In [None]:
# distinct values of 'month', in order of first appearance
# (alternative: df['month'].value_counts() to also see the row count per month)
df['month'].unique()

In [None]:
# distinct values of 'state', in order of first appearance
# (alternative: df['state'].value_counts() to also see the row count per state)
df['state'].unique()

# Data Cleaning

In [None]:
# Portuguese -> English month names, listed in calendar order
mn_name = {
    "Janeiro": "January",
    "Fevereiro": "February",
    "Março": "March",
    "Abril": "April",
    "Maio": "May",
    "Junho": "June",
    "Julho": "July",
    "Agosto": "August",
    "Setembro": "September",
    "Outubro": "October",
    "Novembro": "November",
    "Dezembro": "December",
}

# Portuguese month name -> month number (1-12), derived from the calendar
# ordering of mn_name (dicts keep insertion order)
mn = {pt_name: month_index for month_index, pt_name in enumerate(mn_name, start=1)}

# numeric month column; computed first, because 'month' is overwritten below
df['month_no'] = df['month'].map(mn)

# replace the Portuguese month names with their English equivalents
# (any value missing from the mapping becomes NaN)
df['month'] = df['month'].map(mn_name)

# first few rows
df.head()

In [None]:
# keep the original 'date' values under a new name so 'date' can be rebuilt
df = df.rename({'date': 'date_reported'}, axis='columns')

# build a proper datetime from year + month number, pinned to the 1st of the
# month: the integer YYYYMMDD (e.g. 1998 * 10000 + 1 * 100 + 1 = 19980101)
# is parsed with the matching format string
df['date'] = pd.to_datetime(
    df['year'] * 10000 + df['month_no'] * 100 + 1,
    format='%Y%m%d',
)

# rearrange columns
df = df[[
    'state',
    'month',
    'month_no',
    'year',
    'date',
    'number',
    'date_reported',
]]

In [None]:
# attach latitude / longitude to every row, keyed by state
# =======================================================

# state names, taken from a per-state aggregation; groupby returns the group
# keys in sorted order by default, and the coordinate lists below are paired
# with that order purely by position
state_wise = df.groupby('state')['number'].sum().reset_index()
states = state_wise['state'].tolist()

# NOTE(review): these lists must line up one-to-one with `states`; zip() stops
# at the shorter input, so a length mismatch would silently leave some states
# without coordinates - confirm the order and count against the data
lat = [
    -9.0238, -9.5713, -0.9020, -3.4168,
    -12.5797, -5.4984, -15.7998, -19.1834,
    -15.8270, -4.9609, -12.6819, -18.5122,
    -7.2400, -1.9981, -8.8137, -21.5072,
    -22.9068, -11.5057, -2.7376, -27.2423,
    -23.5505, -10.5741, -10.1753,
]
# NOTE(review): the value -36.7820 appears twice (2nd and 13th entries) -
# confirm this is intended and not a copy/paste slip
lon = [
    -70.8120, -36.7820, -52.0030, -65.8561,
    -41.7007, -39.3206, -47.8645, -40.3089,
    -49.8362, -45.2744, -56.9211, -44.5550,
    -36.7820, -54.9306, -36.9541, -43.3208,
    -43.1729, -63.5806, -62.0751, -50.2189,
    -46.6333, -37.3857, -48.2982,
]

# state -> coordinate lookup tables
latitude = dict(zip(states, lat))
longitude = dict(zip(states, lon))

# add columns to the dataframe
df['Latitude'] = df['state'].map(latitude)
df['Longitude'] = df['state'].map(longitude)

# first few rows
df.head()

# Visual EDA

In [None]:
# one point per row: reported fire count against month, on a wide canvas
sns.catplot(x='month', y='number', data=df, aspect=2.5)
plt.gca().set_title('No. of fires in each month')
plt.show()

In [None]:
# total reported fires per state, largest first
state_wise = (
    df.groupby('state')['number']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# horizontal bar chart, one bar per state
plt.figure(figsize=(7, 8))
sns.barplot(x='number', y='state', data=state_wise, orient='h', palette='Dark2')
plt.show()

In [None]:
def group_by_col(col, data=None):
    """Return the total of 'number' per distinct value of ``col``, largest first.

    NOTE(review): the original definition had no body (a SyntaxError). This
    implementation mirrors the state-wise aggregation used for the bar plot
    in this notebook, generalised to any column - confirm this is the
    intended behaviour.

    Parameters
    ----------
    col : str
        Name of the column to group by (e.g. 'state', 'year', 'month').
    data : pandas.DataFrame, optional
        Frame to aggregate; must contain ``col`` and a 'number' column.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``col`` and 'number', sorted by 'number' descending.
    """
    if data is None:
        # fall back to the notebook's global dataframe
        data = df
    totals = data.groupby(col)['number'].sum()
    return totals.sort_values(ascending=False).reset_index()

In [None]:
# df_pivot = df[['year', 'month', 'number']].pivot_table(values='number', index='year', columns='month', aggfunc='sum')
# df_pivot.columns = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# plt.figure(figsize=(20, 8))
# sns.heatmap(df_pivot, annot=True, fmt='.0f', cmap='Reds')
# plt.suptitle('Total fires')
# plt.xlabel('')
# plt.ylabel('')
# plt.show()

In [None]:
# df_pivot = df[['state', 'year', 'number']].pivot_table(values='number', index='state', columns='year', aggfunc='sum')
# plt.figure(figsize=(20, 8))
# sns.heatmap(df_pivot, annot=True, fmt='.0f', cmap='Reds')
# plt.suptitle('Total fires')
# plt.xlabel('')
# plt.ylabel('')
# plt.show()

In [None]:
# fig = px.scatter_geo(data_frame = state_wise, 
#                      scope='south america',
#                      hover_name='state',
#                      lon='longitude',
#                      lat='latitude',
#                      size='number',
#                      color = 'number',
#                      color_continuous_scale=px.colors.sequential.Peach)
# fig.update_layout(title_text = 'Reported Fires')
# fig.show()

In [None]:
# plt.figure(figsize=(7, 14))
# plt.barh(df['state'], df['number'])
# plt.xlabel('State')
# plt.ylabel('Number of fire')
# plt.show()

In [None]:
# sns.set_style('whitegrid')

# plt.figure(figsize=(15,5))
# ax = sns.lineplot(x ='year',y = 'number', data=df, estimator ='sum', err_style=None)
# plt.xlabel('Year')
# plt.ylabel('Number of Fires')
# ax.xaxis.set_major_locator(plt.MaxNLocator(19))
# ax.set_xlim(1998, 2017)