In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math, re
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import folium

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Dataset description:

    

In [None]:
# Read the incident CSV and, in the same chained expression, replace the
# 'Started' column with its values parsed into pandas datetimes.
wf_input = (
    pd.read_csv("/kaggle/input/california-wildfire-incidents-20132020/California_Fire_Incidents.csv")
    .assign(Started=lambda frame: pd.to_datetime(frame['Started'].astype(str)))
)

California has some of the deadliest and most destructive wildfire seasons. The dataset lists the wildfires that occurred in California between 2013 and 2019. For each wildfire it records the location (county name, latitude and longitude) as well as details of when the fire started.

This data helps generate insights into which locations in California are under fire threat, when wildfires usually occur, and how frequent and devastating they are.

In [None]:
# Preview the first few rows of the loaded incidents table.
wf_input.head()

In [None]:
# List the column names available in the loaded incidents table.
wf_input.columns

## Data preprocessing

In [None]:
# Some records may be counted in different counties multiple times, so keep only
# the first row for each combination of name, start time, acreage and structure
# damage figures.
# ignore_index=True renumbers the surviving rows 0..n-1 directly, which replaces
# the former reset_index() + drop(columns=['index']) round trip.
wf = wf_input.drop_duplicates(
    subset=['Name', 'Started', 'AcresBurned', 'StructuresDamaged', 'StructuresDestroyed'],
    keep='first',
    ignore_index=True,
)

## 1. Trend of wildfire in California from 2013 to 2019

In [None]:
# Calendar month in which each fire started, taken with the vectorised .dt
# accessor instead of a Python-level loop over the column.
wf['StartedMonth'] = wf['Started'].dt.month

# Number of fires per (archive year, start month).
# NOTE(review): count() only counts rows whose AcresBurned is not missing, so an
# incident without a recorded acreage is left out of WildfireCount. This is kept
# so the reported figures do not change; use .size() instead if every incident
# should be counted.
monthly_count = (
    wf.groupby(["ArchiveYear", "StartedMonth"])['AcresBurned']
    .count()
    .reset_index()
    .rename(columns={"AcresBurned": "WildfireCount"})
)
monthly_count

In [None]:
# One line per archive year, showing how many fires started in each calendar month.
# The title previously read "Widefire"; corrected to "Wildfire".
fig = px.line(
    monthly_count,
    x="StartedMonth",
    y="WildfireCount",
    color="ArchiveYear",
    height=600,
    title='Wildfire Count in Each Month, 2013-2019',
)
fig.show()

**Observation**:  

The line plot shows that California's wildfires usually occur in the summer (June to August). It might be caused by the dryness and high temperature in summer.  
In 2017, 111 wildfires occurred in July, and this is the highest monthly count from 2013 to 2019.

## 2. Trend of wildfire damage in California from 2013 to 2019

In [None]:
# Columns whose yearly totals are reported. MajorIncident is summed as well, so
# its total is the number of rows flagged as a major incident in that year
# (assuming it is a boolean/0-1 flag -- TODO confirm against the data).
damage_columns = [
    'AcresBurned',
    'MajorIncident',
    'Injuries',
    'StructuresDamaged',
    'StructuresDestroyed',
    'StructuresThreatened',
]

# Select the columns *before* aggregating. Calling .sum() on the whole grouped
# frame also tries to sum the text and datetime columns; pandas >= 2.0 no longer
# drops such columns silently and raises instead, and the extra work is wasted
# in any case.
yearly_wf = wf.groupby("ArchiveYear")[damage_columns].sum()
yearly_wf

**Observation**:  

The wildfires archived in 2018 burned the most acres of land and destroyed the most structures, which suggests that those fires burned closer to populated areas.  
In 2013 and 2014, there were more injuries than in other years, and 2017 had the highest count of major incidents.

## 3. Spatial distribution of wildfire in California from 2013 to 2019

In [None]:
# Area of one acre in square metres, used to turn burned acreage into a circle radius.
SQ_METERS_PER_ACRE = 4046.86

# Base map centred on California.
m = folium.Map(location=[37.160317,-120.621407], tiles="Stamen Terrain", zoom_start=6)

# Iterate over the needed columns in lockstep rather than doing five .loc
# lookups per row; this also removes the dependence on a 0..n-1 row index.
fire_rows = zip(wf['Latitude'], wf['Longitude'], wf['AcresBurned'], wf['Name'], wf['ArchiveYear'])
for lat, lon, acres, name, year in fire_rows:
    # A missing acreage would produce a NaN radius (math.sqrt(nan) is nan) and a
    # missing coordinate cannot be placed on the map, so such rows are skipped.
    if pd.isna(acres) or pd.isna(lat) or pd.isna(lon):
        continue
    folium.Circle(
        location=[lat, lon],
        # Radius (in metres) of a circle whose area equals the burned area:
        # area = pi * r**2  =>  r = sqrt(area / pi).
        radius=math.sqrt(float(acres) * SQ_METERS_PER_ACRE / math.pi),
        popup=str(name)+', '+str(year),
        color="crimson",
        fill=True,
        fill_color="crimson",
    ).add_to(m)

# Add a centred HTML heading above the rendered map.
title_html = '''
             <h3 align="center" style="font-size:20px"><b>Spatial distribution of All Recorded Wildfire</b></h3>
             '''
m.get_root().html.add_child(folium.Element(title_html))

m

**Observation**:   
The wildfires occur in the mountains or other areas covered by vegetation, and most of them are small fires.

## 4. Wordcloud of wildfire statement from the fire department

In [None]:
def extract_words(text):
    """Split a condition statement into lower-case words.

    Cleaning steps, in order:
      * a statement that is just the string 'nan' (what a missing value becomes
        after ``astype(str)``) yields no words;
      * ``<a ...>...</a>`` links are removed together with their link text;
      * ``<p>`` / ``</p>`` tags are removed;
      * the text is split on any run of whitespace (spaces, ``\\r``, ``\\n``);
      * trailing '.', ',' and ':' characters are stripped from each word;
      * empty tokens are dropped.

    Parameters
    ----------
    text : str
        Raw statement text, possibly containing HTML fragments.

    Returns
    -------
    list of str
        The cleaned, lower-cased words in their original order.
    """
    # Only treat 'nan' as "missing" when it is the whole statement; removing the
    # substring everywhere would corrupt words such as "maintenance".
    if text.strip() == 'nan':
        return []

    # Non-greedy match so that two separate links do not swallow the text
    # between them; DOTALL lets a link span line breaks.
    text = re.sub(r'<a\b.*?</a>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    # Replace paragraph tags with a space so neighbouring words stay separated.
    text = re.sub(r'</?p\b[^>]*>', ' ', text, flags=re.IGNORECASE)

    words = []
    # split() with no argument splits on every whitespace run, so line breaks
    # and repeated spaces separate words instead of gluing them together.
    for token in text.split():
        token = token.rstrip('.,:').lower()
        if token:
            words.append(token)

    return words

In [None]:
# Tokenise every condition statement (missing statements become the string 'nan'
# through astype(str) before being passed to extract_words).
words = wf['ConditionStatement'].astype(str).apply(extract_words)

# Flatten the per-statement word lists into one list and tally the words.
word_list = [word for row in words for word in row]
word_count = Counter(word_list)

# Create stopword list: the wordcloud defaults plus a few extra words that are
# excluded from the tally here.
stopwords = set(STOPWORDS)
stopwords.update(["fire", "will", "continue", "area", "firefighters"])

for key in stopwords:
    word_count.pop(key, None)
# Drop the empty token if the tokenizer produced one.
word_count.pop('', None)

# A fixed random_state makes the word placement reproducible between runs.
wc = WordCloud(
    background_color="white",
    max_words=500,
    relative_scaling=0.5,
    normalize_plurals=False,
    random_state=42,
).generate_from_frequencies(word_count)

plt.figure(figsize=(16, 12))
plt.imshow(wc)
plt.axis('off')
plt.show()

**Observation**:  

The word cloud is generated using the condition statement of the wildfire report. The top 3 most mentioned words are: "containment", "crews", and "lines".