## Data Retrieval and first looks
The data retrieved is from a kaggle dataset and contains meteorlogical data from Portugal fire service

## Import the necessary packages


In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
df = pd.read_csv('../data/forestfires.csv')

In [8]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


## Data Understanding 
Column Names and meaning

1. X: x-axis spatial coordinate with the Montesinho park: 1 to 9
2. Y: y- axis spatial coordinate within the Montesinho park map : 2 to 9
3. month - month of the year: 'jan' to 'dec'
4. day - day of the week : 'mon' to 'sun'
5. FFMC - FFMC index from the FWI system : 18.7 - 96.20
6. DMC - DMC index from the FWI system: 1.1 to 291.3 
7. DC - DC index from the FWI system: 7.9 to 860.6
8. ISI - ISI indes from the FWI system: 0.0 to 56.10
9. temp- temperature in Celsius degrees: 2.2 to 33.30
10. RH- relative humidity in %: 15.0 to 100
11. wind- wind speed in km/h: 0.40 to 9.40
12. rain - outside rain in mm/m^2: 0.0 to 6.4
13. area- the burned area of the forest in hecta-acres (ha): 0.00 to 1090.84 
(this output variable is very skewed towars 0.0, will likely need to do a logarithmic transformation

## Fun little Data Engineering moves
Before we run any models or get any statistics need to convert months and days into numerical data.

In [9]:
df.month.replace(('jan','feb','mar','apr','may','jun', 'jul', 'aug', 'sep','oct','nov','dec'), #convert months to integers
                 (1,2,3,4,5,6,7,8,9,10,11,12),
                 inplace = True )
df.day.replace(('sun','mon','tue','wed','thurs','fri','sat'), 
               (1,2,3,4,5,6,7),
              inplace = True)    #convert days into integers


In [10]:
##check work
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,6,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,3,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,7,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,1,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


## Check for missing Values



In [11]:
df.isna().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

In [13]:
df.duplicated().sum()

4

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    int64  
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(4), object(1)
memory usage: 52.6+ KB
