# Objective (Problem Statement): To predict the area burned in the Forest Fire.

## Import Library

In [None]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split


# Notebook-wide display settings: 4-decimal floats in pandas output,
# 9x5 inch default figures, ggplot look for matplotlib.
pd.set_option('display.float_format', '{:.4f}'.format)
plt.rcParams["figure.figsize"] = [9, 5]
plt.style.use('ggplot')



## Load & Describe Dataset

### Load

In [None]:
# Location of the raw CSV (relative to the notebook's working directory).
DATA_PATH = "../input/forest-fires-data-set/forestfires.csv"
data_df = pd.read_csv(DATA_PATH)

### First Five Rows

In [None]:
# Preview the first five rows of the loaded frame.
data_df.head()

#### Set Target Variable
We need to predict the burned area, so we set the target to `area`.

In [None]:
# Name of the column to predict; used below to separate it from the features
# and to select it for plotting.
target = 'area' 

### Shape

In [None]:
# (number of rows, number of columns)
data_df.shape

### Data types

In [None]:
# Per-column dtype; object columns will need encoding before modelling.
data_df.dtypes

We need to convert month and day to either `int` or `float` from object data type

### Feature Columns

In [None]:
# All column labels (features plus the target).
data_df.columns

### Describe Dataset

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data_df.describe()

<p>As we can see count of every feature columns are same as shape of dataset. So we can say there is no missing data but we need to check to confirm. We will check later.</p>
<p>We can see that for the feature columns ('X', 'Y', 'DMC', 'ISI', 'RH', 'wind', 'rain', 'area') the mean is higher than the 50% value (the median), i.e. the data is right skewed. Of all the columns, 'area' is the most highly skewed.
</p>

#### How to handle right skewed data?
<p>Here the data are right-skewed (clustered at lower values), so we will apply transformations such as square root, cube root, or logarithm. If the data were left-skewed (clustered at higher values), we would instead apply transformations such as square or cube.</p>

## EDA (Exploratory Data Analysis)

### Check missing values

In [None]:
# Percentage of missing values per column; only columns that actually have
# gaps are kept, ordered from worst to best.
missing_pct = (data_df.isnull().sum() / len(data_df)) * 100
missing_pct = missing_pct.drop(missing_pct[missing_pct == 0].index)
missing_pct = missing_pct.sort_values(ascending=False)

# Two-column table: the percentage and the name of the affected column.
data_mis = pd.DataFrame({'Percentage': missing_pct})
data_mis['Id'] = data_mis.index
data_mis = data_mis.reset_index(drop=True)
data_mis.head()

No missing value is found in the dataset.

#### Numerical & Categorical Columns

In [None]:
# Feature frame (everything except the target), split into column-name lists
# by dtype: object columns are categorical, the rest numerical.
dft = data_df.drop(columns=[target])
cate_columns = list(dft.select_dtypes(include=['object']).columns)
nume_columns = list(dft.select_dtypes(exclude=['object']).columns)

In [None]:
# Show which feature columns fell into each group.
for label, columns in (('Categorical columns: ', cate_columns),
                       ('Numerical columns: ', nume_columns)):
    print(label, columns)

### Univariate Check

### Skewness & Kurtosis

In [None]:
# Skewness and kurtosis are only defined for numeric data. The frame still
# contains object columns at this point (see the dtypes cell), and pandas >= 2.0
# raises a TypeError on them instead of silently dropping them, so restrict
# the computation to the numeric columns explicitly.
numeric_df = data_df.select_dtypes(include='number')
print("Skew: \n{}".format(numeric_df.skew()))
print("Kurtosis: \n{}".format(numeric_df.kurtosis()))

<p>Skewness is the degree of distortion from a symmetric (normal) distribution. Right-skewed data, for example, contain a minority of very large values.</p>
<p>Kurtosis is all about the tails of the distribution — not the peakedness or flatness. It is used to describe the extreme values in one versus the other tail. It is actually the measure of outliers present in the distribution . High kurtosis in a data set is an indicator that data has heavy tails or outliers.</p>

<p>If skewness is positive, the data are positively skewed or skewed right, meaning that the right tail of the distribution is longer than the left. If skewness is negative, the data are negatively skewed or skewed left, meaning that the left tail is longer.</p>

<ul>
    <li>If skewness is less than −1 or greater than +1, the distribution is highly skewed.</li>
    <li>If skewness is between −1 and −½ or between +½ and +1, the distribution is moderately skewed.</li>
    <li>If skewness is between −½ and +½, the distribution is approximately symmetric.</li>
</ul>

<ul>
<li>A normal distribution has kurtosis exactly 3 (excess kurtosis exactly 0). Any distribution with kurtosis ≈3 (excess ≈0) is called mesokurtic.</li>
<li>A distribution with kurtosis &lt;3 (excess kurtosis &lt;0 ) is called platykurtic. Compared to a normal distribution, its tails are shorter and thinner, and often its central peak is lower and broader.</li>
<li>A distribution with kurtosis &gt;3 (excess kurtosis >0) is called leptokurtic. Compared to a normal distribution, its tails are longer and fatter, and often its central peak is higher and sharper.</li>
</ul>

<p>
<b> Feature columns:- 'ISI', & 'rain'  have +ve skewness, value more than +1 so, they are right skewed.</b>
</p>    

<p>
<b> Feature columns:- 'FFMC', & 'temp'  have -ve skewness, value less than -1 so, they are left skewed.</b>
</p>    

<p>
<b> Feature columns:- 'FFMC', 'ISI', & 'rain'  have higher kurtosis value. i,e have outliers.</b>
</p>    

Feature columns with (high, +ve, or -ve) outliers, skewness and kurtosis are: 
<ol>
<li>FFMC</li>
<li>ISI</li>
<li>rain</li>
</ol>

In [None]:
# Density of the target. `fill=True` replaces the deprecated `shade=True`
# keyword of seaborn's kdeplot.
fig, ax = plt.subplots(figsize=(15, 5))
sns.kdeplot(data_df[target], fill=True, color='b', ax=ax)
ax.set_xticks(range(0, 1250, 50))
ax.set_title('Distribution of {}'.format(target))
plt.show()

#### target i.e area is right skewed.

In [None]:
# Density of FFMC. `fill=True` replaces the deprecated `shade=True`
# keyword of seaborn's kdeplot.
fig, ax = plt.subplots(figsize=(15, 5))
sns.kdeplot(data_df['FFMC'], fill=True, color='b', ax=ax)
ax.set_xticks(range(0, 100, 5))
ax.set_title('Distribution of FFMC')
plt.show()

#### FFMC is left skewed.

In [None]:
# Density of ISI. `fill=True` replaces the deprecated `shade=True`
# keyword of seaborn's kdeplot.
fig, ax = plt.subplots(figsize=(15, 5))
sns.kdeplot(data_df['ISI'], fill=True, color='b', ax=ax)
ax.set_xticks(range(0, 100, 5))
ax.set_title('Distribution of ISI')
plt.show()

#### ISI is right skewed.

### Outliers

In [None]:
# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
outl_dect = sns.boxplot(x=data_df[target])

In [None]:
# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
outl_dect = sns.boxplot(x=data_df['FFMC'])

In [None]:
# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
outl_dect = sns.boxplot(x=data_df['ISI'])

In [None]:
# Pass the series as `x=` explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how the plot is drawn.
outl_dect = sns.boxplot(x=data_df['rain'])

### Outlier analysis

Outliers are found in the following columns:
<ol>
<li>area</li>
<li>FFMC</li>
<li>ISI</li>
<li>rain</li>
</ol>


<p>Instead of removing them we will transform the data to treat the outliers.</p>

In [None]:
# Columns flagged in the outlier analysis; candidates for a log transform.
outlier_columns = ['area','FFMC','ISI','rain']

In [None]:
# Preview only: skewness after log(1 + x); data_df itself is not modified here.
np.log1p(data_df[outlier_columns]).skew()

In [None]:
# Preview only: kurtosis after log(1 + x); data_df itself is not modified here.
np.log1p(data_df[outlier_columns]).kurtosis()

Even after transformation we still have high skewness and kurtosis in `FFMC` & `rain`

<p>Removing outliers by zscore method.</p>

In [None]:
# Boolean single-column frame: True where FFMC lies within 3 standard
# deviations of its mean (|z-score| < 3).
ffmc_zscores = data_df[['FFMC']].apply(zscore)
mask = ffmc_zscores.abs() < 3

In [None]:
# Keep only the rows that passed the z-score filter.
# - The mask is a one-column frame, so flatten it to a 1-D boolean array
#   instead of indexing with an (n, 1) array.
# - .copy() makes data_df an independent frame, so the column assignments in
#   the following cells do not raise SettingWithCopyWarning.
data_df = data_df.loc[mask.values.ravel()].copy()
data_df.shape

In [None]:
# Rain is 0.0 for most rows, so collapse it to a binary flag:
# 1 when any rain was recorded, 0 otherwise.
data_df['rain'] = (data_df['rain'] > 0.0).astype('int64')


In [None]:
# 'rain' is now a 0/1 flag and must not be log-transformed. Filtering the list
# (instead of list.remove) does not raise ValueError if this cell is re-run.
outlier_columns = [col for col in outlier_columns if col != 'rain']

# log(1 + x) compresses the long right tails. Note that this includes the
# target 'area', so everything downstream (model predictions, RMSE) is on the
# log1p scale; use np.expm1 to get back to the original units.
# NOTE: re-running this cell applies the transform a second time.
data_df[outlier_columns] = np.log1p(data_df[outlier_columns])

In [None]:
# Skewness of the transformed columns.
data_df[outlier_columns].skew()

In [None]:
# Kurtosis of the transformed columns.
data_df[outlier_columns].kurtosis() 

In [None]:
# Summary statistics after outlier removal and transformation.
data_df.describe()

Dataset is ready for model preparation.

In [None]:
# Work on a copy so the encoding below does not alter data_df.
data_sel = data_df.copy()

## Applying xgboost

Encoding `day` & `month` column with label encoder

In [None]:
# Integer-encode the two categorical columns. Each column gets its own fitted
# encoder: refitting a single LabelEncoder would discard the 'day' mapping and
# make it impossible to inverse_transform that column later.
encoders = {}
for col in ('day', 'month'):
    encoders[col] = LabelEncoder()
    data_sel[col] = encoders[col].fit_transform(data_sel[col])

# `le` previously ended up fitted on 'month'; keep that name bound the same way.
le = encoders['month']

In [None]:
# Split features and target by name rather than by position, so the split
# stays correct even if the target is not the last column.
X = data_sel.drop(columns=target)
y = data_sel[target]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=7,
)


The dataset is divided into a 33% test sample and a 67% training sample.

In [None]:
# Gradient-boosted tree regressor with a small learning rate and shallow trees.
# Changes relative to the legacy argument list:
#   - objective 'reg:linear' was renamed to 'reg:squarederror' (same loss);
#   - `silent=True` is replaced by `verbosity=0`;
#   - `nthread` / `seed` are superseded by `n_jobs` / `random_state`;
#   - `missing=None` is dropped so the library default (NaN) applies.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    booster='gbtree',
    base_score=0.3,
    n_estimators=102,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=0.24,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    n_jobs=1,
    random_state=0,
    verbosity=0,
)



In [None]:
# Track the metric on both splits during training; evals_result() reports
# them as 'validation_0' (train) and 'validation_1' (test), in this order.
eval_set = [(X_train, y_train), (X_test, y_test)]

# `eval_metric` is no longer accepted by fit() in recent XGBoost releases.
# RMSE is the default evaluation metric for the squared-error regression
# objective, so it does not need to be passed at all. If the objective is
# ever changed, set the metric on the estimator instead.
xg_reg.fit(X_train, y_train, eval_set=eval_set, verbose=False)
preds = xg_reg.predict(X_test)

#### verbose set to False so that we can hide results of model fit progress

In [None]:
def _score_and_rmse(X, y, model):
    '''Return (model.score(X, y), RMSE) for an already-fitted model.

    RMSE is the square root of the mean squared error between `y` and
    `model.predict(X)`. The first element is whatever `model.score` returns
    (R^2 for scikit-learn style regressors).
    '''
    predictions = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    return model.score(X, y), rmse


def calc_ISE(X_train, y_train, model):
    '''returns the in-sample R^2 and RMSE; assumes model already fit.'''
    return _score_and_rmse(X_train, y_train, model)


def calc_OSE(X_test, y_test, model):
    '''returns the out-of-sample R^2 and RMSE; assumes model already fit.'''
    return _score_and_rmse(X_test, y_test, model)


**Calculate In-Sample and Out-of-Sample R^2 and Error**

In [None]:
is_r2, ise = calc_ISE(X_train, y_train, xg_reg)
os_r2, ose = calc_OSE(X_test, y_test, xg_reg)

# Print in-sample vs out-of-sample R^2 and RMSE, one metric per line.
data_list = (
    ('R^2_in', is_r2),
    ('R^2_out', os_r2),
    ('ISE', ise),
    ('OSE', ose),
)
for label, value in data_list:
    print('{:10}: {}'.format(label, value))

**Clearly the test error (OSE) is close to the training error (ISE), i.e. our model is OK.**

In [None]:
# Ratio of out-of-sample to in-sample RMSE (test divided by train); values
# close to 1 mean the two errors are similar. The label now matches the
# order of the division.
print('test/train: ', ose / ise)

In [None]:
# Test-set RMSE computed directly from the stored predictions.
test_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(test_mse)
print("RMSE: {:f}".format(rmse))


In [None]:
# Draw the first tree of the ensemble. The figure size has to be fixed before
# plotting; setting rcParams afterwards does not resize this figure and leaks
# into every later plot, so an explicitly sized axes is passed in instead.
fig, ax = plt.subplots(figsize=(15, 15))
xgb.plot_tree(xg_reg, num_trees=0, ax=ax)
plt.show()


In [None]:
# Feature importance chart. The figure size has to be fixed before plotting;
# setting rcParams afterwards does not resize this figure and leaks into
# every later plot, so an explicitly sized axes is passed in instead.
fig, ax = plt.subplots(figsize=(7, 7))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()


In [None]:
# retrieve performance metrics
results = xg_reg.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)
# plot RMSE
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
ax.legend()
plt.ylabel('RMSE')
plt.title('XGBoost RMSE')
plt.show()

In [None]:
# Persist the fitted model to the working directory.
# NOTE(review): recent XGBoost releases pick the serialisation format from the
# file extension and expect '.json' or '.ubj' — confirm this name still saves
# as intended on the installed version.
xg_reg.save_model('0001.model_forest_fire')
