In [None]:
import pandas as pd
import numpy as np

# Load the forest-fires CSV. reset_index() turns the row number into an explicit
# 'index' column, which the plotly line charts further down use as their x axis.
fires = pd.read_csv('../input/forest-fires-data-set/forestfires.csv').reset_index()
fires.head()

In [None]:
# Summary statistics of the loaded frame.
fires.describe()

In [None]:
# Pairwise correlations between the six selected columns.
attributes = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH']
corr = fires.loc[:, attributes].corr()
corr

In [None]:
%matplotlib inline
from pandas.plotting import scatter_matrix

# Pairwise scatter-matrix of FFMC, DMC, DC and ISI.
attributes = ['FFMC', 'DMC', 'DC', 'ISI']
scatter_matrix(fires[attributes], figsize=(20, 15))

In [None]:
# Plotting the most correlated attributes (DMC against DC).
fires.plot(kind="scatter", x="DMC", y="DC", alpha=0.4, figsize=(10,8))



In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Rank the candidate predictors with an ExtraTreesRegressor fitted against the
# burned area, then drop every column whose importance is below 1%.
columns = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp',
       'RH', 'wind', 'rain']
X = fires[columns]
Y = fires['area'].values

# The seed is fixed so the importances -- and therefore the set of dropped
# columns -- are the same on every run. Later cells select a hard-coded list of
# columns, so an unstable selection here could break them.
model = ExtraTreesRegressor(n_estimators=100, random_state=42)
model.fit(X, Y)

importances = model.feature_importances_.round(4)
cols_to_drop = [name for name, score in zip(columns, importances) if score < 0.01]
print('Columns to be droped: ',cols_to_drop)

# Columns below the threshold carry too little information about the target
# (the original author observed that this is the 'rain' attribute).
fires = fires.drop(cols_to_drop, axis=1)

In [None]:
# Correlation of every numeric column with the burned area, strongest first.
# The frame still contains 'month' and 'day' at this point (presumably strings,
# since they are ordinal-encoded later); pandas >= 2.0 raises when corr() meets
# non-numeric columns, so restrict the frame to numeric dtypes explicitly.
corr_matrix = fires.select_dtypes(include='number').corr()
corr_matrix["area"].sort_values(ascending=False)



In [None]:
# The DC attribute is dropped as well because it is correlated with DMC.
fires = fires.drop(columns=['DC'])

In [None]:
# fancy plot for FFMC and DMC
import plotly.express as px 
# Reshape to long format (one row per index/variable pair) so plotly can draw
# one coloured line per variable.
df_long = pd.melt(
    fires,
    id_vars=['index'],
    value_vars=['FFMC', 'DMC'],
)
fig = px.line(data_frame=df_long, x='index', y='value', color='variable')
fig.show()

In [None]:
# another fancy plot for ISI, temp and wind
import plotly.express as px 
# Reshape to long format (one row per index/variable pair) so plotly can draw
# one coloured line per variable.
df_long = pd.melt(
    fires,
    id_vars=['index'],
    value_vars=['ISI', 'temp', 'wind'],
)
fig = px.line(data_frame=df_long, x='index', y='value', color='variable')
fig.show()

In [None]:
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Dropping outliers: cluster each attribute on its own with DBSCAN and treat
# every row that gets the label -1 for at least one attribute as an outlier.
attributes = ['FFMC', 'DMC', 'ISI']
outliers_to_drop = []
for tmp in attributes:
    # fit() expects a 2-D array, hence the reshape into a single column.
    clustering1 = DBSCAN(eps=2.5, min_samples=2).fit(fires[tmp].values.reshape(-1, 1))
    labels = clustering1.labels_
    outlier_pos = np.where(labels == -1)[0]

    outliers_to_drop += list(outlier_pos)

# dtype=int keeps the array usable as a positional indexer even when no
# outlier was found (np.unique of an empty list would otherwise be float).
outliers_to_drop = np.unique(np.asarray(outliers_to_drop, dtype=int))
print(outliers_to_drop)

# np.where returns row *positions*, while drop() works on index *labels*.
# Translate explicitly so the right rows are removed even when the index is
# no longer a pristine 0..n-1 range (e.g. when this cell is run a second time).
fires = fires.drop(index=fires.index[outliers_to_drop])

In [None]:
# Split the frame into categorical predictors, numeric predictors and target.
# The numeric list leaves out 'rain', 'DC' and the helper 'index' column.
fires_cat = fires[['month', 'day']]
fires_num = fires[['X', 'Y', 'FFMC', 'DMC', 'ISI', 'temp', 'RH','wind']]
# Double brackets keep the target as a one-column DataFrame.
target = fires[['area']] 

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step pipeline that standardises the numeric predictors.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down -- confirm that this is intended.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
fires_num_tr = num_pipeline.fit_transform(fires_num)


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the two categorical columns as small integers, one encoder per column.
# NOTE(review): with default settings OrdinalEncoder presumably orders the
# categories by sorted value, not by calendar order -- confirm this is acceptable.
ordinal_encoder = OrdinalEncoder()
days = fires_cat[['day']].values
days_encoded = ordinal_encoder.fit_transform(days)

ordinal_encoder = OrdinalEncoder()
months = fires_cat[['month']].values
months_encoded = ordinal_encoder.fit_transform(months)

# fires_cat was sliced out of `fires`; assigning columns into it in place
# triggers SettingWithCopyWarning. assign() builds a new frame instead, and
# ravel() turns the (n, 1) encoder output into the 1-D array a column expects.
fires_cat = fires_cat.assign(
    day=days_encoded.ravel().astype(np.int8),
    month=months_encoded.ravel().astype(np.int8),
)

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: scaled numeric columns followed by the encoded categorical ones.
data = np.concatenate((fires_num_tr, fires_cat), axis=1)

# Flatten the one-column target up front so both halves come back 1-D.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target.values.ravel(),
    test_size=0.33,
    random_state=42,
)

print(data.shape)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Hyper-parameter grid for the SVR: 2 kernels x 4 C values x 2 epsilons = 16
# candidates, each evaluated with 5-fold cross-validation.
# The original epsilon list contained 0.2 twice, which only repeated identical
# fits; if a third distinct value was intended, add it here.
param_grid = [
    {'kernel': ['rbf', 'sigmoid'], 'C': [1, 50, 100, 300],
     'epsilon': [0.2, 0.1]},
  ]

svr_cv = SVR()
svr_grid_search = GridSearchCV(svr_cv, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
svr_grid_search.fit(X_train, y_train)

In [None]:
# Show the best SVR found by the grid search.
svr_grid_search.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Evaluate the tuned SVR on the held-out test split.
final_model = svr_grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# The value printed is the square root of the MSE, so it is labelled RMSE
# (the previous label 'SMSE' was a typo).
print('RMSE: ',final_rmse)
print('MAE: {}'.format(mean_absolute_error(y_test, final_predictions)))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Two sub-grids for the random forest: the first has 3 * 4 = 12 combinations,
# the second (which also toggles bootstrap) has 2 * 2 * 3 = 12.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False,True], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+12)*5=120 rounds of training
rfr_grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
rfr_grid_search.fit(X_train,y_train)

In [None]:
# Evaluate the tuned random forest on the held-out test split.
final_model = rfr_grid_search.best_estimator_

final_predictions = final_model.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# The value printed is the square root of the MSE, so it is labelled RMSE
# (it was previously mislabelled 'MSE').
print('RMSE: ',final_rmse)
print('MAE: {}'.format(mean_absolute_error(y_test, final_predictions)))
