# Random Forest Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime

## Importing the dataset

In [None]:
# Load the raw table and convert the 'Dates' strings (day/month/year) into
# integer ordinals so the column is numeric.
# NOTE(review): errors='coerce' turns unparseable dates into NaT before the
# ordinal conversion -- confirm the file contains no such rows.
dataset = pd.read_csv('Book1.csv')
parsed_dates = pd.to_datetime(dataset['Dates'], format='%d/%m/%Y', errors='coerce')
dataset['Dates'] = parsed_dates.map(datetime.datetime.toordinal)

# Every column except the last is a feature; the last column is the target.
features, target = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = features.values
y = target.values

## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Feature columns from index 2 up to (not including) the last one:
# replace NaN with the column mean.
middle_cols = slice(2, -1)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, middle_cols] = imputer.fit_transform(X[:, middle_cols])

# Last feature column: replace NaN with the most frequent value. The column
# is reshaped to (n_rows, 1) because the imputer works on 2-D input; the
# imputed column is kept in `z` for the following cell.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
z = imputer.fit_transform(X[:, -1].reshape(len(y), 1))

In [None]:
# Swap the original last feature column for its imputed version `z`:
# drop the old column, then attach `z` on the right.
without_last = np.delete(X, -1, axis=1)
X = np.concatenate((without_last, z), axis=1)

In [None]:
# Turn the target into a column vector of shape (n_rows, 1).
y = y.reshape(-1, 1)

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the last feature column; all other columns pass through
# unchanged. The result is converted back to a NumPy array.
one_hot_last_column = ('encoder', OneHotEncoder(), [-1])
ct = ColumnTransformer(transformers=[one_hot_last_column], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [None]:
# Ensure the target is a column vector of shape (n_rows, 1); reshaping an
# array that already has this shape leaves it unchanged.
y = np.reshape(y, (-1, 1))

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=1/3)

# 1-D slices kept for the plots further down, taken here before X_train and
# X_test are replaced by their PLS projections: column 16 of the encoded
# feature matrix is used as the plot's 'id' axis, and the single target
# column as the observed consumption.
PLOT_ID_COLUMN = 16
xtr, xte = X_train[:, PLOT_ID_COLUMN], X_test[:, PLOT_ID_COLUMN]
ytr, yte = y_train[:, 0], y_test[:, 0]

## Applying Dimensionality Reduction with PLSR (Partial Least Squares Regression)

In [None]:
from sklearn.cross_decomposition import PLSRegression

# Fit a 10-component PLS model on the training split only, so nothing from
# the test split leaks into the projection.
N_PLS_COMPONENTS = 10
pls = PLSRegression(n_components=N_PLS_COMPONENTS)
pls.fit(X_train, y_train)

PLSRegression(copy=True, max_iter=500, n_components=10, scale=True, tol=1e-06)

In [None]:
# Replace both splits by their projection onto the PLS components that were
# learned from the training split.
X_train, X_test = (pls.transform(split) for split in (X_train, X_test))

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for features and target: sc_y is needed later to map
# predictions back to the original units. Only the training split is
# standardised here; X_test is scaled with sc_X at prediction time.
sc_X, sc_y = StandardScaler(), StandardScaler()
X_train = sc_X.fit(X_train).transform(X_train)
y_train = sc_y.fit(y_train).transform(y_train)

## Training the Random Forest Regression model on the Training set

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 150 trees with a fixed seed so the fitted forest is reproducible.
regressor = RandomForestRegressor(n_estimators=150, random_state=0)
# y_train is an (n, 1) column at this point (output of sc_y.fit_transform).
# Flatten it to 1-D so fit() does not emit a DataConversionWarning about a
# column-vector target.
regressor.fit(X_train, y_train.ravel())

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

## Predicting the Test set results

In [None]:
# Standardise the test features with the scaler fitted on the training
# split, then predict in the scaled target space.
scaled_pred = regressor.predict(sc_X.transform(X_test))
# The scaler was fitted on an (n, 1) column and recent scikit-learn versions
# reject 1-D input to inverse_transform, so reshape to a column first and
# flatten afterwards: y_pred stays 1-D, in the original target units.
y_pred = sc_y.inverse_transform(np.reshape(scaled_pred, (-1, 1))).ravel()
np.set_printoptions(precision=2)

## Visualising the Training set results

In [None]:
# Fitted values for the training split, mapped back to the original target
# units. X_train is already standardised at this point, so it is fed to the
# model directly. inverse_transform is given a 2-D column (recent
# scikit-learn versions reject 1-D input) and the result is flattened so it
# can serve as a DataFrame column.
scaled_fit = regressor.predict(X_train)
yp2 = sc_y.inverse_transform(np.reshape(scaled_fit, (-1, 1))).ravel()

# dt3: observed consumption, dt4: fitted consumption; both keyed by the same
# 'id' values and sorted by them so the line trace is drawn left to right.
dt3 = pd.DataFrame({'id': xtr, 'consumption': ytr}).sort_values(by=['id'])
dt4 = pd.DataFrame({'id': xtr, 'consumption': yp2}).sort_values(by=['id'])

In [None]:
import plotly.graph_objects as go

# Training split: observed points (dt3) against the model's fitted curve (dt4).
title_cfg = dict(
    text="Electricity consumption prediction for Delhi",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
font_cfg = dict(family="Courier New, monospace", size=18, color="RebeccaPurple")

fig = go.Figure()
fig.add_trace(go.Scatter(x=dt3['id'], y=dt3['consumption'], mode='markers', name='Original Points'))
# Legend label corrected from the misspelled 'Traning Model'.
fig.add_trace(go.Scatter(x=dt4['id'], y=dt4['consumption'], mode='lines', name='Training Model'))
fig.update_layout(
    title=title_cfg,
    xaxis_title="Entry ID",
    yaxis_title="Consumption",
    font=font_cfg,
)
fig.show()

## Visualising the Test set results

In [None]:
# dt1: observed consumption for the test split, dt2: predicted consumption;
# both keyed by the same 'id' values and sorted by them for plotting.
yp1 = y_pred
dt1 = pd.DataFrame({'id': xte, 'consumption': yte}).sort_values(by=['id'])
dt2 = pd.DataFrame({'id': xte, 'consumption': yp1}).sort_values(by=['id'])

In [None]:
import plotly.graph_objects as go

# Test split: observed points (dt1) against the model's predictions (dt2).
title_cfg = dict(
    text="Electricity consumption prediction for Delhi",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
font_cfg = dict(family="Courier New, monospace", size=18, color="RebeccaPurple")

fig = go.Figure()
fig.add_trace(go.Scatter(x=dt1['id'], y=dt1['consumption'], mode='markers', name='Original Points'))
fig.add_trace(go.Scatter(x=dt2['id'], y=dt2['consumption'], mode='lines', name='Testing'))
fig.update_layout(
    title=title_cfg,
    xaxis_title="Entry ID",
    yaxis_title="Consumption",
    font=font_cfg,
)
fig.show()

## Evaluating the Model Performance

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# Coefficient of determination on the test split.
r2 = r2_score(y_test, y_pred)
print(r2)

# Adjusted R^2: penalises R^2 by the number of predictors, taken here as the
# number of columns in X_test.
n_samples, n_features = len(X_test), len(X_test[0])
print(1 - (1 - r2) * ((n_samples - 1) / (n_samples - n_features - 1)))
def MAPE(Y_actual,Y_Predicted):
    """Return the mean absolute percentage error, in percent.

    When both inputs hold the same number of elements they are flattened to
    1-D before being compared, so a column vector of shape (n, 1) and a flat
    vector of shape (n,) are matched element by element instead of being
    broadcast into an (n, n) matrix of all pairwise differences. Inputs of
    different sizes (for example an array and a scalar) still follow normal
    NumPy broadcasting.

    Parameters
    ----------
    Y_actual : array-like
        Observed values. A zero entry makes its term inf or nan.
    Y_Predicted : array-like
        Predicted values.

    Returns
    -------
    float
        ``mean(|(actual - predicted) / actual|) * 100``.
    """
    actual = np.asarray(Y_actual)
    predicted = np.asarray(Y_Predicted)
    if actual.size == predicted.size:
        # Same element count but possibly different shapes: align by position.
        actual, predicted = actual.ravel(), predicted.ravel()
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape
# Mean absolute percentage error (in percent), then mean squared error.
print(MAPE(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
# Mean signed error (prediction minus observation) on the test split; left
# as the last expression so the notebook displays it.
(y_pred - yte).mean()

-0.21485120276554803
-0.2922302602665383
28.996817249477296
812.5585763783068


-0.15766269841269928