# Importing libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, fname) for fname in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px 
sns.set_style('darkgrid')
from datetime import datetime
from dateutil.tz import *
import re 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
from xgboost import DMatrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
# Print the full path of each file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# Importing the dataset

In [None]:
# Load the raw measurements from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/SolarEnergy/SolarPrediction.csv')
# Keep an independent copy: `data` is re-indexed and extended for the exploratory
# plots, while `df` is used further down for feature engineering and modelling.
df = data.copy()
# Column names, dtypes and non-null counts of the loaded frame.
data.info()

# Data cleaning and preparation

> In this section, the dataset is analysed to identify whether there are missing values and whether all the data is identified by the correct data-type. 

In [None]:
# Number of missing entries per column (isna is the canonical name of isnull).
data.isna().sum()

A preliminary analysis of the dataset indicates that there are no missing values.

> The UNIXTime is converted into a datetime object, and the right timezone is allocated to this feature.

In [None]:
# Order the rows chronologically by their UNIX timestamp (ascending is the default).
data = data.sort_values(by='UNIXTime')
data.head()

> The next step is to convert the time and date parameters into a more useful format and to add some columns that may be useful for visualisation and analysis.

In [None]:
from pytz import timezone
import pytz

# Index the frame by timestamp: UNIXTime is parsed as seconds, tagged as UTC and
# then converted to the Pacific/Honolulu zone.
hawaii = timezone('Pacific/Honolulu')
data.index = pd.to_datetime(data['UNIXTime'], unit='s')
data.index = data.index.tz_localize(pytz.utc).tz_convert(hawaii)

# Calendar features derived from the local timestamp via strftime codes.
for feature_name, strftime_code in (('MonthOfYear', '%m'),
                                    ('DayOfYear', '%j'),
                                    ('WeekOfYear', '%U')):
    data[feature_name] = data.index.strftime(strftime_code).astype(int)

# Time of day expressed in whole hours, minutes and seconds since local midnight.
idx_hour, idx_minute, idx_second = data.index.hour, data.index.minute, data.index.second
data['TimeOfDay(h)'] = idx_hour
data['TimeOfDay(m)'] = idx_hour*60 + idx_minute
data['TimeOfDay(s)'] = idx_hour*60*60 + idx_minute*60 + idx_second

# Sunrise/sunset are 'HH:MM:SS' strings; parse them to read their clock fields.
for clock_column in ('TimeSunRise', 'TimeSunSet'):
    data[clock_column] = pd.to_datetime(data[clock_column], format='%H:%M:%S')

# Day length in seconds = sunset clock time minus sunrise clock time.
sunrise_clock = data['TimeSunRise'].dt
sunset_clock = data['TimeSunSet'].dt
data['DayLength(s)'] = (
    (sunset_clock.hour - sunrise_clock.hour)*60*60
    + (sunset_clock.minute - sunrise_clock.minute)*60
    + (sunset_clock.second - sunrise_clock.second)
)

# The raw date/time text columns are no longer needed once the features exist.
data.drop(columns=['Data', 'Time', 'TimeSunRise', 'TimeSunSet'], inplace=True)
data.head()

> If the data handling has been carried out correctly, then it would be reasonable to expect that the solar radiation, for any considered day, would be approximately zero before the sunrise time, and after the sunset time. 

In [None]:
# Sanity check of the timezone handling: plot the radiation readings for the
# window 2016-09-29 .. 2016-09-30 (both end dates included by the label slice).
data_one_day = data.loc['2016-09-29':'2016-09-30',:]

plt.figure(figsize = (12,3))
# The series is labelled so that plt.legend() below has an artist to show;
# without a label matplotlib warns and draws an empty legend.
plt.plot(data_one_day.Radiation, 'o', markerfacecolor = 'w', label = 'Radiation')
plt.ylabel('Radiation [W/m^2]')

#Adjusting timezone of x-axis
plt.gca().xaxis_date('HST')

plt.legend()
plt.show()

The plot suggests that the various dates have been correctly manipulated. It is now possible to proceed with the preliminary data analysis of the dataset.

# Preliminary Data Analysis

> The first step of the preliminary data analysis is therefore to check the ranges of the various features of the dataset, and to do a cross-check whether these ranges are reasonable.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column, used to sanity-check the value ranges of the features in the dataset.
data.describe()

The ranges here identified look reasonable. In particular:

* The Solar radiation assumes only positive values, and has a maximum value of 1600 W/m^2 
* The temperature ranges from 30.4 F to 71 F
* The pressure varies very little, and in any case has a value of around 1 bar
* The Humidity has values over 100 %, but only very slightly
* Wind direction is correctly in the range from 0 to 360 degrees
* Wind speed is always positive, and its maximum value is reasonable as it corresponds to a grade 8 of the Beaufort scale

In [None]:
# Radiation plotted against the timestamp index at full resolution.
fig, ax = plt.subplots(figsize=(20, 6))
radiation_series = data['Radiation']
radiation_series.plot(ax=ax, style=['--'], color='red')
ax.set_ylabel('W/m2')
ax.set_title('Radiation as a Time Series', fontsize=18)
plt.show()

In [None]:
# Daily mean radiation: group the timestamp index into calendar-day bins first.
daily_mean_radiation = data.groupby(pd.Grouper(freq="D"))['Radiation'].mean()

fig, ax = plt.subplots(figsize=(20, 6))
daily_mean_radiation.plot(ax=ax, style=['--'], color='red')
ax.set_ylabel('W/m2')
ax.set_title('Radiation as a Time Series (Daily)', fontsize=18)
plt.show()

> Next step is to check for the distribution of the data, in order to understand how the various data is allocated between the lower and upper limits. This can be carried out by plotting either a distribution plot or a boxplot.

> Both are plotted in this case, as together they give a more comprehensive understanding of the data.

In [None]:
# Distribution (top row) and box plot (bottom row) for each measured feature.
# One (column, axis label) pair per grid column replaces twelve copy-pasted stanzas.
distribution_panels = [
    ('Radiation', 'Solar radiation [W/m^2]'),
    ('Temperature', 'Temperature [F]'),
    ('Pressure', 'Pressure [Hg]'),
    ('Humidity', 'Humidity [%]'),
    ('Speed', 'Wind speed [miles/h]'),
    ('WindDirection(Degrees)', 'Wind direction [Degrees]'),
]

fig, ax = plt.subplots(nrows = 2, ncols = len(distribution_panels), figsize = (25, 10))

for col_idx, (column, axis_label) in enumerate(distribution_panels):
    # sns.distplot is deprecated; histplot with a KDE overlay on a density scale
    # is the replacement recommended by seaborn.
    sns.histplot(data[column], kde = True, stat = 'density', ax = ax[0, col_idx])
    ax[0, col_idx].set_xlabel(axis_label, fontsize = 14)

    # Pass the series as `x=` explicitly: the meaning of the first positional
    # argument of boxplot differs between seaborn versions.
    sns.boxplot(x = data[column], ax = ax[1, col_idx])
    ax[1, col_idx].set_xlabel(axis_label, fontsize = 14)

fig.suptitle('Distribution and box plot of the various features', fontsize = 22)
fig.tight_layout()
# Leave room at the top so the suptitle does not overlap the first row.
fig.subplots_adjust(top=0.88)

plt.show()

In [None]:
# One wide, horizontal box plot per measured feature, each in its own figure.
box_columns = ['Radiation','Temperature', 'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed']
for col in box_columns:
    fig, ax = plt.subplots(figsize=(20, 3))
    data[col].plot(kind='box', ax=ax, vert=False, color='red')
    ax.set_title(f'{col} Distrubution', fontsize=18)
    plt.show()

Looking at the distribution of the data it is possible to conclude that most features have a skewed distribution, except for the wind direction, which is characterized by three peaks.

As could be expected, roughly 50 % of the values of the solar radiation are located in the range between 0 W/m^2 and 250 W/m^2 (there is no or little solar radiation at night). With respect to the wind speed, it seems that the high wind speeds are extreme outliers in a distribution that has most of its values in the range between 0 miles/h and 20 miles/h.

> As a last step in the preliminary data analysis, it makes good sense to plot the data for a limited range of time. In this case, a five-day period is selected.

> Aside from the data, also the hourly-median of the data is represented in the following plots. This allows for an easier identification of potential patterns. The median is selected over the mean, because it is less affected by the presence of potential outliers.

In [None]:
# Creation of the median dataset: resample the timestamp-indexed frame into
# hourly bins, take the median of every column per bin, and drop any row that
# contains a NaN (e.g. an hourly bin with no measurements).
data_median = data.resample('H').median().dropna()

In [None]:
# Extract the plotting window from the raw data and from the hourly medians.
# The label slice includes both end dates (2016-10-03 through 2016-10-08).
plot_window = slice('2016-10-03', '2016-10-08')
data_5 = data.loc[plot_window, :]
data_5_median = data_median.loc[plot_window, :]

# One stacked panel per feature: (column name, y-axis label).
trend_panels = [
    ('Radiation', 'Radiation [W/m^2]'),
    ('Temperature', 'Temperature [F]'),
    ('Pressure', 'Pressure [Hg]'),
    ('Humidity', 'Humidity [%]'),
    ('Speed', 'Wind Speed [miles/h]'),
    ('WindDirection(Degrees)', 'Wind direction [degrees]'),
]

fig, ax = plt.subplots(nrows = len(trend_panels), ncols = 1, figsize = (23,25))

for panel, (column, y_label) in zip(ax, trend_panels):
    # Raw readings as hollow markers, hourly median as a red line on top.
    panel.plot(data_5[column], 'o', markerfacecolor='w')
    panel.plot(data_5_median[column], linewidth = 1.5, color = 'red', label = 'Hourly median')
    panel.set_ylabel(y_label, fontsize = 14)
    panel.legend(fontsize = 14)

fig.suptitle('Trend of the various parameters over a five-day period', fontsize = 22)
# The rect keeps a margin free so the suptitle does not collide with the top panel.
fig.tight_layout(rect=[0, 0.03, 1, 0.97])

plt.show()

Looking at the plots it is possible to deduce the following:

1. The data for the temperature, humidity, and wind speed seem to assume only discrete values. This could be connected with the type of sensors used for the data campaign;
2. The pressure data seems to follow some clear pattern in which high and low pressure values interchange each other;
3. The wind speed data is extremely volatile. The high volatility could make this feature a less "certain" one when carrying out the regression analysis;
4. As expected, solar radiation is constant at zero during the night, but high variability is experienced during the day-hours;
5. The wind direction data is volatile, but clear trends can be identified. Sometimes the jump of the measurements between 0 degrees and 360 degrees creates the impression of a "change" in the wind direction which in practice is not there.

> In order to get a better understanding of the data, hourly and monthly means of several variables were visualised using bar plots.

In [None]:
# Column means grouped by month, week, day of year and hour of day.
grouped_m = data.groupby('MonthOfYear').mean().reset_index()
grouped_w = data.groupby('WeekOfYear').mean().reset_index()
grouped_d = data.groupby('DayOfYear').mean().reset_index()
grouped_h = data.groupby('TimeOfDay(h)').mean().reset_index()

# 4x2 grid: one row per variable, left column by hour, right column by month.
f, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, sharex='col', sharey='row', figsize=(14,12))

# Fix the y-range of the temperature, pressure and humidity rows before plotting
# (rows share their y-axis, so setting the left panel is enough).
ax3.set_ylim(45,60)
ax5.set_ylim(30.36,30.46)
ax7.set_ylim(60,85)

# (axis, title, x column, y column, grouped frame, clear the x label?)
bar_panels = [
    (ax1, 'Mean Radiation by Hour', "TimeOfDay(h)", 'Radiation', grouped_h, True),
    (ax2, 'Mean Radiation by Month', "MonthOfYear", 'Radiation', grouped_m, True),
    (ax3, 'Mean Temperature by Hour', "TimeOfDay(h)", 'Temperature', grouped_h, True),
    (ax4, 'Mean Temperature by Month', "MonthOfYear", 'Temperature', grouped_m, True),
    (ax5, 'Mean Pressure by Hour', "TimeOfDay(h)", 'Pressure', grouped_h, True),
    (ax6, 'Mean Pressure by Month', "MonthOfYear", 'Pressure', grouped_m, True),
    # The bottom row keeps its x label, as it is the only labelled row of the grid.
    (ax7, 'Mean Humidity by Hour', "TimeOfDay(h)", 'Humidity', grouped_h, False),
    (ax8, 'Mean Humidity by Month', "MonthOfYear", 'Humidity', grouped_m, False),
]

for panel, panel_title, x_column, y_column, grouped, clear_xlabel in bar_panels:
    panel.set_title(panel_title)
    pal = sns.color_palette("mako", len(grouped))
    # Double argsort turns the values into ranks, so each bar gets a palette
    # colour according to the size of its value (palette is reversed first).
    rank = grouped[y_column].argsort().argsort()
    g = sns.barplot(x=x_column, y=y_column, data=grouped, palette=np.array(pal[::-1])[rank], ax=panel)
    if clear_xlabel:
        panel.set_xlabel('')

plt.show()

From the above plots, it's clear that:
* Temperature has a strong correlation with solar irradiance. 
* Humidity has a negative correlation with solar irradiance, temperature and pressure.
* Solar irradiance and temperature both peak at approximately 12:00. 

Additionally, monthly means of both solar irradiance and temperature appear to decrease as winter approaches, with the exception of a very slight increase in solar irradiance from September to October.



# Feature engineering and correlation analysis

> It is now time to carry out correlation analyses aimed at identifying whether there are clear patterns (linear or non-linear) between the variable to be predicted (the solar radiation) and the features.

In [None]:
# Correlation matrix without the derived time-of-day / calendar columns and the raw timestamp.
excluded_columns = ['TimeOfDay(h)', 'TimeOfDay(m)', 'TimeOfDay(s)', 'UNIXTime', 'MonthOfYear', 'WeekOfYear']
corrmat = data.drop(columns=excluded_columns).corr()

fig, ax = plt.subplots(figsize=(7,7))
# Colour scale clipped to +/-0.8.
sns.heatmap(corrmat, vmin=-.8, vmax=.8, square=True, cmap = 'coolwarm', ax=ax)
plt.show()

> The correlation matrix indicates a positive linear correlation between the ambient temperature and the solar radiation. No clear linear correlation appears for the other features, and the second highest correlation value is identified for the humidity.

In [None]:
# Annotated correlation heatmap over every column currently in `data`.
full_corr = data.corr()
fig, ax = plt.subplots(figsize = (10,10))
sns.heatmap(full_corr, annot = True, cmap = 'YlGnBu', ax = ax)
fig.suptitle('Correlation matrix', fontsize = 16)
plt.show()

In [None]:
fig, ax = plt.subplots(nrows =2, ncols = 3, figsize = (23,8))

# (grid position, feature column, x-axis label); radiation is always on the y-axis.
scatter_panels = [
    ((0, 0), 'Temperature', 'Temperature [F]'),
    ((0, 1), 'Pressure', 'Pressure [Hg]'),
    ((0, 2), 'Humidity', 'Humidity [%]'),
    ((1, 1), 'Speed', 'Wind speed [miles/h]'),
    ((1, 0), 'WindDirection(Degrees)', 'Wind direction [degrees]'),
]

for grid_pos, feature, x_label in scatter_panels:
    panel = ax[grid_pos]
    panel.plot(data[feature], data.Radiation, 'o', markerfacecolor='w')
    panel.set_xlabel(x_label, fontsize = 14)
    panel.set_ylabel('Radiation [W/m^2]', fontsize = 14)

# Five features on a 2x3 grid: remove the unused last panel.
fig.delaxes(ax[1,2])

fig.suptitle('Scatter plots of the solar radiation as a function of the various features', fontsize = 22)
fig.tight_layout()
fig.subplots_adjust(top=0.88)

plt.show()

> The scatter plots show the solar radiation as a function of the values of the various features. This makes it possible to identify potential non-linear trends.
> 
> The scatter plots suggest the following:
> 
> * The linear correlation between solar radiation and ambient temperature is confirmed;
> * It seems that the highest values of the solar radiation are taking place when the ambient pressure is the highest;
> * It seems that the maximum present solar radiation decreases for high wind speeds.
> 

In [None]:
# Pull month, day and year out of the 'Data' text column as strings.
# Month: leading digits; Day: digits between two slashes; Year: digits after a
# slash and before whitespace.
date_part_patterns = {
    'Month': r'^\d+',
    'Day': r'(?<=\/)\d+(?=\/)',
    'Year': r'(?<=\/)\d+(?=\s)',
}
for part_name, part_pattern in date_part_patterns.items():
    # Bind the pattern as a default argument so each lambda uses its own regex.
    df[part_name] = df['Data'].apply(lambda y, pat=part_pattern: re.search(pat, y).group(0))

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Extract month, day and year from the 'Data' text column as integers.
# The builtin `int` is used as dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
df['Month'] = df['Data'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['Day'] = df['Data'].apply(lambda y: re.search(r'(?<=\/)\d+(?=\/)', y).group(0)).astype(int)
df['Year'] = df['Data'].apply(lambda y: re.search(r'(?<=\/)\d+(?=\s)', y).group(0)).astype(int)

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# The date text has been split into Month/Day/Year, so the source column can go.
df = df.drop(columns=['Data'])

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Pull hour, minute and second out of the 'Time' text column as strings.
# Hour: leading digits; Minute: digits between two colons; Second: trailing digits.
time_part_patterns = {
    'Hour': r'^\d+',
    'Minute': r'(?<=\:)\d+(?=\:)',
    'Second': r'\d+$',
}
for part_name, part_pattern in time_part_patterns.items():
    # Bind the pattern as a default argument so each lambda uses its own regex.
    df[part_name] = df['Time'].apply(lambda y, pat=part_pattern: re.search(pat, y).group(0))

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Extract hour, minute and second from the 'Time' text column as integers.
# The builtin `int` is used as dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
df['Hour'] = df['Time'].apply(lambda y: re.search(r'^\d+', y).group(0)).astype(int)
df['Minute'] = df['Time'].apply(lambda y: re.search(r'(?<=\:)\d+(?=\:)', y).group(0)).astype(int)
df['Second'] = df['Time'].apply(lambda y: re.search(r'\d+$', y).group(0)).astype(int)

In [None]:
# Remove the 'Time' text column from df in place.
df.drop(columns=['Time'], inplace=True)

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Split the sunrise and sunset text columns into integer hour and minute features
# (hour: leading digits; minute: digits between two colons).
# The builtin `int` is used as dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
df['SunriseHour'] = df['TimeSunRise'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['SunriseMinute'] = df['TimeSunRise'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(int)

df['SunsetHour'] = df['TimeSunSet'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(int)
df['SunsetMinute'] = df['TimeSunSet'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(int)

# The text columns are no longer needed once the numeric parts are extracted.
df = df.drop(['TimeSunRise', 'TimeSunSet'], axis=1)

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Data type of every column in df after the feature extraction above.
df.dtypes

In [None]:
# Separate the prediction target (Radiation) from the feature matrix; both are
# copies, so later changes to df do not leak into X or y.
target_column = 'Radiation'
y = df[target_column].copy()
X = df.drop(columns=[target_column]).copy()

In [None]:
# Preview the first rows of the feature matrix instead of rendering all of it.
X.head()

In [None]:
# Fit a StandardScaler on the full feature matrix, then transform that same matrix.
# NOTE(review): the model cells below are fit on X, not on Z — confirm whether
# the scaled features were meant to be used.
scaler = StandardScaler().fit(X)
Z = scaler.transform(X)

In [None]:
# Preview the standardized features. The array returned by the scaler has no
# column names, so they are restored from X, and only the first rows are shown.
pd.DataFrame(Z, columns=X.columns, index=X.index).head()

In [None]:
# Distinct values present in the extracted Year column.
df['Year'].unique()

In [None]:
# Displays df without the SunriseHour column. The result is not assigned, so df
# itself is left unchanged.
# NOTE(review): X was built before this cell and still contains SunriseHour —
# confirm whether the column was meant to be removed from the features.
df.drop(['SunriseHour'],axis=1)

# Modeling 

# Data Training

In [None]:
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
# Shuffled K-fold splitter with a fixed seed so the splits are reproducible;
# the number of folds is left at scikit-learn's default.
kf = KFold(shuffle=True, random_state=19)

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor

# Registry of candidate regressors keyed by a short name. Despite the variable
# name it also holds the linear baseline.
trees = dict(
    linear=LinearRegression(),
    randomfor=RandomForestRegressor(random_state=19),
    gradientb=GradientBoostingRegressor(random_state=19),
    xgb=XGBRegressor(random_state=19),
    xgbrf=XGBRFRegressor(random_state=19),
    catboost=CatBoostRegressor(random_state=19, silent=True),
    DecisionTr=DecisionTreeRegressor(random_state=19),
    extratre=ExtraTreesRegressor(random_state=19),
)

In [None]:
# Accumulators filled by the model cells below: R^2 (in percent) and RMSE per model.
scores = []
rmse = []

# Take the final split produced by `kf` as the train/test partition.
# NOTE(review): only this last fold is used — every model below is fit and scored
# on this single split, so the results are a hold-out estimate, not a
# cross-validated one. Confirm whether full K-fold evaluation was intended.
train_index, test_index = list(kf.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Linear regression

In [None]:
# Fit the linear baseline, then record its R^2 (in percent) and RMSE on the test split.
regression_model = LinearRegression().fit(X_train, y_train)
regresor_pred = regression_model.predict(X_test)
linear_r2_pct = 100*regression_model.score(X_test, y_test)
linear_rmse = np.sqrt(mean_squared_error(y_test, regresor_pred))
scores.append(linear_r2_pct)
rmse.append(linear_rmse)

# Random forest Regression

In [None]:
# Random forest: fit on the training split, then append R^2 (percent) and RMSE.
Randomforest = RandomForestRegressor(random_state=7)
Randomforest.fit(X_train, y_train)
rf_predictions = Randomforest.predict(X_test)
scores.append(100*Randomforest.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, rf_predictions)))

# Gradient boosting Regression

In [None]:
# Gradient boosting: fit on the training split, then append R^2 (percent) and RMSE.
Gradientboost = GradientBoostingRegressor(random_state=19)
Gradientboost.fit(X_train, y_train)
gb_predictions = Gradientboost.predict(X_test)
scores.append(100*Gradientboost.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, gb_predictions)))

# XGB Regression

In [None]:
# XGBoost regressor: fit on the training split, then append R^2 (percent) and RMSE.
XGB = XGBRegressor(random_state=19)
XGB.fit(X_train, y_train)
xgb_predictions = XGB.predict(X_test)
scores.append(100*XGB.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, xgb_predictions)))

# XGBRF Regression

In [None]:
# XGBoost random-forest regressor: fit, then append R^2 (percent) and RMSE.
XGBFFR = XGBRFRegressor(random_state=133)
XGBFFR.fit(X_train, y_train)
xgbrf_predictions = XGBFFR.predict(X_test)
scores.append(100*XGBFFR.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, xgbrf_predictions)))

# CB Regression

In [None]:
# CatBoost regressor (training log silenced): fit, then append R^2 (percent) and RMSE.
Catboost = CatBoostRegressor(random_state=19, silent=True)
Catboost.fit(X_train, y_train)
catboost_predictions = Catboost.predict(X_test)
scores.append(100*Catboost.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, catboost_predictions)))

# DT Regression

In [None]:
# Decision tree: fit on the training split, then append R^2 (percent) and RMSE.
DecisionTr = DecisionTreeRegressor(random_state=19)
DecisionTr.fit(X_train, y_train)
tree_predictions = DecisionTr.predict(X_test)
scores.append(100*DecisionTr.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, tree_predictions)))

# ET Regression

In [None]:
# Extra-trees ensemble: fit on the training split, then append R^2 (percent) and RMSE.
Extratrees = ExtraTreesRegressor(random_state=19)
Extratrees.fit(X_train, y_train)
extratrees_predictions = Extratrees.predict(X_test)
scores.append(100*Extratrees.score(X_test, y_test))
rmse.append(np.sqrt(mean_squared_error(y_test, extratrees_predictions)))

# RMSE scores

In [None]:
# Display both accumulators as a tuple: RMSE values first, then R^2 values (in
# percent), each in the order in which the model cells above were run.
rmse,scores

# Testing data : R-squared & rmse

In [None]:
# Collect the metrics into one table. The lists are read by position, so the
# names below must match the order in which the model cells appended results.
model_names = ['linear', 'randomfor', 'gradientb', 'xgb', 'xgbrf', 'catboost', 'decisiontr',"extratr"]
model_positions = range(len(model_names))
test1 = pd.DataFrame({
    "model": model_names,
    "r2": [scores[pos] for pos in model_positions],
    "rmse": [rmse[pos] for pos in model_positions],
})
test1

# Models comparison

In [None]:
import plotly.graph_objects as go

# Grouped bar chart comparing the two metrics stored in `test1` for every model.
# Each trace is named so the legend says which bars are RMSE and which are R^2;
# unnamed traces are shown as "trace 0" / "trace 1".
fig = go.Figure(data=[
                      go.Bar(name='RMSE', x=test1.model, y=test1.rmse),
                      # r2 holds the model score multiplied by 100, hence the percent label.
                      go.Bar(name='R2 (%)', x=test1.model, y=test1.r2),
])
fig.update_layout(xaxis_title='Model', yaxis_title='Metric value')
fig.show()