# AML Project Work 2021


# Data Preprocessing

Link to the datasets: https://www.kaggle.com/nicholasjhana/energy-consumption-generation-prices-and-weather


# Preprocessing of weather data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

#Reading weather data
Wdata = pd.read_csv('weather_features.csv', skipinitialspace=True) #Weather Data
Wdata.info()

#Weather variables that are averaged over the cities. Every other column
#(temp_min, temp_max, wind_deg, weather_id, weather_main, ...) is of no interest.
WEATHER_FEATURES = ['temp', 'pressure', 'humidity', 'wind_speed', 'rain_1h', 'clouds_all']

#Column prefix used for the per-city copies of the weather variables
CITY_PREFIXES = {'Valencia': 'val', 'Madrid': 'mad', 'Bilbao': 'bil',
                 'Barcelona': 'bar', 'Seville': 'sev'}

#Keeping the timestamp: it is the only reliable key for lining the cities up.
#Parsing with utc=True gives one unambiguous, sortable instant per observation
#even when the source strings carry different UTC offsets.
Wdata = (Wdata[['dt_iso', 'city_name'] + WEATHER_FEATURES]
         .assign(dt_iso=lambda frame: pd.to_datetime(frame['dt_iso'], utc=True)))

print(Wdata['city_name'].unique())

#Failing early (as grouped.get_group() would) if an expected city is absent
missing_cities = sorted(set(CITY_PREFIXES) - set(Wdata['city_name'].unique()))
if missing_cities:
    raise KeyError("Cities missing from weather data: %s" % missing_cities)


def _city_weather(weather, city, prefix):
    """Return the weather rows of one city, indexed by timestamp.

    Parameters
    ----------
    weather : DataFrame with 'dt_iso', 'city_name' and the WEATHER_FEATURES columns.
    city : value of 'city_name' to select.
    prefix : short city tag; feature columns are renamed to '<prefix>_<feature>'.

    Returns
    -------
    DataFrame with a unique 'dt_iso' index and one prefixed column per feature.
    Repeated timestamps are reduced to their first row so that the index can be
    used as a join key.
    """
    rows = weather.loc[weather['city_name'] == city, ['dt_iso'] + WEATHER_FEATURES]
    rows = rows.drop_duplicates(subset='dt_iso', keep='first').set_index('dt_iso')
    return rows.add_prefix(prefix + '_')


#Splitting the dataframe into 5 different dataframes by city name
city_frames = {city: _city_weather(Wdata, city, prefix)
               for city, prefix in CITY_PREFIXES.items()}
Valencia, Madrid, Bilbao, Barcelona, Seville = (city_frames[city] for city in CITY_PREFIXES)

#Merging the five dataframes back into one dataframe. The frames share the
#timestamp index, so each merged row holds the five cities at the SAME hour
#(a positional merge would drift as soon as one city has an extra row).
merged_df = pd.concat(list(city_frames.values()), axis=1).sort_index()

#Dropping all rows containing NaN-values (hours not available for every city)
merged_df = merged_df.dropna()

#Calculating the mean values of the columns containing same type of data
#but from different cities
for feature in WEATHER_FEATURES:
    city_columns = [f'{prefix}_{feature}' for prefix in CITY_PREFIXES.values()]
    merged_df[f'{feature}_mean'] = merged_df[city_columns].mean(axis=1)

#Keeping only the mean columns now that the per-city values are not needed.
#The timestamp index is replaced by 0..n-1 because the weather and energy
#frames are later combined by row position.
mean_columns = [f'{feature}_mean' for feature in WEATHER_FEATURES]
merged_df = merged_df[mean_columns].reset_index(drop=True)

#Storing the final result in an object "Wdata_final"
Wdata_final = merged_df
Wdata_final.info()
print(Wdata_final)

# Preprocessing of energy data

In [None]:
#Loading the energy data and taking a first look at it
Edata = pd.read_csv('energy_dataset.csv') #Energy Data
print(Edata.head())
Edata.info()

#Listing the number of distinct values and the dtype of every column
for column_name in Edata.columns:
    column = Edata[column_name]
    print(column_name,"--->",column.nunique(),"--->",column.dtypes)

#Columns reported above with a single distinct value: showing that value
constant_columns = ['generation fossil coal-derived gas','generation fossil oil shale',
                    'generation fossil peat','generation geothermal',
                    'generation marine','generation wind offshore']
for column_name in constant_columns:
    print(Edata[column_name].unique())

#Columns that hold no values at all
empty_columns = ['generation hydro pumped storage aggregated',
                 'forecast wind offshore eday ahead']

#Removing both the all-zero and the empty columns
Edata = Edata.drop(columns=constant_columns + empty_columns)

#Splitting the 'time' string into a calendar date and a clock time:
#everything before the first space is the date, the rest is "<clock>+<offset>"
date_and_rest = Edata['time'].str.split(" ", n=1, expand=True)
clock_text = date_and_rest[1].str.split("+", n=1, expand=True)[0]
Edata['Date'] = pd.to_datetime(date_and_rest[0])
Edata['Time'] = pd.to_datetime(clock_text, format='%H:%M:%S').dt.time
Edata = Edata.drop(columns=['time'])

#'Date' and 'Time' were appended last; rotating them to the front for a cleaner look
cols = Edata.columns.tolist()
cols = cols[-2:] + cols[:-2]
Edata = Edata[cols] #Reordering the dataframe

#Keeping only the generation and load columns: prices, forecasts and the
#date/time columns are not used in this analysis
Edata_final = Edata.drop(columns=['price day ahead','price actual','total load forecast',
                                  'forecast solar day ahead','forecast wind onshore day ahead',
                                  'Date','Time'])

Edata_final.info()
print(Edata_final)

# Combining the two dataframes and finalizing the dataframe

In [None]:
#Placing the energy and weather columns side by side (rows are matched on index)
final_dframe = pd.concat([Edata_final, Wdata_final], axis='columns')

#Keeping only the rows that are complete in every column
final_dframe = final_dframe.dropna(axis='index', how='any')

final_dframe.info()
print(final_dframe)

# Further data analysis

In [None]:
from scipy.stats.mstats import winsorize

#Weather columns inspected for outliers
WEATHER_MEAN_COLUMNS = ['temp_mean', 'pressure_mean', 'humidity_mean',
                        'wind_speed_mean', 'rain_1h_mean', 'clouds_all_mean']

#Fraction of values clipped at the (lower, upper) end of each column.
#'temp_mean' is deliberately absent: it is left untouched.
WINSORIZE_LIMITS = {'pressure_mean': (0.1, 0.05),
                    'humidity_mean': (0.05, 0.05),
                    'wind_speed_mean': (0, 0.05),
                    'rain_1h_mean': (0, 0.2),
                    'clouds_all_mean': (0, 0.025)}


def show_boxplot(frame, column):
    """Draw and show a horizontal boxplot of frame[column].

    The series is passed as x= explicitly: a bare positional argument is
    interpreted differently by different seaborn releases.
    """
    sns.boxplot(x=frame[column])
    plt.show()


#Checking for outliers in the weather data columns
for column in WEATHER_MEAN_COLUMNS:
    show_boxplot(final_dframe, column)

#Fixing the outliers in all columns, except 'temp_mean'
#NOTE(review): the limits are computed on the full frame, i.e. before the
#train/test split done further down - confirm this is acceptable.
for column, limits in WINSORIZE_LIMITS.items():
    final_dframe[column] = winsorize(final_dframe[column], limits)

#Re-plotting the winsorized columns to verify the effect
for column in WINSORIZE_LIMITS:
    show_boxplot(final_dframe, column)

#Checking if there still are some missing values
final_dframe.isnull().sum()



# Correlation between variables

In [None]:
#Finding the correlation between variables (computed once and reused below)
df5 = final_dframe.corr()

#Plotting only the strong correlations: cells with |r| <= 0.5 are blanked out
strong_correlations = df5.where(df5.abs() > 0.5)
plt.figure(figsize=(15, 10))
sns.heatmap(strong_correlations, annot=True, cbar=False, linewidth=0.5, linecolor='blue')

# Final data preparation for model building

In [None]:
from sklearn.model_selection import train_test_split

dframe_reg = final_dframe

#Target is the actual total load; every remaining column is a predictor
y = dframe_reg['total load actual']
X = dframe_reg.drop(columns='total load actual')

#Holding out 25 % of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#Reporting the dimensions of the four subsets
for label, subset in (("The shape of X_train is:", X_train),
                      ("The shape of X_test is:", X_test),
                      ("The shape of y_train is:", y_train),
                      ("The shape of y_test is:", y_test)):
    print(label, subset.shape)

# Model Construction and Validation


## ExtraTreesRegressor

In [None]:
#Building the ExtraTreesRegressor model
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

#Baseline regressor; it is also the template estimator handed to the grid search.
#random_state is fixed so that repeated runs give the same trees and scores.
etr=ExtraTreesRegressor(n_estimators=25, max_depth=10, random_state=1).fit(X_train, y_train)

# Set the parameters by cross-validation
tuned_parameters = [{'max_depth': range(5,15),
                     'n_estimators': range(20,30)}]

# Use the GridSearch to find out the best parameters using 5 fold cross validation
tune_et=GridSearchCV(etr, tuned_parameters, cv=5)
tune_et.fit(X_train, y_train)
#best_estimator_ is already refitted on the whole training set
optimal_et=tune_et.best_estimator_

#Everything below evaluates the TUNED regressor, so the reported scores
#belong to the model that is printed at the end of the cell.

#Predicting on the training set
train_pred=optimal_et.predict(X_train)

#Using cross validation score to test the performance of the regressor
cv_score=cross_val_score(optimal_et, X_train, y_train, cv=5).mean()
print("Accurary in crossvalidation...%f" % cv_score)

#Predicting on the test set
test_pred=optimal_et.predict(X_test)

#Reporting the train and test score (R^2) of the regressor
train_score = r2_score(y_train, train_pred)
test_score = r2_score(y_test, test_pred)
print("Accurary in the training set..%f" % train_score)
print("Accurary in the test set......%f" % test_score)
print(optimal_et)

# Visualization of ExtraTreesRegressor results

In [None]:
#Scatter of predicted against actual demand on the test set, with a red fit line
import seaborn as sns
ax = sns.regplot(x=y_test, y=test_pred, line_kws={"color": "red"})
ax.set_xlabel('Actual Energy Demand')
ax.set_ylabel('Predicted Energy Demand')
print()

## GradientBoostingRegressor

In [None]:
#Building the GradientBoostingRegressor model
from sklearn.ensemble import GradientBoostingRegressor

#Baseline regressor; it is also the template estimator handed to the grid search
gbr = GradientBoostingRegressor(n_estimators=25, max_depth=10, loss='squared_error').fit(X_train, y_train)

# Set the parameters by cross-validation
tuned_parameters = [{'max_depth': range(5,15),
                     'n_estimators': range(20,30)}]

# Use the GridSearch to find out the best parameters using 3 fold cross validation.
# The search is run on the gradient boosting model itself (gbr), not on the
# ExtraTrees model from the previous section.
tune_gbr=GridSearchCV(gbr, tuned_parameters, cv=3)
tune_gbr.fit(X_train, y_train)
#best_estimator_ is already refitted on the whole training set
optimal_gbr=tune_gbr.best_estimator_

#Everything below evaluates the TUNED regressor, so the reported scores
#belong to the model that is printed at the end of the cell.

#Predicting on the training set
train_pred=optimal_gbr.predict(X_train)

#Using cross validation score to test the performance of the regressor
cv_score=cross_val_score(optimal_gbr, X_train, y_train, cv=5).mean()
print("Accurary in crossvalidation...%f" % cv_score)

#Predicting on the test set
test_pred=optimal_gbr.predict(X_test)

#Reporting the train and test score (R^2) of the regressor
train_score = r2_score(y_train, train_pred)
test_score = r2_score(y_test, test_pred)
print("Accurary in the training set..%f" % train_score)
print("Accurary in the test set......%f" % test_score)
print(optimal_gbr)

# Visualization of GradientBoostingRegressor results

In [None]:
#Scatter of predicted against actual demand on the test set, with a red fit line
ax = sns.regplot(x=y_test, y=test_pred, line_kws={"color": "red"})
ax.set_xlabel('Actual Energy Demand')
ax.set_ylabel('Predicted Energy Demand')
print()

In [None]:
# Wider search space: tree depth, minimum samples per split and ensemble size
tuned_parameters = [dict(max_depth=range(10,20),
                         min_samples_split=range(2,5),
                         n_estimators=range(30,50))]

# Exhaustive grid search over the space above, scored with 3 fold cross validation
tune_et = GridSearchCV(estimator=etr, param_grid=tuned_parameters, cv=3)
tune_et.fit(X_train, y_train)

# Best parameter combination, refitted on the full training set
optimal_et = tune_et.best_estimator_