## This was a competition hosted on [HackerEarth](https://www.hackerearth.com/), named [A Fine Windy Day](https://www.hackerearth.com/challenges/competitive/hackerearth-machine-learning-challenge-predict-windmill-power/instructions/).
## Every step is explained in detail, from feature engineering to data pre-processing, knowing how tricky and extremely off-putting at first glance the dataset was.
#### *If this helps you in learning, an upvote would be huge!*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the competition's train and test splits from the Kaggle input directory.
train, test = map(pd.read_csv, (
    '../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv',
    '../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv',
))

### Converting the "datetime" column to the pandas/NumPy `datetime64[ns]` dtype.

In [None]:
# Cast the raw timestamp column to datetime64[ns] in both splits.
for frame in (train, test):
    frame['datetime'] = frame['datetime'].astype('datetime64[ns]')

In [None]:
# Display the (rows, columns) size of the training frame before any rows are dropped.
train.shape

In [None]:
# A few rows have no target value; they cannot be used for supervised training, so drop them.
train = train.dropna(subset=['windmill_generated_power(kW/h)'])

### Wind direction is re-centred so that wind moving *against* the *designed* direction of the windmill is *negative*, and wind moving *with* it is *positive*.
### This is done by carefully examining the distribution of wind direction and understanding the reference point of the direction measurement.

In [None]:
# Shift the reference direction by 120° and wrap, so the angle lies in [-180, 180).
for frame in (train, test):
    wrapped = (frame['wind_direction(°)'] - 120) % 360
    frame['wind_direction(°)'] = wrapped - 180

### Labels which do not contribute to the model/dataset or do not have any significance with respect to the region of interest have been dropped.  
### This will be done throughout the process.

In [None]:
# Add a constant 31.0 m offset to the recorded windmill height in both splits.
for frame in (train, test):
    frame['windmill_height(m)'] = frame['windmill_height(m)'] + 31.0

### Multiple columns had hidden values under the masquerade of values like "-999" or "-99"


In [None]:
# Side-by-side histograms of three columns in which the placeholder -99 shows up
# as an isolated spike far from the real measurements.
sentinel_plots = [
    ('blades_angle(°)', '-99 as missing value in Blade Angle'),
    ('shaft_temperature(°C)', '-99 as missing value in Shaft Temperature'),
    ('atmospheric_temperature(°C)', '-99 as missing value in Atmospheric Temperature'),
]

plt.figure(figsize=(15, 4))
for position, (column, title) in enumerate(sentinel_plots, start=1):
    plt.subplot(1, len(sentinel_plots), position)
    plt.hist(train[column], bins=10)
    plt.title(title)

# tight_layout adjusts the axes that already exist, so it must run after the
# subplots are created (it previously ran on an empty figure and did nothing).
plt.tight_layout()
plt.show()

In [None]:
def extract_nan(col, value):
    """Replace a placeholder ``value`` in column ``col`` with NaN in ``train`` and ``test``.

    Both module-level frames are updated. The result is assigned back to the
    column rather than calling ``replace(..., inplace=True)`` on ``frame[col]``:
    the in-place form operates on a temporary selection and is not guaranteed
    to reach the parent frame (it does not under pandas Copy-on-Write).

    Parameters
    ----------
    col : str
        Name of the column to clean; must exist in both frames.
    value : scalar
        Sentinel that marks a missing observation (e.g. ``-99``).
    """
    test[col] = test[col].replace(value, np.nan)
    train[col] = train[col].replace(value, np.nan)

In [None]:
# Placeholder value -> columns in which that value stands for "missing".
sentinel_columns = {
    -99: ['blade_length(m)', 'rotor_torque(N-m)', 'resistance(ohm)', 'blades_angle(°)',
          'shaft_temperature(°C)', 'gearbox_temperature(°C)',
          'windmill_body_temperature(°C)', 'atmospheric_temperature(°C)'],
    -999: ['windmill_body_temperature(°C)'],
    -30: ['area_temperature(°C)'],
}
for placeholder, affected in sentinel_columns.items():
    for column in affected:
        extract_nan(column, placeholder)

### Columns having unscientific values have been clipped to their nearest naturally possible values. This maintains the integrity of the data while making sense out of the data.

In [None]:
def clip_temp(col, upper, lower):
    """Clip column ``col`` of ``train`` and ``test`` to the range ``[lower, upper]``.

    Note the argument order: ``upper`` comes before ``lower``. NaN entries are
    left as NaN. The clipped series is assigned back to the frame because
    ``frame[col].clip(..., inplace=True)`` acts on a temporary selection and is
    not guaranteed to modify the parent frame (it does not under pandas
    Copy-on-Write).

    Parameters
    ----------
    col : str
        Name of the column to clip; must exist in both frames.
    upper : float
        Largest value allowed after clipping.
    lower : float
        Smallest value allowed after clipping.
    """
    train[col] = train[col].clip(lower=lower, upper=upper)
    test[col] = test[col].clip(lower=lower, upper=upper)

In [None]:
# Column -> (upper, lower) clipping limits in °C.
clip_limits = {
    'shaft_temperature(°C)': (60, -25),
    'engine_temperature(°C)': (60, -25),
    'atmospheric_temperature(°C)': (60, -25),
    'area_temperature(°C)': (60, -25),
    'gearbox_temperature(°C)': (100, -25),
    'windmill_body_temperature(°C)': (100, -25),
}
for column, (upper_limit, lower_limit) in clip_limits.items():
    clip_temp(column, upper_limit, lower_limit)

### All temperatures related columns have been converted from Celsius to Fahrenheit. 

In [None]:
def cel_to_fah(col):
    """Convert column ``col`` from Celsius to Fahrenheit in ``train`` and ``test``.

    Uses the rounded constants ``(c + 17.778) / 0.556``; the air-density cell
    later inverts the conversion with the same constants. Returns ``None``.
    """
    for frame in (train, test):
        frame[col] = (frame[col] + 17.778) / 0.556

In [None]:
# Every temperature column is converted, including the ones that were not clipped.
temperature_columns = (
    'shaft_temperature(°C)',
    'gearbox_temperature(°C)',
    'windmill_body_temperature(°C)',
    'engine_temperature(°C)',
    'generator_temperature(°C)',
    'atmospheric_temperature(°C)',
    'area_temperature(°C)',
)
for temperature_column in temperature_columns:
    cel_to_fah(temperature_column)

### Pressure is converted from Pascals to Bar as the feature of interest is *atmospheric pressure*. This also helps in scaling the feature for future use. 

In [None]:
# Pascal -> bar (1 bar = 1e5 Pa); the column keeps its original "(Pascal)" name.
for frame in (train, test):
    frame['atmospheric_pressure(Pascal)'] = frame['atmospheric_pressure(Pascal)'] / 1e5

### Wind speed is touched up to make the values more natural. The values went as high as 600m/s which is not recorded ever. 

In [None]:
# Scale the recorded wind speed down by a factor of ten in both splits.
for frame in (train, test):
    frame['wind_speed(m/s)'] = frame['wind_speed(m/s)'] / 10

In [None]:
# Histogram (10 bins) of the rescaled training wind speed, to inspect its value range.
plt.hist(train['wind_speed(m/s)'], bins = 10)
plt.title('Distribution of wind speed')

### NOTE : The negative value can be a representation of wind flowing in opposite direction to the reference direction!

In [None]:
# Round every numeric column to two decimal places.
train, test = train.round(2), test.round(2)

### Used the *DTale* python library for charts, correlations and complete description of each feature present in the dataset. 
### This is arguably the most useful and efficient way of *exploratory data analysis*!

In [None]:
# import dtale
# dtale.show(train)

In [None]:
# Absolute Pearson correlation between all numeric columns.
# Non-numeric columns (ids, timestamps, categoricals) are excluded explicitly:
# DataFrame.corr() no longer drops them silently in pandas >= 2.0 and raises instead.
corr = train.select_dtypes(include='number').corr().abs()

# Features ranked by the strength of their correlation with the target.
core = corr['windmill_generated_power(kW/h)'].sort_values(ascending=False)
print(core)

plt.figure(figsize=(20, 10))
ax = sns.heatmap(corr, vmax=0.65, annot=True, linewidths=.5)

### We can see that the correlation are really strong for some pairs of features!
### Moreover, motor_torque(N-m), generator_temperature(°C), blades_angle(°), wind_direction(°), resistance(ohm), engine_temperature(°C) and rotor_torque(N-m) are *highly* correlated with the *target variable*.
### These features would play particularly important role in making predictions!

# ---------------------------------------------------------------------------------------------------------------

# Feature Generation
## All the attempts of feature generations are taken after extensive research on how power is generated using windmills. This included the working of windmills, components required to build a windmill, factors affecting power generation, geographical, environmental and technical limitations for the process, physics related to drag friction in air, wind movement, temperature changes and humidity. 
## Tons of websites, blogs and research papers were scanned to achieve this and make educated and informed decisions. 

## Air Density
### Air density directly accounts to power generation of windmill as it is used in the *ideal* formula for the same.
#### Pressure is first converted to Pascals from bar. 
#### 287.058 accounts for gas constant
#### Temperature is converted from Fahrenheit to Kelvin. 

In [None]:
# Ideal-gas air density: rho = P / (R_specific * T).
#   * pressure was rescaled to bar earlier, so multiply by 1e5 to get Pascals;
#   * temperature was converted to Fahrenheit earlier with (c + 17.778) / 0.556,
#     so invert with the same constants to get Celsius, then add 273.15 for Kelvin
#     (the offset was previously written as 273.3).
GAS_CONSTANT_DRY_AIR = 287.058  # J / (kg * K)
CELSIUS_TO_KELVIN = 273.15
for frame in (train, test):
    pressure_pa = frame['atmospheric_pressure(Pascal)'] * 1e5
    temperature_c = frame['area_temperature(°C)'] * .556 - 17.778
    frame['air_density'] = pressure_pa / (GAS_CONSTANT_DRY_AIR * (CELSIUS_TO_KELVIN + temperature_c))

# Generator Status

In [None]:
import plotly.graph_objs as go

# Generator temperature against motor torque, ordered by torque, complete rows only.
chart_data = (
    pd.concat([train['motor_torque(N-m)'], train['generator_temperature(°C)']], axis=1)
    .sort_values(['motor_torque(N-m)'])
    .rename(columns={'motor_torque(N-m)': 'x'})
    .dropna()
)

marker_style = {'size': 15, 'line': {'width': 0.5, 'color': 'white'}}
chart = go.Scattergl(
    x=chart_data['x'],
    y=chart_data['generator_temperature(°C)'],
    mode='markers',
    opacity=0.7,
    name='all',
    marker=marker_style,
)

layout = go.Layout({
    'legend': {'orientation': 'h'},
    'title': {'text': 'generator_temperature(°C) by motor_torque(N-m)'},
    'xaxis': {'title': {'text': 'motor_torque(N-m)'}},
    'yaxis': {'title': {'text': 'generator_temperature(°C)'}, 'type': 'linear'}
})
figure = go.Figure(data=[chart], layout=layout)
figure

### We can see how threshold for generator is 1001 Nm of motor torque (Hover over the graph!).
### This signifies the threshold value of motor torque after which power generation starts in the windmill.
### This can be seen as a reflection of "cut-in speed" of windmill.  

In [None]:
# 1 when motor torque exceeds 1000 N-m, otherwise 0 (a missing torque compares False -> 0).
for frame in (train, test):
    frame['generator_on'] = frame['motor_torque(N-m)'].gt(1000).astype('int64')

# Turbulence
### Atmospheric turbulence is the set of seemingly random and continuously changing air motions that are superimposed on the wind’s average motion and impacts wind energy. 
###  Turbulence is quantified with a metric called turbulence intensity which is calculated by the standard deviation of the horizontal wind speed divided by the average wind speed over some time period, typically 10 minutes.


In [None]:
# Stack both splits, order them chronologically and give the rows a fresh
# 0..n-1 integer index (named 'index') so that neighbours can be addressed by position.
df_turb = pd.concat([train, test]).sort_values('datetime')
df_turb['index'] = list(range(df_turb.shape[0]))
df_turb = df_turb.set_index('index')

### For this we sort the dataset using *datetime* as index. To our advantage, the time difference between 2 observations are more often than not 10 minutes.

### Turbulence is calculated only if the observations are 10 minutes apart. if-statements for this condition to be met are coded accordingly. 

In [None]:
def _turbulence_intensity(speeds, timestamps, step_seconds=600):
    """Return std/mean of wind speed over each row's immediate neighbourhood.

    For row ``i`` the window always contains ``speeds[i]``; the previous and the
    next row are added only when they are exactly ``step_seconds`` away in time.
    The first and last rows follow the same rule with their single neighbour.
    A window whose mean is 0 yields 0 instead of dividing by zero. NaN speeds
    propagate to a NaN result.

    Parameters
    ----------
    speeds : array-like of float
        Wind speeds, already sorted chronologically.
    timestamps : pandas.Series of datetime64
        Observation times aligned position-by-position with ``speeds``.
    step_seconds : int, default 600
        Sampling interval that qualifies two rows as neighbours.

    Returns
    -------
    list of float
        One turbulence value per input row.
    """
    speeds = np.asarray(speeds, dtype='float64')
    # gaps[i] = seconds between row i-1 and row i (NaN for the first row).
    gaps = pd.Series(timestamps).diff().dt.total_seconds().to_numpy()
    n_rows = len(speeds)

    values = []
    for pos in range(n_rows):
        window = [speeds[pos]]
        if pos > 0 and gaps[pos] == step_seconds:
            window.append(speeds[pos - 1])
        if pos < n_rows - 1 and gaps[pos + 1] == step_seconds:
            window.append(speeds[pos + 1])
        mean_speed = np.mean(window)
        values.append(0 if mean_speed == 0 else np.std(window) / mean_speed)
    return values


# Computed on the chronologically sorted frame itself: the previous version read
# the first window from ``train`` and the last one from hard-coded row labels.
df_turb['turbulence'] = _turbulence_intensity(df_turb['wind_speed(m/s)'], df_turb['datetime'])

In [None]:
# tracking_id -> turbulence lookup table.
df_turb = df_turb.set_index('tracking_id')
dict_turb = df_turb['turbulence'].to_dict()

In [None]:
# Bring the turbulence values back onto each split through the tracking id.
for frame in (train, test):
    frame["turbulence"] = frame["tracking_id"].map(dict_turb)

## After tweaking the wind direction feature, its sign now represents the direction of the wind, so wind speeds can be taken as their absolute values without any loss of information due to their sign.

In [None]:
# Keep only the magnitude of the wind speed.
for frame in (train, test):
    frame['wind_speed(m/s)'] = frame['wind_speed(m/s)'].abs()

# Hour and Month
## Hour of the day and months of a year can be decisive of winds speed(sunsets and rise cause significant change in temperature which in turn causes change in flow of wind. Similarly, Winds in monsoon are more intense than in summers.)
#### NOTE : As the geographical location of the establishments of windmill are unknown, it is difficult to group months to represent seasons, which vary across the globe. 

In [None]:
# Calendar features taken from the observation timestamp.
for frame in (train, test):
    stamp = frame['datetime'].dt
    frame['hour'] = stamp.hour
    frame['month'] = stamp.month

# Theoretical power output
## Tried to include a column which represents the theoretical power generated by the windmill. This feature however, did not contribute anything to the model and was thus commented out. 
#### Formula used :- 
#### power = [(air density) * (swept area of blades) * (wind speed cubed)] / 2. 
#### The area is in meters squared, air density is in kilograms per meters cubed and wind speed is in meters per second.

In [None]:
#train['expected_output'] = train['air_density']*((train['blade_length(m)']**2)*3.14)*(train['wind_speed(m/s)']**3)
#test['expected_output'] = test['air_density']*((test['blade_length(m)']**2)*3.14)*(test['wind_speed(m/s)']**3)
#train['expected_output'] = abs(train['expected_output'])
#test['expected_output'] = abs(test['expected_output'])

In [None]:
for frame in (train, test):
    # Physical quantities that cannot be negative: keep their magnitude.
    for column in ('air_density', 'blade_length(m)'):
        frame[column] = frame[column].abs()
    # Integer part of the tracking id: everything after its first three characters.
    frame['unique'] = frame['tracking_id'].str[3:].astype('int64')

# Filling NaNs
### Values are imputed while prioritizing the distribution of features across train and test data, and correlations amongst features. 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

### Dataset has quite a number of missing values. 

In [None]:
# Number of missing values per column in the training frame.
train.isnull().sum()

In [None]:
#train['gearbox_temperature(°C)'].fillna(train['gearbox_temperature(°C)'].mean(), inplace = True)
#test['gearbox_temperature(°C)'].fillna(test['gearbox_temperature(°C)'].mean(), inplace = True)

#train['blade_length(m)'].fillna(train['blade_length(m)'].mean(), inplace = True)
#test['blade_length(m)'].fillna(test['blade_length(m)'].mean(), inplace = True)

In [None]:
# Mean-impute atmospheric pressure, each split with its own mean.
# The filled series is assigned back: ``frame[col].fillna(..., inplace=True)`` works on
# a temporary selection and does not update the frame under pandas Copy-on-Write.
for frame in (train, test):
    pressure = frame['atmospheric_pressure(Pascal)']
    frame['atmospheric_pressure(Pascal)'] = pressure.fillna(pressure.mean())

### Motor torque and generator temperature have a correlation value of 0.94, therefore generator temperature is used to impute missing values of motor torque. 
### This is done by keeping in mind the importance of motor torque value with the target of the task, after properly comprehending the task in hand and reading multiple articles related to the domain. 

In [None]:
# Kept for the rest of the notebook; the imputation below no longer relies on it.
pd.options.mode.chained_assignment = None


def _impute_torque_from_generator_temp(frame, window=2):
    """Fill missing motor torque from rows with a similar generator temperature.

    For every row whose torque is missing but whose generator temperature is
    known, the torque becomes the mean torque of all rows whose generator
    temperature lies strictly within ``window`` degrees of that row's
    temperature. Rows are processed in index order and written immediately, so
    earlier fills take part in later means. ``frame`` is modified in place.

    Values are written with ``.loc`` instead of ``frame[col][i] = val``; the
    chained form assigns into a temporary object and is not guaranteed to reach
    the frame (it does not under pandas Copy-on-Write).
    """
    torque_col = 'motor_torque(N-m)'
    temp_col = 'generator_temperature(°C)'

    fillable = frame[torque_col].isnull() & frame[temp_col].notnull()
    for label in frame.index[fillable].to_list():
        centre = frame.loc[label, temp_col]
        similar = (frame[temp_col] > centre - window) & (frame[temp_col] < centre + window)
        frame.loc[label, torque_col] = frame.loc[similar, torque_col].mean()


_impute_torque_from_generator_temp(train)
_impute_torque_from_generator_temp(test)

In [None]:
#train.drop(labels='generator_temperature(°C)', axis=1, inplace=True)
#test.drop(labels='generator_temperature(°C)', axis=1, inplace=True)

### Processing *cloud_level* to analyze and impute its missing values.

In [None]:
# Ordinal codes for the three cloud levels; missing values are left untouched.
cloud_codes = {'Extremely Low':0, 'Low':2, 'Medium':3}
for frame in (train, test):
    frame['cloud_level'] = frame['cloud_level'].replace(cloud_codes)

In [None]:
# Target distribution per cloud level, then label missing levels on rows with
# generated power below 2.000001 as "Extremely Low" (0).
print(train.groupby('cloud_level')['windmill_generated_power(kW/h)'].describe())
low_power = train['windmill_generated_power(kW/h)'] < 2.000001
train.loc[low_power & train['cloud_level'].isnull(), 'cloud_level'] = 0

In [None]:
# The test split has no target, so motor torque below 1070 is used as the rule instead.
print(test.groupby('cloud_level')['motor_torque(N-m)'].describe())
low_torque = test['motor_torque(N-m)'] < 1070
test.loc[low_torque & test['cloud_level'].isnull(), 'cloud_level'] = 0

### After analysing the relation of *cloud level* with various feature, we see that there is a distinct demarcation of "extremely low" cloud type with other cloud types from the huge difference in distribution of the power generated by the windmill. 
### Also, since there is no real difference in the distribution of the target variable, or of any other variable, between the "Low" and "Medium" cloud types, they are treated the same. This helps the model generalize and train better.

In [None]:
# Remaining missing cloud levels are treated as "Medium" (3).
# Assigned back explicitly: ``frame[col].fillna(..., inplace=True)`` works on a temporary
# selection and does not update the frame under pandas Copy-on-Write.
for frame in (train, test):
    frame['cloud_level'] = frame['cloud_level'].fillna(3)

In [None]:
# Only the "Extremely Low" indicator is kept. It is computed directly instead of
# get_dummies + drop: dropping 'cloud_2.0'/'cloud_3.0' by name raised a KeyError
# whenever a level was absent from a split or the codes were not stored as floats.
# The column name 'cloud_0.0' is kept; it is appended last and the source column removed.
for frame in (train, test):
    frame['cloud_0.0'] = (frame['cloud_level'] == 0).astype('uint8')
    frame.drop(columns='cloud_level', inplace=True)

### Similarly it was found that turbine status does not influence the power generated by the windmill. 

In [None]:
# Mean target and mean motor torque for each turbine status.
status_groups = train.groupby('turbine_status')
print(status_groups['windmill_generated_power(kW/h)'].mean(),
      status_groups['motor_torque(N-m)'].mean())
#train.drop(labels='turbine_status', axis=1, inplace=True)
#test.drop(labels='turbine_status', axis=1, inplace=True)

In [None]:
from sklearn import preprocessing

# Label-encode turbine_status while keeping missing values as NaN.
# The encoder is fitted on the training labels, with NaN stringified to 'nan' so
# that it gets a code of its own and transform() does not fail on it.
le = preprocessing.LabelEncoder()
le.fit(train.turbine_status.astype('str'))

for frame in (train, test):
    # Remember where the value was missing *before* encoding and restore NaN there.
    # The previous version looked up the code of one sample NaN row: it read the
    # test row's label from ``train`` and failed when a split had no NaN at all.
    was_missing = frame['turbine_status'].isnull().to_numpy()
    codes = le.transform(frame['turbine_status'].astype('str')).astype('float64')
    codes[was_missing] = np.nan
    frame['turbine_status'] = codes

In [None]:
#for col in train.columns.to_list():
#    if train[col].dtype == 'float64':
#            print(col)
#            print(np.var(train[col]),'\n')

In [None]:
#train.drop(labels='blade_breadth(m)', axis=1, inplace=True)
#test.drop(labels='blade_breadth(m)', axis=1, inplace=True)

In [None]:
# Use the tracking id as the row index so it is not fed to the models as a feature.
train = train.set_index('tracking_id')
test = test.set_index('tracking_id')

# Models 

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import sklearn.metrics as metrics 
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import r2_score

# CatBoostRegressor  with HyperOpt

In [None]:
# Model inputs: everything except the raw timestamp.
xtrain = train.drop(columns='datetime')
xtest = test.drop(columns='datetime')

target_column = 'windmill_generated_power(kW/h)'
y = xtrain[target_column].values
X = xtrain.drop(columns=target_column)

# 70/30 hold-out split with a fixed seed, used to score hyper-parameter candidates.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.30)

In [None]:
# Hyperopt search space for CatBoostRegressor.
# 'n_estimators' and 'eval_metric' are fixed values; the other entries are sampled.
space = {
    # sampled on a log scale between 0.01 and 0.2
    'learning_rate':     hp.loguniform('learning_rate',np.log(0.01), np.log(0.2)),
    # quantised with step 1 between 5 and 16 (hyperopt returns it as a float)
    'max_depth':         hp.quniform("max_depth", 5, 16, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5,1),
    'n_estimators':      500,
    'eval_metric':       'RMSE',
    'reg_lambda' :       hp.uniform('reg_lambda', 0,1)
}

In [None]:
def objective(space):
    """Hyperopt objective: fit a CatBoostRegressor and score it on the hold-out split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space, with the keys 'n_estimators',
        'max_depth', 'colsample_bylevel', 'eval_metric', 'reg_lambda' and
        'learning_rate'.

    Returns
    -------
    dict
        ``loss`` is the negated score (hyperopt minimises), where the score is
        100 * R² on ``X_test`` floored at 0; ``status`` is ``STATUS_OK``;
        ``model`` is the fitted regressor.
    """
    clf = CatBoostRegressor(
        n_estimators=space['n_estimators'],
        # hp.quniform yields floats such as 7.0; tree depth is an integer, so cast it
        # the same way the XGBoost objective in this notebook does.
        max_depth=int(space['max_depth']),
        colsample_bylevel=space['colsample_bylevel'],
        eval_metric=space['eval_metric'],
        reg_lambda=space['reg_lambda'],
        learning_rate=space['learning_rate'],
    )

    evaluation = [(X_train, y_train), (X_test, y_test)]

    # Early stopping is intentionally not enabled here.
    clf.fit(X_train, y_train,
            eval_set=evaluation,
            verbose=False)

    pred = clf.predict(X_test)
    r2 = max(0, 100 * r2_score(y_test, pred))
    print("SCORE:", r2)
    return {'loss': -r2, 'status': STATUS_OK, 'model': clf}

## The process below — finding the best parameters for a particular "n_estimators", with and without early stopping — was run several times with "n_estimators" set to 500, 600, 750, 850, 900 and 1000. It is extremely time-consuming and computationally expensive, so I have not run it again.
## However, 3 cells below is the list (best_param_list) of some of the best parameters obtained by the above process.

In [None]:
# Run 200 evaluations of `objective` using the TPE suggestion algorithm.
# `trials` records every evaluation so the best candidates can be inspected afterwards.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 200,
                        trials = trials)

In [None]:
# Show the hyper-parameters returned by the search above.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Collect the sampled hyper-parameters of the trials with the ten lowest losses
# (ties included), in the order the trials were run.
best_param_list = []
sorted_trial_losses = sorted(trials.losses())
t = trials.trials

# Slicing instead of indexing [0]..[9] also works when fewer than ten trials were run.
top_losses = set(sorted_trial_losses[:10])
for i, trial in enumerate(t):
    loss = trial['result']['loss']
    if loss in top_losses:
        print(loss, '\t', i)
        print(trial['misc']['vals'])
        best_param_list.append(trial['misc']['vals'])

In [None]:
# Hand-picked CatBoost parameter sets from earlier hyperopt runs.
# Each row: colsample_bylevel, learning_rate, max_depth, reg_lambda, n_estimators,
# early_stopping_rounds. Trailing comment: n_estimators, WES/NES = with/no early
# stopping, public test score.
_CAT_PARAM_KEYS = ('colsample_bylevel', 'learning_rate', 'max_depth',
                   'reg_lambda', 'n_estimators', 'early_stopping_rounds')
_CAT_PARAM_ROWS = [
    (0.7840776236431249, 0.06646758913213958, 7.0, 0.781094012645824, 900, 10),       # 900 WES 97.112
    (0.9514227590176606, 0.04847664475604065, 8.0, 0.28689172062837387, 900, 10),     # 900 WES 97.116
    (0.7714687677253086, 0.1026578686058806, 7.0, 0.9403115684587895, 850, 10),       # 850 WES 97.1079
    (0.9018718841848762, 0.08907308842612886, 7.0, 0.2788033754905464, 850, None),    # 850 NES 97.12736
    (0.9927906240334108, 0.08354190505569654, 7.0, 0.23944221880461342, 850, None),   # 850 NES 97.106
    (0.9531240033491211, 0.07887402371682703, 8.0, 0.1528264376777892, 850, None),    # 850 NES 97.113
    (0.8165270016931238, 0.1074426160195262, 7.0, 0.8709247833299882, 600, None),     # 600 NES 97.101
    (0.9036631000437778, 0.11934909736210365, 7.0, 0.15195405997837974, 600, None),   # 600 NES 97.15204
    (0.9713918522977533, 0.06172263631910841, 8.0, 0.28629630830807196, 900, None),   # 900 NES 97.15198
    (0.7965427860702724, 0.05817673067609091, 7.0, 0.030299421668642385, 900, None),  # 900 NES 97.14199
    (0.9936898562667233, 0.10448908377830841, 7.0, 0.7821509420549156, 1000, None),   # 1000 NES 97.13968
]
# Same shape as hyperopt's trial['misc']['vals']: every value wrapped in a one-element list.
best_param_list = [
    {key: [value] for key, value in zip(_CAT_PARAM_KEYS, row)}
    for row in _CAT_PARAM_ROWS
]


In [None]:
# Train one CatBoost model per stored parameter set on the full training data and
# keep its predictions for the competition test set in `subs` (position -> predictions).
subs = {}
# Iterate over the list itself rather than a hard-coded range(0, 11), so that adding
# or removing a parameter set needs no further change.
for i, params in enumerate(best_param_list):
    cat = CatBoostRegressor(loss_function='RMSE',
                            eval_metric='R2',
                            random_seed=14,
                            colsample_bylevel=params['colsample_bylevel'][0],
                            learning_rate=params['learning_rate'][0],
                            # stored as a float (e.g. 7.0); tree depth is an integer
                            max_depth=int(params['max_depth'][0]),
                            reg_lambda=params['reg_lambda'][0],
                            n_estimators=params['n_estimators'][0],
                            # NOTE(review): fit() below gets no eval_set, so early stopping
                            # presumably has nothing to monitor - confirm against CatBoost docs.
                            early_stopping_rounds=params['early_stopping_rounds'][0]
                            )
    cat.fit(X, y, verbose=0)

    # R² on the full training data.
    p_y = cat.predict(X)
    print(100 * metrics.r2_score(y, p_y))
    # X_test is a subset of X, so this is also an in-sample score, not a validation score.
    p_y = cat.predict(X_test)
    print(100 * metrics.r2_score(y_test, p_y), '\n')

    p_y = cat.predict(xtest)
    subs[i] = p_y

In [None]:
# Average the predictions of all models in `subs`; the count comes from the dict
# itself instead of a hard-coded 11. Summation order is the insertion order.
p_y = sum(subs.values()) / len(subs)
print(p_y, '\n', p_y.shape)

In [None]:
# Submission file: tracking_id index, timestamp and the averaged prediction.
sample = pd.DataFrame(test[['datetime']]).astype({'datetime': 'datetime64[ns]'})
sample = sample.assign(**{'windmill_generated_power(kW/h)': p_y})
sample.to_csv('Predictions.csv')

In [None]:
import seaborn as sns

# Up to 50 most important features of the last CatBoost model fitted above.
ranked_features = sorted(zip(cat.feature_importances_, X_train.columns), reverse=True)
feature_imp = pd.DataFrame(ranked_features[:50], columns=['Value','Feature'])

plt.figure(figsize=(15,15))
ordered = feature_imp.sort_values(by="Value", ascending=False)
sns.barplot(x="Value", y="Feature", data=ordered)
plt.title('Catboost Features')
plt.tight_layout()
plt.show()


# XGBoost Regressor with HyperOpt

In [None]:
# Model inputs: everything except the raw timestamp.
xtrain = train.drop(columns='datetime')
xtest = test.drop(columns='datetime')

target_column = 'windmill_generated_power(kW/h)'
y = xtrain[target_column].values
X = xtrain.drop(columns=target_column)

# 75/25 hold-out split with a fixed seed, used for early stopping and scoring.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

In [None]:
# Hyperopt search space for XGBRegressor.
# 'n_estimators' and 'seed' are fixed values; the other entries are sampled.
# The hp.quniform entries are quantised with step 1 (hyperopt returns them as floats).
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
       # sampled on a log scale between 0.01 and 0.2
       'learning_rate':    hp.loguniform('learning_rate',np.log(0.01), np.log(0.2)),
       'gamma': hp.uniform ('gamma', 1,9),
       'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
       'reg_lambda' : hp.uniform('reg_lambda', 0,1),
       'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
       'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
       'n_estimators': 500,
       'seed': 14,
    }

In [None]:
def objective(space):
    """Hyperopt objective: fit an XGBRegressor and score it on the hold-out split.

    ``space`` is one sampled point of the search space. The model is trained on
    ``X_train`` with RMSE-based early stopping (patience 10) over the train and
    hold-out sets. The returned ``loss`` is the negated score, where the score
    is 100 * R² on ``X_test`` floored at 0; the fitted model is returned under
    ``model``.
    """
    # NOTE(review): recent XGBoost releases expect eval_metric / early_stopping_rounds
    # on the constructor rather than on fit() - confirm against the installed version.
    regressor = XGBRegressor(
        learning_rate=space['learning_rate'],
        colsample_bytree=space['colsample_bytree'],
        reg_lambda=space['reg_lambda'],
        min_child_weight=space['min_child_weight'],
        reg_alpha=space['reg_alpha'],
        gamma=space['gamma'],
        max_depth=int(space['max_depth']),  # hp.quniform yields floats
        n_estimators=space['n_estimators'],
        seed=space['seed'],
    )

    watchlist = [(X_train, y_train), (X_test, y_test)]
    regressor.fit(
        X_train,
        y_train,
        eval_set=watchlist,
        eval_metric="rmse",
        early_stopping_rounds=10,
        verbose=False,
    )

    holdout_score = 100 * r2_score(y_test, regressor.predict(X_test))
    r2 = max(0, holdout_score)
    print ("SCORE:", r2)
    return {'loss': -r2, 'status': STATUS_OK, 'model': regressor}

## The process below — finding the best parameters for a particular "n_estimators", with and without early stopping — was run several times with "n_estimators" set to 500, 600, 750, 850, 900 and 1000. It is extremely time-consuming and computationally expensive, so I have not run it again.
## However, 3 cells below is the list (hyper_param_list_xgb) of some of the best parameters obtained by the above process.

In [None]:
# Run 200 evaluations of `objective` using the TPE suggestion algorithm.
# `trials` records every evaluation so the best candidates can be inspected afterwards.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 200,
                        trials = trials)

In [None]:
# Show the hyper-parameters returned by the search above.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Collect the sampled hyper-parameters of the trials with the five lowest losses
# (ties included), in the order the trials were run.
hyper_param_list_xgb = []
sorted_trial_losses = sorted(trials.losses())
t = trials.trials
print("Best params for the top losses")
print(best_hyperparams)

# Slicing instead of indexing [0]..[4] also works when fewer than five trials were run.
top_losses = set(sorted_trial_losses[:5])
for trial in t:
    loss = trial['result']['loss']
    if loss in top_losses:
        print(loss)
        print(trial['misc']['vals'])
        hyper_param_list_xgb.append(trial['misc']['vals'])

In [None]:
# Hand-picked XGBoost parameter sets from earlier hyperopt runs.
# Each row: colsample_bytree, gamma, max_depth, reg_alpha, reg_lambda.
_XGB_PARAM_KEYS = ('colsample_bytree', 'gamma', 'max_depth', 'reg_alpha', 'reg_lambda')
_XGB_PARAM_ROWS = [
    (0.9892149956637211, 4.151339507172721, 14.0, 115.0, 0.7803914410527434),
    (0.9371155535338993, 3.35490402669362, 16.0, 117.0, 0.8993500199753666),
    (0.9463180516464882, 4.644028461064384, 15.0, 70.0, 0.7966394133180845),
    (0.9296947166702829, 2.5828440605645158, 15.0, 114.0, 0.7577023363874883),
    (0.9590719842000492, 2.362607579231714, 17.0, 111.0, 0.8373380626534839),
]
# Same shape as hyperopt's trial['misc']['vals']: every value wrapped in a one-element list.
hyper_param_list_xgb = [
    {key: [value] for key, value in zip(_XGB_PARAM_KEYS, row)}
    for row in _XGB_PARAM_ROWS
]

In [None]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

# Train one XGBoost model per stored parameter set and keep its predictions for the
# competition test set in `subs_xgb` (position -> predictions).
subs_xgb = {}
# Iterate over the list itself rather than a hard-coded range(0, 5), so that adding
# or removing a parameter set needs no further change.
for i, params in enumerate(hyper_param_list_xgb):
    model = XGBRegressor(objective='reg:squarederror',
                         gamma=params['gamma'][0],
                         colsample_bytree=params['colsample_bytree'][0],
                         max_depth=int(params['max_depth'][0]),  # stored as a float
                         reg_lambda=params['reg_lambda'][0],
                         reg_alpha=params['reg_alpha'][0],
                         n_estimators=500
                         )
    # NOTE(review): recent XGBoost releases expect eval_metric / early_stopping_rounds
    # on the constructor rather than on fit() - confirm against the installed version.
    model.fit(X_train, y_train, verbose=0,
              eval_metric='rmse', early_stopping_rounds=10,
              eval_set=[(X_train, y_train), (X_test, y_test)])

    # R² on all labelled data (training rows plus the hold-out rows).
    p_y = model.predict(X)
    print(100 * metrics.r2_score(y, p_y))
    # R² on the hold-out split, which also drives early stopping above.
    p_y = model.predict(X_test)
    print(100 * metrics.r2_score(y_test, p_y), '\n')

    p_y = model.predict(xtest)
    subs_xgb[i] = p_y

In [None]:
# Average the predictions of all models in `subs_xgb`; the count comes from the dict
# itself instead of a hard-coded 5. Summation order is the insertion order.
p_y = sum(subs_xgb.values()) / len(subs_xgb)
print(p_y, '\n', p_y.shape)

# Submission file: tracking_id index, timestamp and the averaged prediction.
# An explicit copy keeps the new column from being assigned onto a slice of `test`.
sample = test[['datetime']].copy()
sample['datetime'] = sample['datetime'].astype('datetime64[ns]')
sample['windmill_generated_power(kW/h)'] = p_y
sample.to_csv('Predictions1.csv')