### Importing all the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor


### Reading the data

In [None]:
# Load the labelled training data (`df`) and the unlabelled competition test
# data (`test`). Paths are relative to the notebook's working directory
# (presumably the Kaggle `../input` layout -- adjust when running elsewhere).
df=pd.read_csv("../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv")
test = pd.read_csv("../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Summary statistics of the numeric training columns, before any cleaning.
df.describe()

### Checking for null values

In [None]:
# Number of missing values per training column.
df.isnull().sum()

In [None]:
# List the column names; they are referenced verbatim (units included) below.
df.columns

### Generating heatmap through correlations

In [None]:
# Correlate numeric columns only. At this point the frame still contains
# string columns (IDs, timestamps, categorical statuses); recent pandas
# versions raise on those instead of silently skipping them.
corr_data = df.select_dtypes(include='number').corr()

# Plot absolute correlations so strong negative relationships stand out as
# clearly as strong positive ones.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr_data.abs(), annot=True, fmt='.3f', cmap='hot', square=True, ax=ax)
ax.set_title('Absolute pairwise correlation of numeric features')
plt.show()

### Distribution of the dependent variable (power generated by the windmill)

In [None]:
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` draws the
# same histogram + KDE overlay. Missing targets are dropped explicitly because
# the target column has not been cleaned yet at this point.
fig, ax = plt.subplots()
sns.histplot(df['windmill_generated_power(kW/h)'].dropna(), kde=True, stat='density', ax=ax)
ax.set_title('Distribution of windmill_generated_power(kW/h)')
plt.show()

### Handling missing data: median for numeric features, "Unknown" for categorical features

In [None]:
# Impute missing values in the training frame.
# Numeric features are filled with their own column median; categorical
# features get an explicit "Unknown" level. The column lists replace the
# original one-statement-per-column copy-paste (which filled
# 'rotor_torque(N-m)' twice).
train_numeric_fill_cols = [
    'wind_speed(m/s)',
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'gearbox_temperature(°C)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'generator_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'windmill_body_temperature(°C)',
    'wind_direction(°)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'windmill_height(m)',
]
train_categorical_fill_cols = ['turbine_status', 'cloud_level']

# `fillna` with a Series aligns on column name, so each column receives its
# own median.
df[train_numeric_fill_cols] = df[train_numeric_fill_cols].fillna(df[train_numeric_fill_cols].median())
df[train_categorical_fill_cols] = df[train_categorical_fill_cols].fillna("Unknown")

In [None]:
# Impute missing values in the competition test frame.
# Numeric gaps are filled with the medians of the TRAINING frame (`df`), not
# of the test frame itself: imputation statistics are part of the fitted
# preprocessing and must be identical for train and test so the models see
# consistently transformed features.
test_numeric_fill_cols = [
    'wind_speed(m/s)',
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'gearbox_temperature(°C)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'generator_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'windmill_body_temperature(°C)',
    'wind_direction(°)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'windmill_height(m)',
    'area_temperature(°C)',
]
test_categorical_fill_cols = ['turbine_status', 'cloud_level']

# `median()` skips NaNs, and `fillna` with a Series aligns on column name.
test[test_numeric_fill_cols] = test[test_numeric_fill_cols].fillna(df[test_numeric_fill_cols].median())
test[test_categorical_fill_cols] = test[test_categorical_fill_cols].fillna("Unknown")

In [None]:
# Summary statistics again, now that the missing-value fill has been applied.
df.describe()

In [None]:
# Discard every training row that still contains at least one missing value
# in any column.
df = df.dropna(axis='index', how='any')


In [None]:
# Re-check per-column null counts in the training frame after the row drop.
df.isnull().sum()

In [None]:
# Per-column null counts in the test frame (no rows are dropped from `test`).
test.isnull().sum()

### Using label encoder for categorical variables

In [None]:
# Encode the categorical columns as integers.
# One encoder per column is fitted on the union of train and test values and
# then applied to both frames, so a given category maps to the same integer
# everywhere. Fitting separately on each frame (as before) can assign
# different codes to the same category whenever the two frames do not contain
# exactly the same set of levels.
for col in ['turbine_status', 'cloud_level']:
    labelencoder = LabelEncoder()
    labelencoder.fit(pd.concat([df[col], test[col]], ignore_index=True))
    df[col] = labelencoder.transform(df[col])
    test[col] = labelencoder.transform(test[col])

In [None]:
## Parse the raw "datetime" column into pandas datetime values in both frames
for frame in (df, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

In [None]:
## Derive calendar features (month, day of month, day of week) from "datetime"
## and store them as new columns in both frames
datetime_parts = {
    'dmonth': 'month',
    'dday': 'day',
    'ddayofweek': 'dayofweek',
}
for frame in (df, test):
    for new_col, part in datetime_parts.items():
        frame[new_col] = getattr(frame['datetime'].dt, part)

### Generating train and test datasets

In [None]:
# Feature matrix: everything except the target, the row identifier and the
# raw timestamp (its calendar parts were already extracted as features).
non_feature_cols = ['windmill_generated_power(kW/h)', 'tracking_id', 'datetime']
X = df.drop(columns=non_feature_cols)

# Target as a flat 1-D NumPy array.
y = df['windmill_generated_power(kW/h)'].values.ravel()

In [None]:
# Competition feature matrix: the test frame has no target column, so only
# the identifier and the raw timestamp are removed.
submission_only_cols = ['tracking_id', 'datetime']
X_TEST = test.drop(columns=submission_only_cols)

In [None]:
#scaler = RobustScaler()
#X = scaler.fit_transform(X)
#X_TEST = scaler.transform(X_TEST)

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Report (features, target) shapes for the full set and both partitions.
for features, target in ((X, y), (X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

### Implementing Linear Regressor

In [None]:
# Plain linear regression with default settings -- the first model fitted.
model=LinearRegression()

In [None]:
# Fit on the training split; `model_1` is the name used for scoring below.
model_1=model.fit(X_train,y_train)

In [None]:
# `score()` on a regressor returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print('R^2 score for training: {:.4f}'.format(model_1.score(X_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(model_1.score(X_test, y_test)))

### Implementing Gradient Boosting

In [None]:
# Gradient-boosted trees using plain squared-error split quality.
# 'squared_error' is the current name of the criterion formerly spelled
# 'mse'; the old spelling is rejected by recent scikit-learn releases
# (requires scikit-learn >= 1.0).
gb = GradientBoostingRegressor(
    criterion='squared_error',
    random_state=0,
    max_depth=5,
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=2,
)
gb_1 = gb.fit(X_train, y_train)

In [None]:
# `score()` on a regressor returns R^2, not accuracy; label it as such.
print('R^2 score for training: {:.4f}'.format(gb_1.score(X_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(gb_1.score(X_test, y_test)))

### Implementing Random Forest

In [None]:
# Random forest with default hyper-parameters. The seed is fixed so that the
# fitted trees -- and every score or stacked prediction derived from `rf_2` --
# are reproducible on a fresh run.
rf = RandomForestRegressor(random_state=0)
rf_2 = rf.fit(X_train, y_train)

In [None]:
# `score()` on a regressor returns R^2, not accuracy; label it as such.
print('R^2 score for training: {:.4f}'.format(rf_2.score(X_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(rf_2.score(X_test, y_test)))

### Hyperparameter tuning with RandomizedSearchCV (not very effective)

```python
n_estimators = [int(x) for x in np.linspace(start = 5 , stop = 15, num = 10)] # returns 10 numbers 

max_features = ['auto', 'log2']


bootstrap = [True, False]

r_grid = {'n_estimators': n_estimators,

               'max_features': max_features,

               'max_depth': [3,5,8,10],

               'bootstrap': bootstrap}

print(r_grid)

rfr = RandomForestRegressor(random_state = 1)

rfr_random = RandomizedSearchCV(estimator=rfr, param_distributions=r_grid, n_iter = 20, scoring='neg_mean_absolute_error', cv = 3, verbose=2, random_state=42, n_jobs=-1, return_train_score=True)

rfr_random.fit(X_train, y_train);

print(rfr_random.best_params_)

rf = RandomForestRegressor(n_estimators = 12, max_features = 'auto', max_depth = 10, bootstrap = True)
rf_1 = rf.fit(X_train,y_train)

print('Accuracy score for training: {:.4f}'.format(rf_1.score(X_train,y_train)))
print('Accuracy score for testing: {:.4f}'.format(rf_1.score(X_test,y_test)))
```

### Implementing Extra trees regressor

In [None]:
# Extremely randomised trees. The split criterion is left at the library
# default (squared error): the explicit 'mse' spelling is rejected by recent
# scikit-learn releases, while the default selects the same criterion under
# whichever name the installed version uses.
et = ExtraTreesRegressor(
    random_state=0,
    n_jobs=-1,
    min_samples_leaf=1,
    max_depth=20,
    min_samples_split=3,
    n_estimators=1000,
)

et_1 = et.fit(X_train, y_train)


In [None]:
# `score()` on a regressor returns R^2, not accuracy; label it as such.
print('R^2 score for training: {:.4f}'.format(et_1.score(X_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(et_1.score(X_test, y_test)))

### Implementing XGBoost Regressor(Vanilla)

In [None]:
#xgb = XGBRegressor(colsample_bytree=1,gamma=0.5,max_depth=10,min_child_weight=5,subsample=0.8)
#xgb_1 = xgb.fit(X_train,y_train)
#print('Accuracy score for training: {:.4f}'.format(xgb_1.score(X_train,y_train)))
#print('Accuracy score for testing: {:.4f}'.format(xgb_1.score(X_test,y_test)))

### Parameters after hyperparameter tuning using gridsearch

In [None]:
# Parameter grid for XGBoost hyper-parameter tuning.
# Previously only the opening `param_grid = {` line was commented out, which
# left the dictionary body dangling and made this cell a syntax error on
# "Run All". Defining the dict is cheap and has no side effects.
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [100, 200, 500],
    'objective': ['reg:squarederror'],
}

In [None]:
#r2 = XGBRegressor(random_state= 11)
#grid_search2 = GridSearchCV(estimator = r2, param_grid = param_grid,cv = 3, n_jobs = -1)
#grid_search2.fit(X_train, y_train)

In [None]:
#grid_search2.best_params_

In [None]:
#xgb = XGBRegressor(colsample_bytree=1,gamma=0.5,max_depth=5,min_child_weight=1,subsample=1)
#xgb_1 = xgb.fit(X_train,y_train)
#print('Accuracy score for training: {:.4f}'.format(xgb_1.score(X_train,y_train)))
#print('Accuracy score for testing: {:.4f}'.format(xgb_1.score(X_test,y_test)))

In [None]:
#xgb = XGBRegressor(colsample_bytree=0.7,max_depth=7,learning_rate = 0.1,min_child_weight=3,subsample=0.7,n_estimators = 200,objective= 'reg:squarederror')
#xgb_1 = xgb.fit(X_train,y_train)
#print('Accuracy score for training: {:.4f}'.format(xgb_1.score(X_train,y_train)))
#print('Accuracy score for testing: {:.4f}'.format(xgb_1.score(X_test,y_test)))

In [None]:
# Final XGBoost configuration: 500 depth-5 trees with light L1/L2
# regularisation on the leaf weights.
xgb = XGBRegressor(
    n_estimators=500,
    max_depth=5,
    booster='gbtree',
    n_jobs=-1,
    learning_rate=0.1,
    reg_lambda=0.01,
    reg_alpha=0.3,
)
xgb_1 = xgb.fit(X_train, y_train)

# `score()` on a regressor returns R^2, not accuracy; label it as such.
print('R^2 score for training: {:.4f}'.format(xgb_1.score(X_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(xgb_1.score(X_test, y_test)))

### Implementing Stacking

In [None]:
# Lasso (L1-regularised linear model) used as the stacking meta-learner: it is
# fitted below on the base models' predictions rather than on raw features.
meta_model = linear_model.Lasso(alpha=0.1,tol = 0.001, random_state= 0)

In [None]:
# Base learners whose predictions become the meta-model's input features.
base_models = [gb_1, xgb_1, rf_2]

# Training features for the meta-model must be OUT-OF-FOLD predictions.
# Predicting X_train with models that were already fitted on X_train yields
# near-perfect, overfit columns, so the meta-model would simply learn to trust
# whichever base learner memorised the training data best.
# `cross_val_predict` clones each estimator, so the fitted base models are
# left untouched. cv=3 keeps the extra refits affordable.
X_stack_train = np.column_stack(
    [cross_val_predict(base, X_train, y_train, cv=3) for base in base_models]
)

# Hold-out features come from the base models fitted on the full training
# split, which never saw X_test.
X_stack_test = np.column_stack([base.predict(X_test) for base in base_models])

meta_model_1 = meta_model.fit(X_stack_train, y_train)

In [None]:
# `score()` on a regressor returns R^2, not accuracy; label it as such.
print('R^2 score for training: {:.4f}'.format(meta_model_1.score(X_stack_train, y_train)))
print('R^2 score for testing: {:.4f}'.format(meta_model_1.score(X_stack_test, y_test)))

### Predicting test data and exporting it

In [None]:
# Identifier columns for the submission file. Take an explicit copy: a
# prediction column is added to `sub` later, and assigning into a column
# subset of `test` would otherwise trigger pandas' SettingWithCopyWarning.
sub = test[['tracking_id', 'datetime']].copy()

In [None]:
# Predict the competition test features with the XGBoost model fitted above.
# Note that the submission uses this single model, not the stacked ensemble.
predictions = xgb.predict(X_TEST)

In [None]:
# Inspect the predicted values.
predictions

In [None]:
# Add the predictions as the target column of the submission frame; rows are
# in the same order as `test`, from which both `sub` and `X_TEST` were built.
sub['windmill_generated_power(kW/h)'] = predictions

In [None]:
# Write the submission CSV (with header, without the pandas index) to the
# current working directory, then display the frame as a final check.
sub.to_csv('./s3.csv',header=True,index=False)
sub