**A Fine Windy Day: HackerEarth Machine Learning challenge**

### **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from scipy import stats
from scipy.stats import kurtosis
from scipy.stats import skew
#import optuna
import sklearn
import pickle
import warnings
warnings.filterwarnings("ignore")

### **Read & Understand Data**

In [None]:
train = pd.read_csv("../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv")
train.head()

In [None]:
test = pd.read_csv("../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv")
test.head()

In [None]:
print("Train data contains % 2d rows and % 2d columns" %(train.shape[0],train.shape[1]), "\n")
print("Test data contains {} rows and {} columns" .format(test.shape[0],test.shape[1]))

In [None]:
# Report every training column that has no counterpart in the test data.
missing_in_test = [name for name in train.columns if name not in test.columns]
for name in missing_in_test:
    print("Column not present in the Test Data is: ", name)

### **EDA for Train Data**

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
train.hist(bins=50, figsize=(15, 15))

In [None]:
# Skewness only exists for numeric columns. Restricting explicitly keeps the
# call from raising on the string columns (tracking_id, datetime, ...) with
# pandas >= 2.0, and yields the same table on older versions.
train.skew(numeric_only=True)

In [None]:
#print(data_train.isnull().any())
print(train.isnull().sum())

In [None]:
sns.set(rc={'figure.figsize':(5,5)})
sns.distplot(train['windmill_generated_power(kW/h)'], bins=30)
plt.show()

### **EDA for Test Data**

In [None]:
test.info()

In [None]:
test.describe()

In [None]:
test.hist(bins=50, figsize=(15, 15))

In [None]:
# Skewness only exists for numeric columns. Restricting explicitly keeps the
# call from raising on the string columns (tracking_id, datetime, ...) with
# pandas >= 2.0, and yields the same table on older versions.
test.skew(numeric_only=True)

In [None]:
#print(data_train.isnull().any())
print(test.isnull().sum())

### **Merging Two Data for Data Pre-Processing**

In [None]:
# Stack train and test so both receive identical pre-processing. The "ind"
# column remembers the origin of each row so the frames can be split again.
trainNew = train.copy()
testNew = test.copy()
# The test rows have no target; concat leaves it as NaN for them, which keeps
# fake values out of the target statistics (no placeholder 0 is injected).
# ignore_index=True gives every row a unique label: both CSVs are read with a
# default 0..n-1 index, so plain concatenation would duplicate labels.
df = pd.concat(
    [trainNew.assign(ind="train"), testNew.assign(ind="test")],
    ignore_index=True,
)
df

### **Identifying the Numerical and Categorical Columns**

In [None]:
# Split the column names by dtype: object columns count as categorical,
# every other dtype as numerical. Column order is preserved.
is_object_column = (df.dtypes == object).to_numpy()

categoricalData = df.columns[is_object_column].tolist()
print("Categorical Columns in data are: ", categoricalData)

numericalData = df.columns[~is_object_column].tolist()
print("Numerical Columns in data are: ", numericalData)

### **Cleaning Data**

#### **Removing negative values from Data**

Some columns cannot physically take negative values, so negative readings in those features are replaced by their absolute value.

In [None]:
print(df[(df['wind_speed(m/s)'] < 0 )].shape)
print(df[(df['blade_length(m)'] < 0 )].shape)
print(df[(df['blade_breadth(m)'] < 0 )].shape)
print(df[(df['windmill_height(m)'] < 0 )].shape)

In [None]:
# These physical quantities cannot be negative, so negative readings are
# replaced by their absolute value. Each column is rectified from its OWN
# values: the windmill height must not be derived from the blade length.
for column in ['wind_speed(m/s)', 'blade_length(m)', 'windmill_height(m)']:
    df[column] = np.absolute(df[column])

In [None]:
# Print the quartiles and the 1.5 * IQR fences of every numerical column;
# NaNs are ignored when computing the percentiles.
for column in numericalData:
    quartiles = np.nanpercentile(df[column], [25, 75])
    Q1, Q3 = quartiles[0], quartiles[1]
    IQR = Q3 - Q1
    cutOff = IQR * 1.5
    lower_range, upper_range = Q1 - cutOff, Q3 + cutOff
    print("Column is {} Q1 Value is {} Q3 value is {} lower bound is {} and upper bound is {}".format(column,Q1,Q3,lower_range,upper_range))

#### **Handling outliers**

Values that fall outside the lower/upper bound are replaced with the mean of the column's Q1 and Q3 values.

In [None]:
df.loc[df['wind_speed(m/s)'] > 186, ['wind_speed(m/s)']] = (34+96)/2

In [None]:
print(df[(df['blade_length(m)'] > 7)].shape)
df.loc[df['blade_length(m)'] > 7, ['blade_length(m)']] = (2+3)/2

In [None]:
print(df[(df['area_temperature(°C)'] < 11)].shape)
df.loc[df['area_temperature(°C)'] < 11 , ['area_temperature(°C)']] = (27+38)/2

In [None]:
# Low engine-temperature outliers are replaced by the midpoint (42+45)/2.
# One mask drives both the count and the replacement, so the threshold that
# is reported (38) is exactly the threshold that is applied.
engine_temp_too_low = df['engine_temperature(°C)'] < 38
print(df[engine_temp_too_low].shape)
df.loc[engine_temp_too_low, ['engine_temperature(°C)']] = (42+45)/2

In [None]:
sns.scatterplot(x='gearbox_temperature(°C)',y='windmill_generated_power(kW/h)',data=df)

In [None]:
# Gearbox temperatures below -200 or above 300 are replaced by the column
# mean (computed before the replacement, outliers included).
gearbox = df['gearbox_temperature(°C)']
gearbox_too_cold = gearbox < -200
gearbox_too_hot = gearbox > 300
print(df[gearbox_too_cold].shape)
print(df[gearbox_too_hot].shape)
df.loc[gearbox_too_cold | gearbox_too_hot, ['gearbox_temperature(°C)']] = gearbox.mean()

In [None]:
df['atmospheric_temperature(°C)'].plot(kind='kde')

In [None]:
 print(df[(df['atmospheric_temperature(°C)'] < -50)].shape)
 #df['atmospheric_temperature(°C)'].mean()

### **Handling Null Values in Numerical data**

In [None]:
# Impute missing numeric values with the column mean. The target column is
# left untouched so that missing labels stay missing.
for column in numericalData:
    if column != 'windmill_generated_power(kW/h)':
        column_mean = df[column].mean()
        df[column] = df[column].fillna(column_mean)

df.isnull().sum()

### **Handling Null Values in Categorical Data**

##### **Finding unique value for the categorical columns**

In [None]:
# Show the distinct values of each categorical column, skipping the
# identifier and timestamp columns.
for col in categoricalData:
    if col in ('tracking_id', 'datetime'):
        continue
    print("Unique values for: ", col)
    print(df[col].unique())
    print("\n")

In [None]:
print('Total null values in Cloud Level Feature: {}'.format(df['cloud_level'].isnull().sum()))
print('Total null values in Turnbine Status Feature: {}'.format(df['turbine_status'].isnull().sum()))

##### **Replace Null values with MODE**

In [None]:
# Impute missing categorical values with the most frequent value of the
# column. Identifier, timestamp and train/test marker columns are skipped.
not_imputed = ('tracking_id', 'datetime', 'ind')
for column in categoricalData:
    if column in not_imputed:
        continue
    modeValue = df[column].mode()[0]
    print('Mode for {} is: {}'.format(column, modeValue), "\n")
    df[column] = df[column].fillna(modeValue)

df.isnull().sum()

### **Checking for duplicated rows, as they won't give any insight**

In [None]:
df.duplicated().any()

### **Converting Categorical Data to Numerical Data**

In [None]:
df = pd.get_dummies(df, columns=['turbine_status','cloud_level'])
df.head(1)

### **Converting date to datetime format and splitting its value into month, day, year, hour and minute**



In [None]:
df['datetimeNew'] = pd.to_datetime(df['datetime'])
df.head(1)

In [None]:
# Expand the parsed timestamp into separate calendar/clock columns, then
# discard the helper datetime column.
timestamp_parts = df['datetimeNew'].dt
for part in ('month', 'day', 'year', 'hour', 'minute'):
    df[part] = getattr(timestamp_parts, part)
df.drop(columns='datetimeNew', inplace=True)
df.head(1)

### **z-score**

In [None]:
# For each numerical column, report the shape of the rows whose |z-score|
# is below 3.
for column in numericalData:
    column_z = stats.zscore(df[column])
    within_three_sigma = np.abs(column_z) < 3
    z_score_Count = df[within_three_sigma].shape
    print("Column: ", column,"\t\t", "Z-score: ", z_score_Count)

In [None]:
# df.loc[(np.abs(stats.zscore(df['atmospheric_pressure(Pascal)']))) > 3, ['atmospheric_pressure(Pascal)']] = df['atmospheric_pressure(Pascal)'].mean()
# df.loc[(np.abs(stats.zscore(df['atmospheric_pressure(Pascal)']))) > 3].shape

In [None]:
# Columns whose values beyond three standard deviations are replaced by the
# column mean; afterwards the remaining |z| > 3 rows are reported.
zscoreCol = [
    'engine_temperature(°C)',
    'shaft_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'windmill_height(m)',
]
for column in zscoreCol:
    beyond_three_sigma = np.abs(stats.zscore(df[column])) > 3
    df.loc[beyond_three_sigma, [column]] = df[column].mean()
    # z-scores are recomputed because the replacement shifts mean and std.
    still_beyond = np.abs(stats.zscore(df[column])) > 3
    print("Column: ", column, "Shape: ", df.loc[still_beyond].shape)

### **Handling Skewness of the Data**

In [None]:
# Reduce skewness with a Yeo-Johnson power transform (standardised output).
# The transformer is refit on every column, one column at a time.
skewedColumns = [
    'wind_speed(m/s)',
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'engine_temperature(°C)',
    'windmill_body_temperature(°C)',
    'rotor_torque(N-m)',
    'blade_length(m)',
]
power = PowerTransformer(method='yeo-johnson', standardize=True)
for column in skewedColumns:
    # scikit-learn expects a 2-D array, hence the single-column reshape.
    df[column] = power.fit_transform(df[column].values.reshape(-1, 1))

df.head()

### **Splitting Test and Train Data**

In [None]:
# Split the combined frame back into its two origins. drop() returns new
# frames, so nothing is modified in place on a filtered slice of df (which
# is what triggers pandas' SettingWithCopy warning).
data_test = df[df["ind"].eq("test")].drop(columns=['ind', 'windmill_generated_power(kW/h)'])
data_train = df[df["ind"].eq("train")].drop(columns=['ind'])
print("Test Data shape: ", data_test.shape, "\n")
print('Train Data shape: ', data_train.shape)

In [None]:
data_train = data_train.dropna(how='any',axis=0)
data_train.shape

### **Separating features and labels**

In [None]:
newDataFrame = data_train.drop(['tracking_id','datetime','motor_torque(N-m)','windmill_generated_power(kW/h)', 'windmill_body_temperature(°C)'], axis=1)

In [None]:
features = newDataFrame
label = data_train['windmill_generated_power(kW/h)']

### **Feature Engineering**

In [None]:
 plt.figure(figsize=(30,20))
 # Correlate numeric and boolean columns only: data_train still contains the
 # string columns tracking_id and datetime, on which corr() raises with
 # pandas >= 2.0 (older versions dropped them silently).
 sns.heatmap(data_train.select_dtypes(include=['number', 'bool']).corr(),annot=True,cmap='BuGn_r',fmt='.2f')

In [None]:
def calc_vif(X):
    """Compute the variance inflation factor (VIF) of every column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        Two columns: "variables" (the column names of X) and "VIF".
    """
    # A float matrix is built once: mixing float and bool dummy columns would
    # otherwise give an object-dtype array that statsmodels cannot process.
    design = X.to_numpy(dtype=float)
    # A fresh frame is returned on each call instead of writing into a shared
    # module-level frame, so the function can be reused with any column count.
    return pd.DataFrame({
        "variables": X.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })

X = newDataFrame
vif = calc_vif(X)
vif_high = vif.sort_values(by = 'VIF', ascending=False)
vif_high

In [None]:
# Rank the features by recursive feature elimination (one feature removed
# per step) around a plain linear regression fitted on all features.
modelLR = LinearRegression()
selectFeaturesFromRFE = RFE(estimator=modelLR, step=1).fit(features, label)

# Rank 1 marks the features RFE kept; higher ranks were eliminated earlier.
print(selectFeaturesFromRFE.ranking_)


In [None]:
# Select features with SelectFromModel around a plain linear regression
# fitted on all features.
modelLR = LinearRegression()
selectFeaturesFromSFM = SelectFromModel(modelLR).fit(features, label)

# Boolean mask over the feature columns: True marks a selected feature.
print(selectFeaturesFromSFM.get_support())


### **Feature Engineering - OLS**

In [None]:
import statsmodels.regression.linear_model as sm
# Fit an ordinary least squares model to inspect the p-value of each feature.
# The features are cast to float first: with bool dummy columns the design
# matrix would become object dtype, which statsmodels rejects.
regressor_OLS = sm.OLS(endog = label, exog = features.astype(float)).fit()
regressor_OLS.summary()

In [None]:
 featureFinal = ['minute', 'windmill_height(m)', 'shaft_temperature(°C)', 'cloud_level_Extremely Low', 'turbine_status_D', 'turbine_status_B', 'turbine_status_BBB', 'turbine_status_ABC', 'turbine_status_AB', 'turbine_status_AAA', 'turbine_status_BB', 'turbine_status_BCB', 'turbine_status_B2', 'turbine_status_A2', 'turbine_status_A', 'turbine_status_BD', 'turbine_status_AC', 'turbine_status_BA']
# featureFinal.append('turbine_status_BA')
# print(featureFinal)

The featureFinal list above (the features to drop) was generated using OLS: features were eliminated one at a time until all remaining features had a p-value < 0.05.

In [None]:
# features.drop(['turbine_status_BA'], axis=1, inplace=True)
# regressor_OLS = sm.OLS(endog = label, exog = features).fit()
# regressor_OLS.summary()

### **Final feature building**

In [None]:
# Remove the features listed in featureFinal. drop() returns a new frame, so
# newDataFrame (and its alias `features`) stays intact and this cell can be
# re-run without raising KeyError on columns that were already removed.
selectedFeatures = newDataFrame.drop(columns=featureFinal)

selectedFeatures.columns

In [None]:
# Apply to the test data the same column removals as to the training data:
# first the identifier/excluded columns, then the features in featureFinal.
newDataFrameTest = data_test.drop(['tracking_id','datetime','motor_torque(N-m)', 'windmill_body_temperature(°C)'], axis=1)
# Non-mutating drop keeps newDataFrameTest intact and the cell re-runnable.
featuresTest = newDataFrameTest.drop(columns=featureFinal)

featuresTest.columns

### **APPLYING STANDARD SCALER**

In [None]:
#initialize scalar
standardScaler = StandardScaler()
selectedFeatures = standardScaler.fit_transform(selectedFeatures)
featuresTest = standardScaler.fit_transform(featuresTest)

In [None]:
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# selectedFeatures = scaler.fit_transform(selectedFeatures)
# featuresTest = scaler.transform(featuresTest)

### **Building the Model**

#### **Train Test Split with selected data**

##### **Finding the optimum random state**

In [None]:
# Try split seeds 1..19 and print the R^2 of a default random forest on the
# training part and on the held-out 20% for each seed.
for i in range(1, 20):
    split = train_test_split(selectedFeatures, label, test_size=0.2, random_state=i)
    X_train, X_test, y_train, y_test = split

    model1 = RandomForestRegressor()
    model1.fit(X_train, y_train)

    train_score = model1.score(X_train, y_train)
    test_score = model1.score(X_test, y_test)
    print("Test: {} , Train: {} , RS : {}".format(test_score,train_score,i))



In [None]:
x_train,x_test,y_train,y_test = train_test_split(selectedFeatures,label,train_size=0.8,random_state=15)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

#### **Building model**

Standard scaling was already performed above, hence the code below is commented out. A pipeline could be used to reduce the number of lines of code.

In [None]:
# def ModelTypes():
#   modelType = []
#   modelType.append(('LinearRegression'   , make_pipeline(StandardScaler(), LinearRegression())))
#   modelType.append(('Lasso'  ,make_pipeline(StandardScaler(), Lasso())))
#   modelType.append(('Ridge', make_pipeline(StandardScaler(), Ridge(alpha=1.0))))
#   modelType.append(('ElasticNet'  , make_pipeline(StandardScaler(), ElasticNet())))
#   modelType.append(('KNN'   , make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=2))))
#   modelType.append(('ExtraTreesRegressor', make_pipeline(StandardScaler(), ExtraTreesRegressor(n_jobs=-1, min_samples_leaf=1, max_depth=20, min_samples_split=3, n_estimators=1000))))
#   modelType.append(('DecisionTree'  , make_pipeline(StandardScaler(), DecisionTreeRegressor())))
#   modelType.append(('RandomForest'   , make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators= 19, max_depth= 21.183668300467755, n_jobs=-1))))
#   modelType.append(('XGBRF'  , make_pipeline(StandardScaler(), XGBRFRegressor(n_jobs=-1, silent=True))))
#   modelType.append(('GradientBoostingRegressor', make_pipeline(StandardScaler(), GradientBoostingRegressor(criterion='mse',random_state=2,max_depth=5,n_estimators=500,min_samples_split=2,min_samples_leaf=2))))
#   modelType.append(('XGBRegressor', make_pipeline(StandardScaler(), XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)))) 
  
#   return modelType

In [None]:
def ModelTypes():
    """Return the candidate regressors to compare.

    Returns
    -------
    list of (str, estimator)
        Pairs of a display name and an unfitted estimator.
    """
    import sklearn

    # scikit-learn 1.0 renamed the 'mse' criterion to 'squared_error' and later
    # removed the old spelling; pick whichever the installed version accepts.
    gbr_criterion = 'squared_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mse'

    modelType = [
        ('LinearRegression', LinearRegression()),
        ('Lasso', Lasso()),
        ('Ridge', Ridge(alpha=1.0)),
        ('ElasticNet', ElasticNet()),
        ('KNN', KNeighborsRegressor(n_neighbors=5)),
        ('ExtraTreesRegressor', ExtraTreesRegressor(n_jobs=-1, min_samples_leaf=1, max_depth=20, min_samples_split=3, n_estimators=1000)),
        ('DecisionTree', DecisionTreeRegressor()),
        ('RandomForest', RandomForestRegressor()),
        # `verbosity=0` replaces the deprecated `silent=True` flag of xgboost.
        ('XGBRF', XGBRFRegressor(n_jobs=-1, verbosity=0)),
        ('GradientBoostingRegressor', GradientBoostingRegressor(criterion=gbr_criterion,random_state=2,max_depth=5,n_estimators=500,min_samples_split=2,min_samples_leaf=2)),
        ('XGBRegressor', XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)),
    ]

    return modelType

In [None]:
def ModelBuilding(X_train, y_train, models, num_folds=10, scoring=None, seed=15):
    """Cross-validate every model and print the mean and std of its scores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``cross_val_score``.
    models : iterable of (str, estimator)
        Display name and unfitted estimator pairs.
    num_folds : int, default 10
        Number of shuffled K-fold splits.
    scoring : str or callable, optional
        Metric forwarded to ``cross_val_score`` (e.g.
        ``'neg_mean_squared_error'``). ``None`` keeps each estimator's own
        ``score`` method, which is what this function has always reported.
    seed : int, default 15
        Seed of the K-fold shuffling.

    Returns
    -------
    dict
        Maps each model name to the array of its per-fold scores.
    """
    modelScoreDict = {}
    # With an integer seed the splitter yields the same folds for every
    # model, so one instance is shared by all of them.
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        modelScoreDict[name] = cv_results
        scores = "{}: {} ({})" .format(name, cv_results.mean(), cv_results.std())
        print(scores)

    return modelScoreDict

In [None]:
models = ModelTypes()
modelScoreDict = ModelBuilding(x_train, y_train, models)

In [None]:
# Base learners of the final voting ensemble.
# scikit-learn 1.0 renamed the 'mse' criterion to 'squared_error' and later
# removed the old spelling; pick whichever the installed version accepts.
gbr_criterion = 'squared_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mse'
gbr = GradientBoostingRegressor(criterion=gbr_criterion,random_state=2,max_depth=5,n_estimators=500,min_samples_split=2,min_samples_leaf=2)
# rf = RandomForestRegressor(n_jobs=-1)
et = ExtraTreesRegressor(n_jobs=-1, min_samples_leaf=1, max_depth=20, min_samples_split=3, n_estimators=1000)
xgb = XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)

In [None]:
%%time

modelFinal = VotingRegressor([('gbr', gbr),('xgb',xgb),('et', et)],n_jobs=-1)
modelFinal.fit(x_train, y_train)

#y_test_pre = modelFinal.predict(featuresTest)
y_train_pre = modelFinal.predict(x_train)
r2_train = r2_score(y_train, y_train_pre)
rmse_train  = np.sqrt(mean_squared_error(y_train, y_train_pre))
print("-----Training Data Evalution-----")
print("R2 Value: ", r2_train)
print("RMSE: ", rmse_train)

### **Predicting the Value from Test Data**

In [None]:
predictedValue = modelFinal.predict(featuresTest)

In [None]:
print("The length of the predicted vlue is: {}".format(len(predictedValue)), "\n")
print(predictedValue)

In [None]:
finalDataFrame = test.loc[:,['tracking_id','datetime']]
finalDataFrame['windmill_generated_power(kW/h)'] = predictedValue 

In [None]:
finalDataFrame