In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [None]:
# Read the competition's train/test splits from the input directory.
train_path = '../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv'
test_path = '../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Preview the first rows of the test set.
test_data.head()

### Data Pre-processing

In [None]:
# Identifier columns are not used as model features; the test copies are
# kept aside because the submission file is built from them later.
drop_columns = ['tracking_id', 'datetime']
submission = test_data[drop_columns]

# Rebind to new frames instead of dropping in place.
train_data = train_data.drop(columns=drop_columns)
test_data = test_data.drop(columns=drop_columns)

In [None]:
train_data.isna().sum() / len(train_data) * 100

In [None]:
train_data.dtypes

In [None]:
train_data.loc[:,train_data.dtypes == 'object'].nunique()

In [None]:
print(train_data['turbine_status'].value_counts(),'\n'*3,
train_data['cloud_level'].value_counts())

In [None]:
# Names of the non-object (numeric) columns, computed separately for each
# frame because their column sets are not assumed to be identical.
train_numeric_mask = (train_data.dtypes != 'object').to_numpy()
test_numeric_mask = (test_data.dtypes != 'object').to_numpy()

continous_cols = train_data.columns[train_numeric_mask]
continous_cols_test = test_data.columns[test_numeric_mask]
continous_cols

In [None]:
# Impute missing numeric values with the TRAINING column means for both
# frames. Using the test set's own means (as before) applies a different
# transformation to the data the model is evaluated on than to the data it
# was trained on.
train_means = train_data[continous_cols].mean()

train_data[continous_cols] = train_data[continous_cols].fillna(train_means)

# `fillna` with a Series fills each column from the matching label; labels
# that are absent from the test frame are ignored, and a test column with no
# training counterpart would be left unfilled.
test_data[continous_cols_test] = test_data[continous_cols_test].fillna(train_means)

In [None]:
# Fill missing cloud levels with 'Low' (presumably the most frequent level
# seen in the value counts -- confirm). The result is assigned back to the
# column: calling `fillna(..., inplace=True)` on `train_data['cloud_level']`
# acts on a temporary Series and does not update the frame when pandas
# Copy-on-Write is active.
train_data['cloud_level'] = train_data['cloud_level'].fillna('Low')
train_data['cloud_level'].value_counts()

In [None]:
# Fill missing turbine statuses with 'BB' (presumably the most frequent
# status -- confirm). Assign back instead of using a chained in-place
# `fillna`, which does not update the frame under pandas Copy-on-Write.
train_data['turbine_status'] = train_data['turbine_status'].fillna('BB')
train_data['turbine_status'].value_counts()


In [None]:
# Same imputation value as used for the training frame. Assign back instead
# of using a chained in-place `fillna`, which does not update the frame
# under pandas Copy-on-Write.
test_data['cloud_level'] = test_data['cloud_level'].fillna('Low')
test_data['cloud_level'].value_counts()

In [None]:
# Same imputation value as used for the training frame. Assign back instead
# of using a chained in-place `fillna`, which does not update the frame
# under pandas Copy-on-Write.
test_data['turbine_status'] = test_data['turbine_status'].fillna('BB')
test_data['turbine_status'].value_counts()


In [None]:
test_data.isna().sum() / len(test_data) * 100

In [None]:
train_data.describe()

In [None]:
test_data.describe()

#### Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric FEATURES only. The target is deliberately left in
# its original units: it was previously scaled as well and never
# inverse-transformed, so the predictions written to the submission file were
# z-scores rather than power values.
target_col = 'windmill_generated_power(kW/h)'
feature_cols = [col for col in continous_cols if col != target_col]

# Fit once on the training features and reuse the same statistics for the
# test set; `sc` keeps the mean/scale of every feature column.
sc = StandardScaler()
train_data[feature_cols] = sc.fit_transform(train_data[feature_cols])
test_data[feature_cols] = sc.transform(test_data[feature_cols])

In [None]:
# Summary statistics of the training frame after the scaling cell above.
train_data.describe()

In [None]:
# Summary statistics of the test frame after the scaling cell above.
test_data.describe()

In [None]:
# Learn the category -> integer mapping from the training data only, then
# apply that same mapping to the test data. Refitting on the test set (as
# before) can assign different integers to the same category whenever the
# two sets do not contain exactly the same labels.
label_enc = LabelEncoder()
train_data['cloud_level'] = label_enc.fit_transform(train_data['cloud_level'])

# `transform` raises ValueError on a label that was not seen during fitting,
# which surfaces a train/test mismatch instead of silently mis-encoding it.
test_data['cloud_level'] = label_enc.transform(test_data['cloud_level'])
test_data['cloud_level'].value_counts()

In [None]:
# Shift the encoded cloud levels up by one in both frames.
# Note: re-running this cell shifts the values again.
for frame in (train_data, test_data):
    frame['cloud_level'] = frame['cloud_level'] + 1

train_data['cloud_level'].value_counts()

In [None]:
# One-hot encode `turbine_status`: `pop` removes the original column from the
# frame and hands it to `get_dummies`; the indicator columns are joined back
# on the row index.
enc = pd.get_dummies(train_data.pop('turbine_status'))
train_data = train_data.join(enc)

In [None]:
# One-hot encode `turbine_status` for the test frame. The indicator columns
# are derived from the categories present in the test data itself.
enc = pd.get_dummies(test_data.pop('turbine_status'))
test_data = test_data.join(enc)

### Predictive Modeling

In [None]:
# Separate the regression target from the feature matrix.
y = train_data['windmill_generated_power(kW/h)']
x = train_data.drop(columns=y.name)

In [None]:
from sklearn.model_selection import cross_val_score,ShuffleSplit,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor,StackingRegressor,AdaBoostRegressor,GradientBoostingRegressor,BaggingRegressor,VotingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
def cross_val_calc(x, y, model, n_splits=10, random_state=42, scoring='r2'):
    """Score `model` on random train/test splits of (x, y).

    Parameters
    ----------
    x : feature matrix passed to `cross_val_score`.
    y : target values passed to `cross_val_score`.
    model : estimator to evaluate.
    n_splits : number of shuffled splits (default 10).
    random_state : seed for the splitter, so every model is scored on the
        same splits (default 42).
    scoring : metric name understood by `cross_val_score` (default 'r2').

    Returns
    -------
    The array of per-split scores returned by `cross_val_score`.

    Raises
    ------
    Any exception raised while fitting or scoring is propagated
    (`error_score='raise'`) instead of being recorded as a NaN score.
    """
    cv = ShuffleSplit(n_splits=n_splits, random_state=random_state)
    return cross_val_score(
        model, x, y,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,  # use all available cores
        error_score='raise',
    )

In [None]:
# Candidate regressors, all with default hyper-parameters unless shown.
models = [
LinearRegression(),
Ridge(),
Lasso(),
DecisionTreeRegressor(),
LinearSVR(),
VotingRegressor(estimators=[('dtree', DecisionTreeRegressor()), ('lr', LinearRegression()),('svr',LinearSVR())]),
StackingRegressor(estimators = [('dtree', DecisionTreeRegressor()),('lr',LinearRegression())],final_estimator = LinearSVR(),cv = 10),
BaggingRegressor(LinearSVR(),random_state = 42),
RandomForestRegressor(),
AdaBoostRegressor(),
GradientBoostingRegressor(),
XGBRegressor(), 
MLPRegressor()
] #we can add more models here


# Start from -inf so that any real score replaces it; a finite magic number
# could exceed the score of a badly diverging model.
best_score = float('-inf')
best_model = models[0]

# Score every candidate with the same cross-validation splits and keep the
# one with the highest mean R^2 (reported as a percentage).
for model in models:
    score = cross_val_calc(x, y, model).mean() * 100
    print('Model: %s \nScore: %f \n' % (model, score))
    if score > best_score:
        best_score, best_model = score, model

In [None]:
# Report the winner of the model comparison.
best_summary = 'Best Model: %s \n\nBest Score: %f \n' % (best_model, best_score)
print(best_summary)

### Hyper-parameter tuning

In [None]:
# param_grid = {
#     'n_estimators': [100,200,300], 
#     'max_depth': [9, 11, 13],    
#     'criterion': ['mse','mae']
# }
# gscv = GridSearchCV(RandomForestRegressor(random_state = 0),param_grid = param_grid, cv = 7,scoring = 'r2')
# gscv.fit(x,y)
# gscv.best_params_

In [None]:
# Final model. `criterion` is left at its default, which is the mean-squared-
# error criterion; the explicit 'mse' alias used previously was removed in
# scikit-learn 1.2 and makes the constructor/fit fail on current versions.
best_model = RandomForestRegressor(n_estimators=20, max_depth=9, random_state=42)
best_model.fit(x, y)

### Creating the submission file

In [None]:
# Present the test features with exactly the columns (and column order) the
# model was trained on. The one-hot columns of train and test are built
# independently, so a status missing from the test set would otherwise leave
# a column out, and an extra or reordered column would be rejected or
# mis-assigned by the model. Columns absent from the test frame are filled
# with 0; columns the model never saw are dropped.
test_features = test_data.reindex(columns=x.columns, fill_value=0)

predictions = best_model.predict(test_features)
predictions

In [None]:
# R^2 (as a percentage) of the fitted model on the data it was trained on.
train_predictions = best_model.predict(x)
r2_score(y, train_predictions) * 100

In [None]:
# Attach the predictions to the saved identifier columns under the target's
# name. The values are assigned positionally (row i of the predictions goes
# to row i of the identifiers), which avoids the index-based join through a
# temporary column named 0 and makes the cell safe to re-run.
target = 'windmill_generated_power(kW/h)'
submission = submission.assign(**{target: predictions})
submission

In [None]:
# Write the submission with a header row and without the row index.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False, header=True)