In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Fine Windy Day: HackerEarth Machine Learning challenge

**Points:**
* Trying to learn Machine Learning by participating in the competitions.
* Achieved an accuracy of 96.4% so far; still trying to improve it further.
* I am a beginner in Machine Learning, so there may be naive mistakes; **feedback is most welcome**.
* Used feature_engine library. Install it on kaggle using `!pip install feature_engine`
* Used sklearn pipeline to minimize the code and make it simple

In [None]:
!pip install feature_engine

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)
from feature_engine.encoding import OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

In [None]:
# Numeric feature columns handed to the missing-indicator and mean-imputation
# steps of the pipeline.
NUM_COLUMNS_WITH_MISSING_VALUES = [
    'wind_speed(m/s)',
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'gearbox_temperature(°C)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'generator_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'area_temperature(°C)',
    'windmill_body_temperature(°C)',
    'wind_direction(°)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'blade_breadth(m)',
    'windmill_height(m)',
]

# Categorical columns: the first list is imputed, the second is ordinal-encoded.
# Both currently name the same two columns but are kept as independent lists.
CAT_COLUMNS_WITH_MISSING_VALUES = ['turbine_status', 'cloud_level']
CAT_COLUMNS = list(CAT_COLUMNS_WITH_MISSING_VALUES)

# Preprocessing and model chained in a single pipeline, so the transformations
# learned during fit are re-applied unchanged at predict time.
#
# Experiment notes carried over from the author's earlier commented-out code:
#   - GradientBoostingRegressor(criterion='mse', random_state=0, max_depth=5,
#     n_estimators=500, min_samples_split=2, min_samples_leaf=2) was annotated 96.40
#   - the XGBRegressor settings used below were annotated 96.43
pipe = Pipeline(steps=[
    # Record which numeric values were missing before they get filled in.
    (
        'missing_indicator_imputer',
        AddMissingIndicator(
            missing_only=True,
            variables=NUM_COLUMNS_WITH_MISSING_VALUES,
        ),
    ),
    # Fill missing numeric values with the column mean.
    (
        'meanImputer',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUM_COLUMNS_WITH_MISSING_VALUES,
        ),
    ),
    # Fill missing categorical values (imputer defaults are used).
    (
        'missing_cat_imputer',
        CategoricalImputer(variables=CAT_COLUMNS_WITH_MISSING_VALUES),
    ),
    # Turn categories into integers; the 'ordered' method uses the target,
    # so this step needs y to be passed to fit.
    (
        'ord_encoder',
        OrdinalEncoder(encoding_method='ordered', variables=CAT_COLUMNS),
    ),
    # Final estimator.
    (
        'reg',
        XGBRegressor(
            booster='gbtree',
            n_estimators=500,
            max_depth=5,
            learning_rate=0.1,
            reg_alpha=0.3,
            reg_lambda=0.01,
            n_jobs=-1,
        ),
    ),
])

In [None]:
# Load the training data and discard rows whose target value is missing,
# since those rows cannot be used to fit or evaluate the regressor.
df_train = (
    pd.read_csv('/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv')
    .dropna(subset=['windmill_generated_power(kW/h)'])
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; random_state pins the
# shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['windmill_generated_power(kW/h)']),
    df_train['windmill_generated_power(kW/h)'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns removed from the features before fitting and predicting.
COLUMNS_TO_DROP = ['tracking_id', 'datetime']

# errors='ignore' keeps this cell idempotent: X_train / X_test are reassigned
# here, so re-running the cell would otherwise raise KeyError because the
# columns have already been removed.
X_train = X_train.drop(columns=COLUMNS_TO_DROP, errors='ignore')
X_test = X_test.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [None]:
# Fit every preprocessing step and the final regressor on the training split
# only, so the held-out split stays unseen until it is scored.
pipe.fit(X_train, y_train)

In [None]:
# Label each number so the output can be read on its own. Pipeline.score
# delegates to the final estimator's score method, which for scikit-learn
# style regressors is the coefficient of determination (R^2).
print("Train R^2:", pipe.score(X_train, y_train))
print("Hold-out R^2:", pipe.score(X_test, y_test))

In [None]:
# Refit on ALL labelled training rows, predict the competition test set and
# write the submission file.
df_train = pd.read_csv('/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv')
# Rows without a target value cannot be used for fitting.
df_train = df_train[df_train['windmill_generated_power(kW/h)'].notna()]

df_test = pd.read_csv('/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv')
# Keep the identifier columns for the submission. The explicit .copy() makes
# df_sol an independent frame, so adding the prediction column below does not
# write into a slice of df_test (which triggers SettingWithCopyWarning).
df_sol = df_test[['tracking_id', 'datetime']].copy()

df_train = df_train.drop(COLUMNS_TO_DROP, axis=1)
df_test = df_test.drop(COLUMNS_TO_DROP, axis=1)

# NOTE: this refits `pipe` in place; after this cell it is no longer the model
# that was fitted on the 80% training split.
pipe.fit(
    df_train.drop(['windmill_generated_power(kW/h)'], axis=1),
    df_train['windmill_generated_power(kW/h)'],
)
sol = pipe.predict(df_test)
df_sol['windmill_generated_power(kW/h)'] = sol
df_sol.to_csv('submission.csv', index=False)