In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Exploration

In [None]:
# Show every column when a frame is displayed (no truncation with "...").
# Uses the public pd.set_option entry point instead of reaching through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

# Load the training split of the challenge data and preview the first rows.
dataset = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv')
dataset.head()

In [None]:
# Display the column labels of the training frame.
dataset.columns

## Null Values

In [None]:
# Count missing values per column.
dataset.isnull().sum()

### Imputing null values in numerical columns

In [None]:
# Names of the non-object (numeric) columns that contain at least one missing value.
null_col = [
    col
    for col in dataset.columns
    if dataset[col].dtypes != 'O' and dataset[col].isnull().sum() > 0
]
null_col

In [None]:
# Replace missing values in each numeric column with that column's mean.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which newer pandas versions
# deprecate and which does not modify `dataset` under Copy-on-Write.
for i in null_col:
    dataset[i] = dataset[i].fillna(dataset[i].mean())

# Re-check the per-column missing-value counts after imputation.
dataset.isnull().sum()

### Imputing null values in categorical columns

In [None]:
# Frequency of each cloud_level category (missing values are not counted by default).
dataset.cloud_level.value_counts()

In [None]:
# Fill missing cloud_level entries with the 'Low' category.
# Assign the result back instead of using inplace=True on a column selection,
# which is chained assignment and is not guaranteed to update `dataset`.
dataset['cloud_level'] = dataset['cloud_level'].fillna('Low')

# Re-check the per-column missing-value counts.
dataset.isnull().sum()

In [None]:
# List the distinct turbine_status values (including NaN if present).
dataset['turbine_status'].unique()

In [None]:
# dum = ['turbine_status']
# df_dum = pd.get_dummies(dataset[dum])
# dataset = pd.concat([dataset,df_dum],axis=1)
# dataset.drop('turbine_status', axis=1, inplace=True)

In [None]:
# Treat a missing turbine_status as its own 'Missing' category.
# Assigned back rather than filled in place on a column selection (chained
# assignment), so the frame is reliably updated.
dataset['turbine_status'] = dataset['turbine_status'].fillna('Missing')

# Integer-encode the status labels: each label maps to its position in `l`.
# Any label not listed here becomes NaN after the map.
l = ['BA', 'A2', 'ABC', 'AAA', 'BD', 'AC', 'BB', 'BCB', 'B', 'AB', 'Missing', 'B2', 'BBB', 'A', 'D']
feat_tur = {status: code for code, status in enumerate(l)}
dataset['turbine_status'] = dataset['turbine_status'].map(feat_tur)

# Confirm no missing values were introduced by the mapping.
dataset.isnull().sum()

## Feature Engineering

### Separating date and time from the 'datetime' feature

In [None]:
# Parse the raw timestamp strings, then derive two datetime64 helper columns:
# 'day' (the calendar date) and 'time' (the clock time, parsed from its string form).
dataset['datetime'] = pd.to_datetime(dataset['datetime'])
stamp = dataset['datetime'].dt
dataset['day'] = pd.to_datetime(stamp.date)
dataset['time'] = pd.to_datetime(stamp.time.astype(str))
dataset.dtypes

#### Further separating the date into day of month, month, and year

In [None]:
# Break the 'day' column into day-of-month, month and year, then discard it.
day_parts = dataset['day'].dt
dataset['date'] = day_parts.day
dataset['month'] = day_parts.month
dataset['year'] = day_parts.year
dataset.drop(columns='day', inplace=True)
dataset.head(2)

#### Similarly separating hour and minute from time

In [None]:
# Break the 'time' column into hour and minute, then discard it.
time_parts = dataset['time'].dt
dataset['time_hour'] = time_parts.hour
dataset['time_minute'] = time_parts.minute
dataset.drop(columns='time', inplace=True)
dataset.head(2)

## Data Visualization

In [None]:
# Inspect column dtypes after the datetime feature engineering.
dataset.dtypes

In [None]:
# Columns plotted against the target in the scatter plots below.
plt_feat = [
    'datetime',
    'wind_speed(m/s)',
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'gearbox_temperature(°C)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'generator_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'area_temperature(°C)',
    'windmill_body_temperature(°C)',
    'wind_direction(°)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'cloud_level',
    'blade_length(m)',
    'blade_breadth(m)',
    'windmill_height(m)',
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# One 8x4 scatter plot per listed feature, each against the generated power target.
for feature in plt_feat:
    _fig, axis = plt.subplots(figsize=(8, 4))
    sns.scatterplot(data=dataset, x=feature, y='windmill_generated_power(kW/h)', ax=axis)

## Removing Outliers

In [None]:
def change(s):
    """Return `s`, or the gearbox-temperature column mean when `s` is out of range.

    A value is out of range when it is below -200 or above 300. The mean is
    read from the global `dataset` at call time.
    """
    out_of_range = s < -200 or s > 300
    return dataset["gearbox_temperature(°C)"].mean() if out_of_range else s
# Replace out-of-range gearbox temperatures element by element.
gearbox_col = "gearbox_temperature(°C)"
dataset[gearbox_col] = dataset[gearbox_col].apply(change)

In [None]:
def remove_99_atmospheric_temperature(s):
    """Return `s`, or the atmospheric-temperature column mean when `s` equals -99.

    The mean is read from the global `dataset` at call time.
    """
    is_sentinel = s == -99.0
    return dataset['atmospheric_temperature(°C)'].mean() if is_sentinel else s
    

# Replace the -99 placeholder values, then re-plot the cleaned column against the target.
atmospheric_col = 'atmospheric_temperature(°C)'
dataset[atmospheric_col] = dataset[atmospheric_col].apply(remove_99_atmospheric_temperature)
sns.scatterplot(data=dataset, x=atmospheric_col, y='windmill_generated_power(kW/h)')

In [None]:
def remove_99_shaft_temperature(s):
    """Return `s`, or the shaft-temperature column mean when `s` equals -99.

    The mean is read from the global `dataset` at call time.
    """
    is_sentinel = s == -99.0
    return dataset['shaft_temperature(°C)'].mean() if is_sentinel else s
    
# Replace the -99 placeholder values, then re-plot the cleaned column against the target.
shaft_col = 'shaft_temperature(°C)'
dataset[shaft_col] = dataset[shaft_col].apply(remove_99_shaft_temperature)
sns.scatterplot(data=dataset, x=shaft_col, y='windmill_generated_power(kW/h)')

In [None]:
def remove_99_blade_length(s):
    """Return `s`, or the blade-length column mean when `s` equals -99.

    The mean is read from the global `dataset` at call time.
    """
    is_sentinel = s == -99.0
    return dataset['blade_length(m)'].mean() if is_sentinel else s
    
# Replace the -99 placeholder values, then re-plot the cleaned column against the target.
blade_col = 'blade_length(m)'
dataset[blade_col] = dataset[blade_col].apply(remove_99_blade_length)
sns.scatterplot(data=dataset, x=blade_col, y='windmill_generated_power(kW/h)')

In [None]:
def remove_99_area_temperature(s):
    """Return `s`, or the area-temperature column mean when `s` is below -10.

    The mean is read from the global `dataset` at call time.
    """
    too_low = s < -10
    return dataset['area_temperature(°C)'].mean() if too_low else s
    
# Replace values below the cut-off, then re-plot the cleaned column against the target.
area_col = 'area_temperature(°C)'
dataset[area_col] = dataset[area_col].apply(remove_99_area_temperature)
sns.scatterplot(data=dataset, x=area_col, y='windmill_generated_power(kW/h)')

In [None]:
def remove_99_engine_temperature(s):
    """Return `s`, or the engine-temperature column mean when `s` is below 38.

    The mean is read from the global `dataset` at call time.
    """
    too_low = s < 38
    return dataset['engine_temperature(°C)'].mean() if too_low else s
    
# Replace values below the cut-off, then re-plot the cleaned column against the target.
engine_col = 'engine_temperature(°C)'
dataset[engine_col] = dataset[engine_col].apply(remove_99_engine_temperature)
sns.scatterplot(data=dataset, x=engine_col, y='windmill_generated_power(kW/h)')

In [None]:
# Discard the windmill body temperature column from the training frame.
dataset.drop(columns='windmill_body_temperature(°C)', inplace=True)

### Creating new features

In [None]:
# New feature: blade length squared times 3.14.
# NOTE(review): despite the column name 'radius', this expression has the form
# of a circle area (pi * r^2) with pi approximated as 3.14 — confirm intent.
blade_len = dataset['blade_length(m)']
dataset['radius'] = blade_len * blade_len * 3.14

In [None]:
# Scatter the derived 'radius' feature against the generated power target.
sns.scatterplot(data=dataset, x= 'radius', y='windmill_generated_power(kW/h)')

In [None]:
# New feature: pressure divided by (temperature * 287.058).
# NOTE(review): this has the shape of the ideal-gas density formula
# rho = P / (R * T), which normally takes T in Kelvin, while the column used
# here is labelled °C — confirm whether a +273.15 offset is intended. The
# same expression is used for the test data, so any change must be made in
# both places.
pressure = dataset['atmospheric_pressure(Pascal)']
temperature = dataset['atmospheric_temperature(°C)']
dataset['air_density'] = pressure / (temperature * 287.058)
sns.scatterplot(data=dataset, x='air_density', y='windmill_generated_power(kW/h)')

In [None]:
def air_density_remover(s):
    """Return `s`, or the air-density column mean when `s` is below -38000.

    The mean is read from the global `dataset` at call time.
    """
    too_low = s < -38000
    return dataset['air_density'].mean() if too_low else s
    
# Replace extreme negative densities, then re-plot the cleaned feature against the target.
density_col = 'air_density'
dataset[density_col] = dataset[density_col].apply(air_density_remover)
sns.scatterplot(data=dataset, x=density_col, y='windmill_generated_power(kW/h)')

In [None]:
# list_99 = [ 'atmospheric_temperature(°C)', 'shaft_temperature(°C)', 'blade_length(m)']
## area_temperature less than -10
## engine_temp less than 38 engine_temperature
## drop 'windmill_body_temperature(°C)'

In [None]:
# Category frequencies of cloud_level before one-hot encoding.
dataset['cloud_level'].value_counts()

In [None]:
# cloud_level = {'Low':'Low', 'Medium':'Medium', 'Extremely Low':'Low'}
# dataset['cloud_level']=dataset['cloud_level'].map(cloud_level)
# dataset['cloud_level'].value_counts()

In [None]:
# One-hot encode cloud_level: one indicator column per category.
cloud_series = dataset['cloud_level']
cloud = pd.get_dummies(cloud_series)
cloud.head(2)

In [None]:
# Append the indicator columns to the features and remove the original categorical column.
train_data = pd.concat([dataset, cloud], axis=1).drop(columns='cloud_level')
train_data.head(2)

In [None]:
# Summarise dtypes and non-null counts of the assembled training frame.
train_data.info()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Every column except the identifiers and the target is standardised.
non_feature_cols = ('tracking_id', 'datetime', 'windmill_generated_power(kW/h)')
feature_scale = [col for col in train_data.columns if col not in non_feature_cols]

# Learn per-column scaling statistics from the training features.
scale = StandardScaler().fit(train_data[feature_scale])
scale

In [None]:
# Apply the fitted scaler and wrap the resulting array back into a labelled frame.
scaled_values = scale.transform(train_data[feature_scale])
scaled = pd.DataFrame(scaled_values, columns=feature_scale)
scaled.head()

In [None]:
# Re-attach the unscaled identifier and target columns to the scaled features.
id_and_target = train_data[['tracking_id','datetime', 'windmill_generated_power(kW/h)']]
data = pd.concat([id_and_target, scaled], axis=1)
data.head()

## Model Training

### Separating Dependent and Independent Variables

In [None]:
# Target vector and feature matrix (identifiers and target removed from the features).
y_train = data['windmill_generated_power(kW/h)']
X_train = data.drop(columns=['windmill_generated_power(kW/h)','datetime', 'tracking_id'])
X_train.shape, y_train.shape

### Model Fitting

### Neural Network

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
def baseline_model(input_dim=None):
    """Build and compile the feed-forward regression network.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted it is taken from
        ``X_train.shape[1]`` at call time, so the input layer follows the
        feature matrix instead of a hard-coded 27 that breaks as soon as a
        feature is added or removed.

    Returns
    -------
    A compiled ``Sequential`` model with hidden layers of 27, 15 and 5 ReLU
    units and a single linear output unit, using the Adam optimizer and
    mean-squared-error loss.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential([
        Dense(27, input_dim=input_dim, kernel_initializer='normal', activation='relu'),
        Dense(15,  kernel_initializer='normal', activation='relu'),
        Dense(5,  kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal')
    ])

    model.compile(optimizer='adam',
                 loss='mean_squared_error',
    )
    return model

# Wrap the network builder as a scikit-learn style regressor.
estimator = KerasRegressor(build_fn=baseline_model, epochs=30, batch_size=5, verbose=1)

# Two-fold cross-validation, then report the mean and spread of the fold scores.
kfold = KFold(n_splits=2)
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
cv_mean, cv_std = results.mean(), results.std()
print("Baseline: %.2f (%.2f) MSE" % (cv_mean, cv_std))

# Final fit on the full training set; this fitted estimator is used for prediction later.
estimator.fit(X_train, y_train)

# model.fit(X_train, y_train, epochs=3)

#### Random Forest --> 99.3998%
#### Testing Accuracy --> 95.835418%

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# model = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
# print(f'Training Accuracy: {model.score(X_train, y_train)}')

#### XGBoost --> 98.6206%
#### Testing Accuracy --> 95.65499%

In [None]:
# from xgboost import XGBRegressor
# xgb = XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)
# xgb.fit(X_train, y_train)
# print(f'Training Accuracy: {xgb.score(X_train, y_train)}')

#### Gradient Boosting --> 98.5157%
#### Testing Accuracy --> 94.1229%

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor
# gboost = GradientBoostingRegressor(criterion='mse',random_state=0,max_depth=5,n_estimators=500,min_samples_split=2,min_samples_leaf=2)
# gboost.fit(X_train,y_train)
# print(f'Training Accuracy: {gboost.score(X_train, y_train)}')

#### Decision Tree --> 99.2893%
#### Testing Accuracy --> 94.1229%

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# tree = DecisionTreeRegressor(min_samples_leaf=2)
# tree.fit(X_train, y_train)
# print(f'Training Accuracy: {tree.score(X_train, y_train)}')

In [None]:
# scene

## Working with test data

In [None]:
# Load the test split of the challenge data and preview it.
test_csv_path = '../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head(2)

In [None]:
# Display the column labels of the test frame.
test_data.columns

In [None]:
# Count missing values per column in the test frame.
test_data.isnull().sum()

In [None]:
# Apply the training-time preprocessing steps to the test frame.

# --- Missing values ---------------------------------------------------------
# Numeric gaps are filled with the column's own mean. Results are assigned
# back: fillna(..., inplace=True) on a column selection is chained assignment
# and is not guaranteed to update the frame.
test_null_col = [i for i in test_data.columns if test_data[i].isnull().sum()>0 and test_data[i].dtypes != 'O']
for i in test_null_col:
    test_data[i] = test_data[i].fillna(test_data[i].mean())

# mode() returns a Series; passing it straight to fillna aligns on the index
# and can only fill row 0. Take the scalar most-frequent training category.
test_data['cloud_level'] = test_data['cloud_level'].fillna(dataset['cloud_level'].mode()[0])

# --- turbine_status encoding (same label order as used for training) --------
test_data['turbine_status'] = test_data['turbine_status'].fillna('Missing')
l = ['BA', 'A2', 'ABC', 'AAA', 'BD', 'AC', 'BB', 'BCB', 'B', 'AB', 'Missing', 'B2', 'BBB', 'A', 'D']
test_feat_tur = {status: code for code, status in enumerate(l)}
test_data['turbine_status'] = test_data['turbine_status'].map(test_feat_tur)

# --- Datetime features ------------------------------------------------------
test_data['datetime'] = pd.to_datetime(test_data['datetime'])
test_data['day'] = test_data['datetime'].dt.date
test_data['time'] = test_data['datetime'].dt.time
test_data['day'] = pd.to_datetime(test_data['day'])
test_data['time'] = pd.to_datetime(test_data['time'].astype(str))
test_data['date'] = test_data['day'].dt.day
test_data['month'] = test_data['day'].dt.month
test_data['year'] = test_data['day'].dt.year
test_data.drop('day', axis=1, inplace=True)
test_data['time_hour'] = test_data['time'].dt.hour
test_data['time_minute'] = test_data['time'].dt.minute
test_data.drop('time', axis=1, inplace=True)

# --- Outliers ---------------------------------------------------------------
# The helper functions replace flagged values with means read from the
# training frame `dataset`.
test_data["gearbox_temperature(°C)"] = test_data["gearbox_temperature(°C)"].apply(change)
test_data['atmospheric_temperature(°C)'] = test_data['atmospheric_temperature(°C)'].apply(remove_99_atmospheric_temperature)
test_data['shaft_temperature(°C)'] = test_data['shaft_temperature(°C)'].apply(remove_99_shaft_temperature)
test_data['blade_length(m)'] = test_data['blade_length(m)'].apply(remove_99_blade_length)
test_data['area_temperature(°C)'] = test_data['area_temperature(°C)'].apply(remove_99_area_temperature)
test_data['engine_temperature(°C)'] = test_data['engine_temperature(°C)'].apply(remove_99_engine_temperature)
test_data.drop('windmill_body_temperature(°C)', axis=1, inplace=True)

# --- Derived features (same formulas as for training) -----------------------
test_data['radius'] = test_data['blade_length(m)'] * test_data['blade_length(m)'] * 3.14
test_data['air_density'] = test_data['atmospheric_pressure(Pascal)'] / (test_data['atmospheric_temperature(°C)'] * 287.058)
test_data['air_density'] = test_data['air_density'].apply(air_density_remover)

# --- One-hot encode cloud_level ---------------------------------------------
cloud = pd.get_dummies(test_data['cloud_level'])
test_data = pd.concat([test_data, cloud], axis=1)
test_data.drop('cloud_level', axis=1, inplace=True)

# --- Scaling ----------------------------------------------------------------
# Reuse the scaler fitted on the training features; refitting it on the test
# set would standardise the test features with different statistics than the
# model was trained on. Columns are aligned to the training feature order,
# and an indicator column absent from the test set is filled with 0.
test_feature_scale = list(feature_scale)
test_features = test_data.reindex(columns=test_feature_scale, fill_value=0)
test_scaled = pd.DataFrame(
    scale.transform(test_features),
    columns=test_feature_scale,
    index=test_data.index,
)
final_testdata = pd.concat([test_data[['tracking_id', 'datetime']], test_scaled], axis=1)
final_testdata.head()

In [None]:
# Check for missing values left in the processed test frame.
final_testdata.isnull().sum()

In [None]:
# Feature matrix for prediction: everything except the two identifier columns.
X_test = final_testdata.drop(columns=['tracking_id','datetime'])

In [None]:
# Compare the shapes of the test and training feature matrices.
X_test.shape, X_train.shape

In [None]:
# Compare the column labels (and their order) of the test and training feature matrices.
X_test.columns, X_train.columns

In [None]:
# Work on a copy so the prediction column added later does not alter final_testdata.
test = final_testdata.copy()

In [None]:
# Preview the copied test frame.
test.head()

In [None]:
# test.fillna(test.mean())

## Submission

In [None]:
# Predict with the fitted estimator and store the result next to the identifiers.
test['preds'] = estimator.predict(X_test)

# Keep only the submission columns, renaming the prediction to the target's name.
prediction = test[['tracking_id', 'datetime', 'preds']].rename(
    columns={'preds': 'windmill_generated_power(kW/h)'}
)

# Write the submission file (header row included, no index column).
prediction.to_csv('my_submission.csv', index=False, header=True)