In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Task 1 - Importing libraries and dataset

In [None]:
df = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv')
df_test = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('seaborn-deep')
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.monospace'] = 'Ubunto Mono'
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.figsize'] = (16,10)

import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)

In [None]:
df

In [None]:
df_test

# Task 2 - Exploratory Data Analysis (EDA)

In [None]:
df.info()

In [None]:
## Showing the number of unique values of every feature 
df.nunique()

In [None]:
df_test.nunique()

In [None]:
## Missing values in train dataset
sns.heatmap(df.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
df.isna().sum()

In [None]:
## Missing values in test dataset
sns.heatmap(df_test.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
df_test.isna().sum()

In [None]:
# Pairwise correlations between the numeric columns of the training data.
# The numeric columns are selected explicitly because DataFrame.corr() no longer
# drops string columns silently in pandas >= 2.0 (it raises instead).
corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(20,10))
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin bool replaces np.bool, an alias that NumPy 1.24 removed.
mask = np.zeros_like(corr,dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

From the above plot, features "motor_torque(N-m)" and "generator_temperature(°C)" are highly correlated and therefore we will be dropping one of them in the end.

In [None]:
## Splitting the train dataset into categorical and numerical features
def getFeatures(df):
    """Split a DataFrame into its numeric and its object-dtype (categorical) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(num_features, cat_features)``: the columns with a numeric dtype and
        the columns with ``object`` dtype, each keeping the original row index.
    """
    num_features = df.select_dtypes(include=[np.number])
    # The builtin `object` replaces np.object, an alias that NumPy 1.24 removed.
    cat_features = df.select_dtypes(include=[object])
    return num_features,cat_features

In [None]:
num_features,cat_features = getFeatures(df)

In [None]:
num_features

In [None]:
cat_features

In [None]:
## Box plot of numerical features
# One panel per numeric column, placed left-to-right on a 4 x 5 grid.
fig = plt.figure(figsize=(30,20))
for panel_no, column in enumerate(num_features.columns, start=1):
    panel_ax = fig.add_subplot(4,5,panel_no)
    sns.boxplot(y = num_features[column], ax = panel_ax)
plt.tight_layout()
plt.show()

In [None]:
## Hist plot for categorical features
'''fig = plt.figure(figsize=(20,10))
for i in range(len(cat_features.columns)):
    fig.add_subplot(4,1,i+1)
    cat_features.iloc[:,i].hist()
    plt.xlabel([cat_features.columns[i]])
plt.tight_layout()
plt.show()'''

In [None]:
sns.pairplot(df)
plt.show()

In [None]:
df.columns

In [None]:
# Skewness of every numeric column, ordered from the smallest value to the largest.
skew_features = num_features.apply(pd.Series.skew).sort_values()
skew_features

In [None]:
## Copying the train dataframe into new dataframe and we will be performing changes on the new dataframe
df_cpy = df.copy()

## Now let us begin with analysing each and every feature

In [None]:
def comparing_train_and_test_feature(df,df_test,col):
    """Show side-by-side density (KDE) plots of one column for two frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; its ``col`` column is drawn in the left panel.
    df_test : pandas.DataFrame
        Testing frame; its ``col`` column is drawn in the right panel.
    col : str
        Name of the column to plot; it is also used as the x-axis label and
        in both panel titles.
    """
    fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(16,10))
    panels = ((df, train_ax, "training"), (df_test, test_ax, "testing"))
    for frame, axis, split_name in panels:
        frame[col].plot(kind='kde',ax=axis)
        axis.set_xlabel(col)
        axis.set_title("Density plot of " + str(col) + " of " + split_name + " set")
    plt.show()

#### wind_speed(m/s)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'wind_speed(m/s)')
## Distribution of Feature wind_speed(m/s) of training and testing dataset are very similar

In [None]:
sns.boxplot(y='wind_speed(m/s)',data=df)

In [None]:
sns.scatterplot(x='wind_speed(m/s)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
df['wind_speed(m/s)'].value_counts()

#### atmospheric_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'atmospheric_temperature(°C)')
## Distribution of Feature "atmospheric_temperature(°C)" of training and testing dataset is very similar

In [None]:
sns.boxplot(y='atmospheric_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='atmospheric_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
df[df['atmospheric_temperature(°C)'] < -50]

From the above density plot of feature "atmospheric_temperature(°C)", we found that the distribution of "atmospheric_temperature(°C)" is almost same in training and testing dataset, so we are not changing anything in it.

#### shaft_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'shaft_temperature(°C)')
## Distribution of Feature "shaft_temperature(°C)" of training and testing dataset are almost same

In [None]:
sns.boxplot(y='shaft_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='shaft_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "shaft_temperature(°C)", we found that the distribution of "shaft_temperature(°C)" is almost same in training and testing dataset, so we are not changing anything in it.

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='shaft_temperature(°C)',y='wind_speed(m/s)',hue='cloud_level',data=df)

#### blades_angle(°)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'blades_angle(°)')
## Distribution of Feature "blades_angle(°)" of training and testing dataset are almost same

In [None]:
sns.boxplot(y='blades_angle(°)',data=df)

In [None]:
sns.scatterplot(x='blades_angle(°)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "blades_angle(°)", we found that the distribution of "blades_angle(°)" is almost same in training and testing dataset, so we are not changing anything in it.


#### gearbox_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'gearbox_temperature(°C)')
## Distribution of Feature "gearbox_temperature(°C)" of training and testing dataset are almost similar

In [None]:
sns.boxplot(y='gearbox_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='gearbox_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
## Removing the few rows whose gearbox temperature is an extreme outlier
# A boolean mask is used instead of passing np.where() positions to the label-based
# drop(): both agree while the index is exactly 0..n-1, but only the mask stays
# correct if the index ever differs from the row positions.
# Rows with a missing gearbox temperature compare False on both sides and are kept.
gearbox_temp = df_cpy['gearbox_temperature(°C)']
is_extreme = (gearbox_temp < -200) | (gearbox_temp > 300)
df_cpy = df_cpy.loc[~is_extreme].reset_index(drop=True)

After removing the extreme outliers of the feature "gearbox_temperature(°C)", its scatter plot is shown below.

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='gearbox_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df_cpy)

#### engine_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'engine_temperature(°C)')
## Distribution of Feature "engine_temperature(°C)" of training and testing dataset are almost similar

In [None]:
sns.boxplot(y='engine_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='engine_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
## Dropping extreme outliers
# Boolean-mask filtering replaces np.where() positions fed to the label-based drop(),
# which is only correct while the index is exactly 0..n-1.
# Rows with a missing engine temperature compare False and are kept.
is_extreme = df_cpy['engine_temperature(°C)'] < 38
df_cpy = df_cpy.loc[~is_extreme].reset_index(drop=True)

After dropping extreme outliers, the scatterplot of "engine_temperature(°C)" is shown.

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='engine_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df_cpy)

#### motor_torque(N-m)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'motor_torque(N-m)')
## Distribution of Feature "motor_torque(N-m)" of training and testing dataset are almost similar

In [None]:
sns.boxplot(y='motor_torque(N-m)',data=df)

In [None]:
sns.scatterplot(x='motor_torque(N-m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "motor_torque(N-m)", we found that the distribution of "motor_torque(N-m)" is almost same in training and testing dataset, so we are not changing anything in it.


#### generator_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'generator_temperature(°C)')
## Distribution of Feature "generator_temperature(°C)" of training and testing dataset are almost same

In [None]:
sns.boxplot(y='generator_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='generator_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "generator_temperature(°C)", we found that the distribution of "generator_temperature(°C)" is almost same in training and testing dataset, so we are not changing anything in it.


#### atmospheric_pressure(Pascal)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'atmospheric_pressure(Pascal)')
## Distribution of Feature "atmospheric_pressure(Pascal)" of training and testing dataset are almost same

In [None]:
sns.boxplot(y='atmospheric_pressure(Pascal)',data=df)

In [None]:
sns.scatterplot(x='atmospheric_pressure(Pascal)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "atmospheric_pressure(Pascal)", we found that the distribution of "atmospheric_pressure(Pascal)" is almost same in training and testing dataset, so we are not changing anything in it.


#### area_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'area_temperature(°C)')
## Distribution of Feature "area_temperature(°C)" of training and testing dataset are almost same

In [None]:
sns.boxplot(y='area_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='area_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
## Dropping extreme outliers
# Boolean-mask filtering replaces np.where() positions fed to the label-based drop(),
# which is only correct while the index is exactly 0..n-1.
# Rows with a missing area temperature compare False and are kept.
is_extreme = df_cpy['area_temperature(°C)'] < 10
df_cpy = df_cpy.loc[~is_extreme].reset_index(drop=True)

After dropping the extreme outliers, scatterplot of feature "area_temperature(°C)" is shown below

In [None]:
sns.scatterplot(x='area_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df_cpy)

#### windmill_body_temperature(°C)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'windmill_body_temperature(°C)')
## Distribution of Feature "windmill_body_temperature(°C)" of training and testing dataset is little bit different
## as density plot of windmill_body_temperature(°C) in testing dataset is broader than in training set

In [None]:
sns.boxplot(y='windmill_body_temperature(°C)',data=df)

In [None]:
sns.scatterplot(x='windmill_body_temperature(°C)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
df[df['windmill_body_temperature(°C)']< -90]

The feature "windmill_body_temperature(°C)" does not have the same distribution in the training and testing sets, and it is not strongly correlated with the target feature either. So, we decided to drop it in the end.

#### wind_direction(°)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'wind_direction(°)')
## Distribution of Feature "wind_direction(°)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='wind_direction(°)',data=df)

In [None]:
sns.scatterplot(x='wind_direction(°)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "wind_direction(°)", we found that the distribution of "wind_direction(°)" is almost same in training and testing dataset, so we are not changing anything in it.


#### resistance(ohm)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'resistance(ohm)')
## Distribution of Feature "resistance(ohm)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='resistance(ohm)',data=df)

In [None]:
sns.scatterplot(x='resistance(ohm)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "resistance(ohm)", we found that the distribution of "resistance(ohm)" is almost same in training and testing dataset, so we are not changing anything in it.


#### rotor_torque(N-m)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'rotor_torque(N-m)')
## Distribution of Feature "rotor_torque(N-m)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='rotor_torque(N-m)',data=df)

In [None]:
sns.scatterplot(x='rotor_torque(N-m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "rotor_torque(N-m)", we found that the distribution of "rotor_torque(N-m)" is almost same in training and testing dataset, so we are not changing anything in it.


#### blade_length(m)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'blade_length(m)')
## Distribution of Feature "blade_length(m)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='blade_length(m)',data=df)

In [None]:
sns.scatterplot(x='blade_length(m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

In [None]:
## Removing extreme outliers
# Boolean-mask filtering replaces np.where() positions fed to the label-based drop(),
# which is only correct while the index is exactly 0..n-1.
# Rows with a missing blade length compare False and are kept.
is_extreme = df_cpy['blade_length(m)'] < -20
df_cpy = df_cpy.loc[~is_extreme].reset_index(drop=True)

After removing extreme outliers, the scatterplot of feature "blade_length(m)" is shown

In [None]:
plt.figure(figsize=(20,10))
sns.scatterplot(x='blade_length(m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df_cpy)

#### blade_breadth(m)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'blade_breadth(m)')
## Distribution of Feature "blade_breadth(m)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='blade_breadth(m)',data=df)

In [None]:
sns.scatterplot(x='blade_breadth(m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "blade_breadth(m)", we found that the distribution of "blade_breadth(m)" is almost same in training and testing dataset, so we are not changing anything in it.


#### windmill_height(m)

In [None]:
## Comparing the density plot of features in training and testing set
comparing_train_and_test_feature(df,df_test,'windmill_height(m)')
## Distribution of Feature "windmill_height(m)" of training and testing dataset is almost same


In [None]:
sns.boxplot(y='windmill_height(m)',data=df)

In [None]:
sns.scatterplot(x='windmill_height(m)',y='windmill_generated_power(kW/h)',hue='cloud_level',data=df)

From the above density plot of feature "windmill_height(m)", we found that the distribution of "windmill_height(m)" is almost same in training and testing dataset, so we are not changing anything in it.


# Task 3 - Data preparation

In [None]:
df_cpy.drop(['generator_temperature(°C)','windmill_body_temperature(°C)'],inplace=True,axis=1)
df_test.drop(['generator_temperature(°C)','windmill_body_temperature(°C)'],inplace=True,axis=1)

In [None]:
df_cpy.info()

In [None]:
df_cpy.describe()

Since there are missing values in the testing data as well, we cannot simply drop the affected rows; instead we
replace the missing values using statistical measures such as the mean and the mode.

In [None]:
# Fill missing values in the training frame: the column mean for the numeric
# features and the most frequent value for the categorical cloud_level.
# Each result is assigned back to its column instead of calling
# df_cpy[col].fillna(..., inplace=True): that chained form operates on a temporary
# Series, which leaves df_cpy unchanged under pandas Copy-on-Write.
mean_imputed_columns = [
    'gearbox_temperature(°C)',
    'area_temperature(°C)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'blade_breadth(m)',
    'windmill_height(m)',
    'atmospheric_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'wind_speed(m/s)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'wind_direction(°)',
]
for column in mean_imputed_columns:
    df_cpy[column] = df_cpy[column].fillna(df_cpy[column].mean())
df_cpy['cloud_level'] = df_cpy['cloud_level'].fillna(df_cpy['cloud_level'].mode()[0])

In [None]:
# Fill missing values in the testing frame: the column mean for the numeric
# features and the most frequent value for the categorical cloud_level.
# Each result is assigned back to its column instead of calling
# df_test[col].fillna(..., inplace=True): that chained form operates on a temporary
# Series, which leaves df_test unchanged under pandas Copy-on-Write.
# NOTE(review): the fill values are computed from the test set itself; consider
# reusing the training-set statistics so both sets are imputed identically.
mean_imputed_columns = [
    'gearbox_temperature(°C)',
    'area_temperature(°C)',
    'rotor_torque(N-m)',
    'blade_length(m)',
    'blade_breadth(m)',
    'windmill_height(m)',
    'atmospheric_temperature(°C)',
    'atmospheric_pressure(Pascal)',
    'wind_speed(m/s)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'engine_temperature(°C)',
    'motor_torque(N-m)',
    'wind_direction(°)',
]
for column in mean_imputed_columns:
    df_test[column] = df_test[column].fillna(df_test[column].mean())
df_test['cloud_level'] = df_test['cloud_level'].fillna(df_test['cloud_level'].mode()[0])

In [None]:
df_cpy.info()

In [None]:
df_cpy.dropna(how='any',axis=0,inplace=True)

In [None]:
df_cpy.info()

In [None]:
df_test.info()

In [None]:
## Feature "cloud_level" is categorical with 3 unique values; encode it as ordered integers
# The result is assigned back rather than using replace(..., inplace=True) on
# df[col], because that chained form is a no-op under pandas Copy-on-Write.
# infer_objects() turns the replaced column into a numeric dtype explicitly instead
# of relying on replace()'s silent downcasting, which pandas has deprecated.
# replace() leaves already-encoded values untouched, so re-running the cell is safe.
cloud_level_codes = {'Medium': 2, 'Low': 1, 'Extremely Low': 0}
df_cpy['cloud_level'] = df_cpy['cloud_level'].replace(cloud_level_codes).infer_objects()
df_test['cloud_level'] = df_test['cloud_level'].replace(cloud_level_codes).infer_objects()

In [None]:
df_cpy['turbine_status'].value_counts()

In [None]:
## Using dummy variables for feature "turbine_status"
dum = ['turbine_status']
df_dum = pd.get_dummies(df_cpy[dum])
# Encode the test set against the training categories. A status that is missing
# from the test data gets an all-zero column, one that is unseen in training is
# dropped, and the column order always matches the training dummies. Encoding the
# two frames independently gives no such guarantee.
df_test_dum = pd.get_dummies(df_test[dum]).reindex(columns=df_dum.columns, fill_value=0)
df_dum

In [None]:
df_cpy = pd.concat([df_cpy,df_dum],axis=1)
df_test = pd.concat([df_test,df_test_dum],axis=1)

In [None]:
## Converting the feature "datetime" into pandas datetime format
df_cpy['datetime'] = pd.to_datetime(df_cpy['datetime'])
df_test['datetime'] = pd.to_datetime(df_test['datetime'])

In [None]:
## Extracting information from the feature "datetime" and creating new features from it
df_cpy['dmonth'] = df_cpy['datetime'].dt.month
df_cpy['dday'] = df_cpy['datetime'].dt.day
df_cpy['ddayofweek'] = df_cpy['datetime'].dt.dayofweek

df_test['dmonth'] = df_test['datetime'].dt.month
df_test['dday'] = df_test['datetime'].dt.day
df_test['ddayofweek'] = df_test['datetime'].dt.dayofweek

In [None]:
# Correlations between the numeric/boolean columns of the prepared training frame.
# The columns are selected explicitly because DataFrame.corr() no longer drops the
# string and datetime columns silently in pandas >= 2.0 (it raises instead).
# 'bool' keeps the dummy columns in the plot when get_dummies returns booleans.
corr = df_cpy.select_dtypes(include=[np.number, 'bool']).corr()
plt.figure(figsize=(20,10))
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin bool replaces np.bool, an alias that NumPy 1.24 removed.
mask = np.zeros_like(corr,dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

In [None]:
# Correlations between the numeric/boolean columns of the prepared testing frame.
# The columns are selected explicitly because DataFrame.corr() no longer drops the
# string and datetime columns silently in pandas >= 2.0 (it raises instead).
# 'bool' keeps the dummy columns in the plot when get_dummies returns booleans.
corr = df_test.select_dtypes(include=[np.number, 'bool']).corr()
plt.figure(figsize=(20,10))
# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin bool replaces np.bool, an alias that NumPy 1.24 removed.
mask = np.zeros_like(corr,dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

In [None]:
df_cpy.info()

# Task 4 - Data modelling

In [None]:
# Training features / target, and the matching feature frame for the test set.
X = df_cpy.drop(['tracking_id','datetime','windmill_generated_power(kW/h)','turbine_status'],axis=1)
Y = df_cpy['windmill_generated_power(kW/h)']
X_test = df_test.drop(['tracking_id','datetime','turbine_status'],axis=1)
# Both frames are converted to plain arrays by the scaler in the next step, after
# which columns are matched by position only. Fail loudly here if the two feature
# sets differ, then put the test columns in the training order.
mismatched_features = set(X.columns) ^ set(X_test.columns)
if mismatched_features:
    raise ValueError(f"Train/test feature columns differ: {sorted(mismatched_features)}")
X_test = X_test[X.columns]
print(X.shape,Y.shape)
print(X_test.shape)

In [None]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the training features and apply the same fitted transform to
# the test features; both X and X_test are rebound to the transformed output.
# NOTE(review): the scaler is fitted on all of X before the train/validation split
# in the next cell, so the validation rows contribute to the scaling statistics.
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,train_size=0.8,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score
lr = LinearRegression()
lr.fit(x_train,y_train)
y_train_pred = lr.predict(x_train)
y_test_pred = lr.predict(x_test)
print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

### Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV
ridge_model = RidgeCV(scoring="r2",
                          alphas=[0.0001,0.0005,0.001,0.005,0.01,0.1,1.0,10],cv=5)
ridge_model.fit(x_train,y_train)
y_train_pred = ridge_model.predict(x_train)
y_test_pred = ridge_model.predict(x_test)
print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

### Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV
lasso_model = LassoCV(alphas=[0.0001,0.0005,0.001,0.005,0.01,0.1,1.0,10],cv=5)
lasso_model.fit(x_train,y_train)
y_train_pred = lasso_model.predict(x_train)
y_test_pred = lasso_model.predict(x_test)
print(r2_score(y_true=y_train,y_pred=y_train_pred))
print(r2_score(y_true=y_test,y_pred=y_test_pred))

### ElasticNet

In [None]:
from sklearn.linear_model import ElasticNetCV
enet_model = ElasticNetCV(l1_ratio = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
                    alphas = [1, 0.1, 0.01, 0.001, 0.0005], cv=5)
enet_model.fit(x_train, y_train)

# predict
y_train_pred = enet_model.predict(x_train)
y_test_pred = enet_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### ExtraTreesRegressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# The criterion is left at its default, which is the squared-error criterion in
# every scikit-learn release. The explicit spelling 'mse' was removed in 1.2 and
# its replacement 'squared_error' does not exist before 1.0, so omitting the
# argument is the only form that works on all versions.
extra_model = ExtraTreesRegressor(random_state=0, n_jobs=-1,
                                min_samples_leaf=1, max_depth=20,
                                min_samples_split=3, n_estimators=1000
                               )

extra_model.fit(x_train, y_train)

# R^2 on the training split, then on the held-out validation split
y_train_pred = extra_model.predict(x_train)
y_test_pred = extra_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### RandomForestRegressor 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A fixed seed makes the reported scores (and the ensemble built from this model
# later) reproducible across runs, matching the other tree models in this notebook.
rf = RandomForestRegressor(random_state=0)
rf.fit(x_train,y_train)
# R^2 on the training split, then on the held-out validation split
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# 'squared_error' is the current name of the criterion formerly spelled 'mse';
# scikit-learn deprecated the old spelling in 1.0 and removed it in 1.2.
gb_model = GradientBoostingRegressor(criterion='squared_error',random_state=0,max_depth=5,
                                     n_estimators=500,min_samples_split=2,min_samples_leaf=2)
gb_model.fit(x_train,y_train)
# R^2 on the training split, then on the held-out validation split
y_train_pred = gb_model.predict(x_train)
y_test_pred = gb_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### XGBRegressor

In [None]:
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)
xgb.fit(x_train,y_train)
y_train_pred = xgb.predict(x_train)
y_test_pred = xgb.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

## Mixed model of three different models

In [None]:
import sklearn.base as skb
class MixModel(skb.BaseEstimator,skb.RegressorMixin,skb.TransformerMixin):
    """Averaging ensemble over a list of regressors.

    ``fit`` trains a fresh clone of every estimator passed as ``algs`` (the
    originals are never fitted by this class), and ``predict`` returns the
    unweighted mean of the clones' predictions.
    """

    def __init__(self,algs):
        # Unfitted estimator templates; the fitted clones are stored in ``algs_``.
        self.algs = algs

    def fit(self,X,y):
        """Clone each estimator in ``algs``, fit every clone on (X, y), return self."""
        self.algs_ = list(map(skb.clone, self.algs))
        for member in self.algs_:
            member.fit(X,y)
        return self

    def predict(self,X):
        """Return the per-sample mean of all fitted members' predictions for X."""
        member_predictions = [member.predict(X) for member in self.algs_]
        # One column per member, one row per sample; average across the members.
        return np.column_stack(member_predictions).mean(axis=1)

In [None]:
## Using mixed model of random forest, gradient boosting and XGB Regressor
mixed_model = MixModel(algs = [xgb,rf,gb_model])
mixed_model.fit(x_train, y_train)
y_train_pred = mixed_model.predict(x_train)
y_test_pred = mixed_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

# Task 5 - Test Evaluation and Submission

In [None]:
df_test = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv')

In [None]:
# Take an explicit copy: the prediction column is added to df_sub later, and
# assigning into a bare column slice of df_test is a chained assignment
# (SettingWithCopyWarning) whose target frame is ambiguous.
df_sub = df_test[['tracking_id','datetime']].copy()

In [None]:
results = mixed_model.predict(X_test)

In [None]:
results

In [None]:
df_sub['windmill_generated_power(kW/h)'] = results

In [None]:
df_sub.to_csv('./sub.csv',header=True,index=False)
df_sub

# Last Notes
This dataset does not reflect a real-world scenario and appears to be synthetically generated, since some features, such as blade_length and wind_speed, take negative values, which are physically meaningless.
The testing set also appears to be a subset of (or drawn from the same source as) the training set, since both have almost the same distribution for nearly every feature. Removing all outliers from the training set would make its distribution differ substantially from that of the testing set and lower the score on the testing set; therefore we removed only the extreme outliers, so that the distributions remain almost the same.

Here, we used models such as Linear Regression, Lasso Regression, Ridge Regression, Random Forest Regressor, ExtraTrees Regressor, Gradient Boosting Regressor and XGB Regressor with manual hyperparameter tuning. For me, Random Forest, Gradient Boosting and XGB Regressor performed best, and hence I used a mixed model of these three, which gave me the best results.
I would suggest doing more feature engineering and hyperparameter tuning with different models, which may bring even better results.

If you like my work, show your appreciation with an upvote and share this notebook.

Thank You............!!!