## Importing required modules

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from xgboost import XGBRegressor
from sklearn.metrics import r2_score

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


## Read in the data

In [None]:
# Load the training split from the competition input folder and preview it.
train = pd.read_csv(
    "/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv"
)
train.head()

In [None]:
# Load the test split from the competition input folder and preview it.
test = pd.read_csv(
    "/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv"
)
test.head()

# Feature Engineering

### Handling NaN values

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Count the missing values in each column of the test frame.
test.isna().sum()

### Encoding each column with categorical values

In [None]:
# Label-encode every object-typed column of the training frame, in place.
# `s` keeps the names of the encoded columns so later cells can revisit them.
# NOTE(review): missing values are passed straight to LabelEncoder here; how they
# are encoded depends on the installed scikit-learn version - confirm.
s = [f for f in train.columns if train[f].dtype == 'object']
for f in s:
    lbl = LabelEncoder()
    raw_values = list(train[f].values)
    train[f] = lbl.fit(raw_values).transform(raw_values)

train.head()


In [None]:
# Label-encode every object-typed column of the test frame, in place.
# `s_test` keeps the names of the encoded columns.
# NOTE(review): a fresh encoder is fitted on the test values, so the integer codes
# are not guaranteed to match the codes assigned to the training frame - confirm
# that both frames contain the same set of categories.
s_test = [f for f in test.columns if test[f].dtype == 'object']
for f in s_test:
    lbl = LabelEncoder()
    raw_values = list(test[f].values)
    test[f] = lbl.fit(raw_values).transform(raw_values)

## Handle NaN values for categorical data

In [None]:
# train dataset
# The columns listed in `s` hold label-encoded categories, so any remaining gap is
# filled with the most frequent code. The default "mean" strategy would produce a
# fractional code that corresponds to no category.
imputer = SimpleImputer(strategy="most_frequent")
for i in s:
    # SimpleImputer works on 2-D input and returns a 2-D array: feed it a
    # one-column float frame and flatten the result before assigning it back.
    data = train[[i]].astype(float)
    train[i] = imputer.fit_transform(data).ravel()

In [None]:
# test dataset
# Iterate over `s_test` (the columns that were label-encoded in the test frame)
# rather than the training list `s`, so only columns that exist in `test` and were
# actually encoded are touched.
# Gaps are filled with the most frequent code; the default "mean" strategy would
# produce a fractional code that corresponds to no category.
imputer = SimpleImputer(strategy="most_frequent")
for i in s_test:
    # SimpleImputer works on 2-D input and returns a 2-D array: feed it a
    # one-column float frame and flatten the result before assigning it back.
    data = test[[i]].astype(float)
    test[i] = imputer.fit_transform(data).ravel()

#### All null values in the categorical variables have been removed

In [None]:
# Remaining missing values per column in the training frame.
train.isna().sum()

In [None]:
# Remaining missing values per column in the test frame.
test.isna().sum()

## Finding correlation

In [None]:
def get_redundant_pairs(train):
    """Return the (column, column) pairs on or below the diagonal of the correlation matrix.

    Every column is paired with itself and with each column that precedes it, so
    dropping these pairs leaves each unordered pair of distinct columns exactly once.
    """
    names = train.columns
    width = train.shape[1]
    return {
        (names[row], names[col])
        for row in range(width)
        for col in range(row + 1)
    }

def get_top_abs_correlations(train, n=5):
    """Return the `n` largest absolute pairwise correlations between columns of `train`.

    The result is a Series indexed by (column, column) pairs, sorted in descending
    order, with self-pairs and mirrored duplicates removed.
    """
    redundant = get_redundant_pairs(train)
    pairwise = train.corr().abs().unstack()
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

# Show the three most strongly correlated column pairs of the training frame.
print("Top Absolute Correlations")
print(get_top_abs_correlations(train, n=3))

### Removing Unnecessary Columns

In [None]:
# Build the modelling frame: the training data without three columns that are
# not used as model inputs.
df = train.drop(columns=['tracking_id', 'datetime', "motor_torque(N-m)"])
df.head()

In [None]:
# Apply the same column removal to the test data so both frames stay aligned.
df_test = test.drop(columns=['tracking_id', 'datetime', "motor_torque(N-m)"])
df_test.head()

## Handling NaN values for continuous data using random sampling

In [None]:
def impute_nan(df,variable):
    """Fill the missing values of ``df[variable]`` in place with random observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is updated in place.
    variable : str
        Name of the column whose missing values should be filled.

    Returns
    -------
    None

    Notes
    -----
    Draws use ``random_state=0`` so repeated runs give the same result. A column
    with no missing values, or with no observed values to draw from, is left
    untouched.
    """
    missing_mask = df[variable].isnull()
    n_missing = int(missing_mask.sum())
    observed = df[variable].dropna()

    # Nothing to fill, or nothing to draw from (all-NaN column).
    if n_missing == 0 or observed.empty:
        return

    # Sample without replacement whenever possible (same draws as before). With
    # more gaps than observed values that would raise, so allow repeats instead.
    random_sample = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=0,
    )
    # pandas aligns on the index when assigning, so relabel the draws with the
    # index of the rows that are being filled.
    random_sample.index = df.index[missing_mask.to_numpy()]
    df.loc[missing_mask, variable] = random_sample


In [None]:
# Randomly impute only the columns that actually contain missing values.
# `.isnull` has to be called and reduced with `.any()`: the bare attribute is a
# bound method, which is always truthy.
for i in df.columns.to_list():
    if df[i].isnull().any():
        impute_nan(df,i)

In [None]:
# Randomly impute only the test columns that actually contain missing values.
# `.isnull` has to be called and reduced with `.any()`: the bare attribute is a
# bound method, which is always truthy.
for i in df_test.columns.to_list():
    if df_test[i].isnull().any():
        impute_nan(df_test,i)

#### All NaN values are filled

In [None]:
# Missing values left per column in the test feature frame.
df_test.isna().sum()

In [None]:
# Missing values left per column in the training feature frame.
df.isna().sum()

## Building up the model

Creating X and Y 

In [None]:
# Separate the target column (Y) from the remaining feature columns (X).
Y = df['windmill_generated_power(kW/h)']
X = df.drop(columns=['windmill_generated_power(kW/h)'])

Scaling down the data

In [None]:
# Fit the scaler on the training features only, then apply those same statistics
# to the test features. Refitting on the test set would scale the two sets
# differently, so the model would see test inputs on another scale than it was
# trained on.
# NOTE(review): assumes `df_test` has the same columns, in the same order, as `X` - confirm.
scaler = RobustScaler()
X = scaler.fit_transform(X)
df_test = scaler.transform(df_test)

Splitting data into train and test set

In [None]:
# Hold out 14% of the labelled rows for validation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.86, random_state=42
)
# Report the shapes of the training pair, then of the held-out pair.
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, target.shape)

Building the model 

In [None]:
# Gradient-boosted tree regressor with L1/L2 regularisation.
xgb = XGBRegressor(
    booster='gbtree',
    n_estimators=600,
    max_depth=6,
    learning_rate=0.1,
    reg_alpha=0.4,
    reg_lambda=0.3,
    n_jobs=10,
)
xgb.fit(x_train, y_train)

# Predict on both splits so the training and held-out scores can be compared.
y_train_pred = xgb.predict(x_train)
y_test_pred = xgb.predict(x_test)

# R^2 on the training split first, then on the held-out split.
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

# Getting the results on test data

In [None]:
# Build the submission frame from the identifier columns of the raw test file.
# `test` itself was label-encoded in place earlier in the notebook, so its
# object-typed columns no longer hold the original values; re-reading the two
# columns restores them. Row order is unchanged because no rows were dropped or
# reordered along the way.
# `.copy()` makes this an independent frame, so adding a column below does not
# trigger a SettingWithCopyWarning.
df_sub = pd.read_csv(
    "/kaggle/input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv",
    usecols=['tracking_id', 'datetime'],
)[['tracking_id', 'datetime']].copy()

# save the predictions on new dataset
results = xgb.predict(df_test)
df_sub['windmill_generated_power(kW/h)'] = results

In [None]:
# Preview the first rows of the submission frame.
df_sub.head(5)

# Thanks