In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, model_selection, metrics

In [None]:
# List every file available under the Kaggle input directory.
# Requires `os`, which must be imported in the top import cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:

# Load the training CSV into `data`; every later cell derives its frames from it.
data = pd.read_csv("/kaggle/input/dataquest2020/energy_train.csv")
# Preview the first rows.
data.head()

In [None]:
# Preview the first rows again (same display as the previous cell).
data.head()


In [None]:
# Report the dimensions of the training frame.
n_rows, n_cols = data.shape[0], data.shape[1]
print('The number of rows in dataset is - ', n_rows)
print('The number of columns in dataset is - ', n_cols)

In [None]:
# Missing-value count per column, smallest first (ascending is the default order).
data.isna().sum().sort_values()

In [None]:
# Impute the three columns that contain missing values with their column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form is chained assignment, which emits a FutureWarning
# on recent pandas and does not modify `data` at all under copy-on-write.
for col in ['degree_C1', 'degree_C3', 'moisture_9']:
    data[col] = data[col].fillna(data[col].mean())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the remaining 75% train the models.
# The fixed random_state keeps the split reproducible.
train, test = train_test_split(
    data,
    test_size=0.25,
    random_state=40,
)

In [None]:
# Column groups used throughout the notebook.
col_time = ["date"]

# degree_C1 .. degree_C9
col_temp = ["degree_C%d" % i for i in range(1, 10)]

# moisture_1 .. moisture_9
col_hum = ["moisture_%d" % i for i in range(1, 10)]

col_weather = [
    "degree_Cout",
    "dew_index",
    "moisture_out",
    "Pressure",
    "Wind",
    "Clarity",
]

col_light = ["luminousity"]

col_randoms = ["random_variable_1", "random_variable_2"]

col_target = ["WattHour"]

# Separate the independent variables from the dependent variable (training split only).
feature_cols = col_time + col_temp + col_hum + col_weather + col_light + col_randoms
feature_vars = train[feature_cols]
target_vars = train[col_target]

In [None]:
# This column has many zero entries, so it is ignored for the rest of the modelling.
# Rebind the name instead of dropping in place: `feature_vars` is a slice of `train`,
# so an in-place drop triggers SettingWithCopyWarning and is not safely re-runnable.
feature_vars = feature_vars.drop(columns=['luminousity'])

In [None]:
# Show the first two rows of the feature frame.
feature_vars.head(2)


In [None]:

# NOTE: despite its name, 'WEEKDAY' is 1.0 for Saturday/Sunday (dayofweek 5 or 6)
# and 0.0 for Monday-Friday.
day_of_week = pd.to_datetime(data['date']).dt.dayofweek
data['WEEKDAY'] = day_of_week.isin([5, 6]).astype(float)
# There are 5472 weekend recordings 
data['WEEKDAY'].value_counts()

In [None]:
# Histogram of all the features to understand the distribution
# (20 bins per feature; the trailing `;` suppresses the axes repr).
feature_vars.hist(bins = 20 , figsize= (12,16)) ;

In [None]:
# 2x2 grid of distribution plots for four selected features.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the target seaborn version is confirmed.
f, ax = plt.subplots(2, 2, figsize=(12, 8))
panel_cols = ["degree_C6", "moisture_out", "Clarity", "Wind"]
vis1, vis2, vis3, vis4 = [
    sns.distplot(feature_vars[col], bins=10, ax=panel)
    for col, panel in zip(panel_cols, ax.flat)
]

In [None]:
# Distribution of the target (appliance consumption) in the training split.
f = plt.figure(figsize=(12, 5))
target_ax = f.gca()
target_ax.set_xlabel('Appliance consumption in Wh')
target_ax.set_ylabel('Frequency')
sns.distplot(target_vars, bins=10, ax=target_ax);

In [None]:
# Share of training rows whose consumption is at most 200 Wh.
print('Percentage of the appliance consumption is less than 200 Wh')
rows_at_most_200 = (target_vars <= 200).sum()
print((rows_at_most_200 / len(target_vars)) * 100)

In [None]:

# Correlation between temperature, humidity, weather, target and random columns
# (training split only).
train_corr = train[col_temp + col_hum + col_weather + col_target + col_randoms]
corr = train_corr.corr()

# Hide the diagonal and the upper triangle, which only repeat the lower triangle.
# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(16, 14))
# Annotated heat map. Let seaborn place the labels: it centres ticks on each cell,
# whereas setting ticks at integer positions by hand shifts every label onto a
# cell edge.
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    ax=ax,
)
plt.show()

In [None]:
def get_redundant_pairs(df):
    """Return the (column_i, column_j) label pairs with j <= i.

    These are the diagonal and lower-triangular entries of the correlation
    matrix of `df`, i.e. the pairs that duplicate information already present
    in the upper triangle.
    """
    labels = df.columns
    return {
        (labels[row], labels[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

# Function to get top correlations 

def get_top_abs_correlations(df, n=5):
    """Return the `n` largest absolute pairwise correlations among `df`'s columns.

    The result is a Series indexed by (column, column) pairs, sorted in
    descending order, with self-pairs and mirrored duplicates removed.
    """
    abs_corr = df.corr().abs().unstack()
    unique_pairs = abs_corr.drop(labels=get_redundant_pairs(df))
    ranked = unique_pairs.sort_values(ascending=False)
    return ranked[0:n]

# Print the 40 strongest absolute correlations found in train_corr.
print("Top Absolute Correlations")
print(get_top_abs_correlations(train_corr, 40))

In [None]:
# Preview the training feature columns. `train_X` is only defined in the next
# cell, so referencing it here fails on a fresh Restart & Run All.
train[feature_vars.columns].head()

In [None]:
# Split the training dataset into independent and dependent variables.
# The raw 'date' string is not a model input. Drop it while building the frame
# rather than in place on a slice of `train` (avoids SettingWithCopyWarning).
train_X = train[feature_vars.columns].drop(columns=['date'])
train_y = train[target_vars.columns]

In [None]:
# Split the testing dataset into independent and dependent variables.
# Drop 'date' here as well so test_X has the same columns as train_X; otherwise
# the string column reaches the scaler later and makes it fail.
test_X = test[feature_vars.columns].drop(columns=['date'])
test_y = test[target_vars.columns]

In [None]:
# Remove the same columns from both splits. Rebind instead of dropping in place:
# both frames are column selections of other frames, so in-place mutation raises
# SettingWithCopyWarning and makes the cell fail when re-run.
cols_to_remove = ["random_variable_1", "random_variable_2", "degree_C9", "degree_C6", "Clarity"]
train_X = train_X.drop(columns=cols_to_remove)
test_X = test_X.drop(columns=cols_to_remove)

In [None]:
# Columns remaining in the training feature frame.
train_X.columns


In [None]:
# Columns remaining in the testing feature frame.
test_X.columns


In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Rebuild both splits as: selected feature columns + target column.
train = train[list(train_X.columns.values) + col_target]

# Use exactly the training layout for the test split, because the scaler fitted
# on `train` can only transform a frame with the same columns in the same order.
test = test[list(train.columns)]

# Fit the scaler on the training split only ...
sc_train = pd.DataFrame(sc.fit_transform(train), columns=train.columns, index=train.index)

# ... and reuse those statistics for the test split. Calling fit_transform again
# here would rescale the test data with its own mean/std, leaking test
# information and putting the two splits on different scales.
sc_test = pd.DataFrame(sc.transform(test), columns=test.columns, index=test.index)

In [None]:
# Preview the scaled training frame.
sc_train.head()


In [None]:
# Preview the scaled testing frame.
sc_test.head()


In [None]:
# Separate the scaled target from the scaled features in both splits.
# The target column of this dataset is col_target[0] ('WattHour'); there is no
# 'Appliances' column, so using that name raises KeyError.
target_col = col_target[0]

train_X = sc_train.drop(columns=[target_col])
train_y = sc_train[target_col]

test_X = sc_test.drop(columns=[target_col])
test_y = sc_test[target_col]

In [None]:
# Preview the scaled training features.
train_X.head()


In [None]:
# Positions of the features ordered by ascending importance.
# NOTE(review): `grid_search` is not defined in any cell visible in this notebook;
# confirm the model-selection cell exists, otherwise this raises NameError.
feature_indices = np.argsort(grid_search.best_estimator_.feature_importances_)


In [None]:
# Bar chart of feature importances, most important first.
# NOTE(review): `grid_search` is not defined in any cell visible in this notebook;
# confirm the model-selection cell exists before relying on this plot.
importances = grid_search.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
names = [train_X.columns[i] for i in indices]

bar_positions = range(train_X.shape[1])

plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
# One bar per feature, labelled with the feature name on the x-axis.
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, names, rotation=90)
plt.show()

In [None]:
# --- LSTM model ---

In [None]:
from math import sqrt
from sklearn.model_selection import train_test_split
from numpy import concatenate
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from pandas import to_datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from matplotlib import pyplot

In [None]:
# Convert a (multivariate) series into a supervised-learning table.
def series_to_supervised(dataset, n_in=1, n_out=1, dropnan=True):
    """Frame a series as lagged inputs followed by forecast outputs.

    Parameters
    ----------
    dataset : list or 2-D array-like
        Observations in time order. A plain list is treated as one variable;
        anything else must expose ``shape[1]`` as the number of variables.
    n_in : int
        Number of lag steps (t-n_in .. t-1) to include as inputs.
    n_out : int
        Number of forecast steps (t .. t+n_out-1) to include as outputs.
    dropnan : bool
        When True, rows containing NaN introduced by shifting are removed.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<k>(t-<lag>)``, ``var<k>(t)`` and
        ``var<k>(t+<step>)``, lag blocks first (oldest lag first).
    """
    num_vars = 1 if type(dataset) is list else dataset.shape[1]
    frame = DataFrame(dataset)
    var_ids = range(1, num_vars + 1)
    lags = range(n_in, 0, -1)
    steps = range(n_out)

    # Input sequence (t-n, ... t-1): one shifted copy of the frame per lag.
    shifted = [frame.shift(lag) for lag in lags]
    labels = ['var%d(t-%d)' % (var, lag) for lag in lags for var in var_ids]

    # Forecast sequence (t, t+1, ... t+n): shift backwards for future steps.
    shifted += [frame.shift(-step) for step in steps]
    labels += [
        ('var%d(t)' % var) if step == 0 else ('var%d(t+%d)' % (var, step))
        for step in steps
        for var in var_ids
    ]

    # Put it all together side by side.
    agg = concat(shifted, axis=1)
    agg.columns = labels

    # Shifting leaves NaN at the edges; optionally remove those rows.
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
# The 21 input features used by the LSTM, in the order they are fed to the model.
feature = [
    "moisture_out", "moisture_8", "moisture_1", "degree_C3", "moisture_3",
    "degree_C2", "Pressure", "moisture_2", "moisture_7", "degree_C8",
    "moisture_6", "moisture_4", "moisture_5", "degree_Cout", "moisture_9",
    "degree_C4", "degree_C7", "dew_index", "Wind", "degree_C1", "degree_C5",
]

# Column order: target first, then the timestamp, then the features.
lstm_columns = col_target + col_time + feature
data1 = data[lstm_columns]

In [None]:
import pandas as pd

# Parse the timestamp and use it as the index. Build a new frame with assign()
# instead of writing into `data1`, which is a column selection of `data`
# (writing into it triggers SettingWithCopyWarning).
data1 = data1.assign(date=pd.to_datetime(data1["date"])).set_index("date")
data1.head()

In [None]:
# Display the date-indexed modelling frame.
# NOTE(review): this renders the whole frame; data1.head() is usually enough.
data1

In [None]:
# Raw NumPy matrix of data1 (the date is the index, so it is not a column).
values=data1.values
# Show its dimensions.
values.shape

In [None]:
# Rescale every column of `values` to the [0, 1] range.
# `scaler` keeps the fitted per-column min/max for later inverse transforms.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(values)
scaled = scaler.transform(values)

In [None]:
# Build the supervised table with one lag step (t-1) and one output step (t).
reframed = series_to_supervised(scaled, 1, 1)


In [None]:
# Preview the supervised table.
reframed.head()


In [None]:
# Drop the columns at positions 22..43. With 22 variables and one lag step these
# are all of the var*(t) columns, leaving only the var*(t-1) block.
reframed.drop(columns=reframed.columns[22:44], inplace=True)

In [None]:
# Rebind `values` to the matrix of the reduced supervised table.
values = reframed.values


In [None]:
# Column 0 is the scaled target (WattHour was placed first when data1 was built);
# columns 1..21 are the 21 scaled features.
# The previous slices used the target as an input and the last feature
# (degree_C5) as the label, which contradicts the later inverse-scaling cells
# that put the prediction back in column 0 followed by 21 feature columns.
X = values[:, 1:]
Y = values[:, 0]

In [None]:
# Dimensions of the input matrix.
X.shape


In [None]:
# Dimensions of the label vector.
np.shape(Y)

In [None]:
# Hold out 30% of the samples for validation. Fix random_state (same seed as the
# earlier split) so the reported metrics are reproducible between runs.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.3, random_state=40)

# Reshape input to be 3D [samples, timesteps, features] with a single timestep,
# as expected by the LSTM layer.
X_Train = X_Train.reshape((X_Train.shape[0], 1, X_Train.shape[1]))
X_Test = X_Test.reshape((X_Test.shape[0], 1, X_Test.shape[1]))

In [None]:
# Print the validation inputs, then display their dimensions.
print(X_Test)
X_Test.shape

In [None]:
# Network architecture: one 50-unit LSTM layer followed by a single-output dense layer.
timesteps, n_features = X_Train.shape[1], X_Train.shape[2]
model = Sequential()
model.add(LSTM(50, input_shape=(timesteps, n_features)))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')

# Fit, tracking the loss on the held-out split after every epoch.
history = model.fit(
    X_Train,
    Y_Train,
    epochs=70,
    batch_size=10,
    validation_data=(X_Test, Y_Test),
    verbose=2,
    shuffle=False,
)

In [None]:
# Training vs. validation loss per epoch.
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Test')):
    pyplot.plot(history.history[history_key], label=curve_label)
pyplot.legend()
pyplot.show()

In [None]:
# Wrap the scaled training matrix in a DataFrame to preview its first rows.
sca=DataFrame(scaled)
sca.head()

In [None]:
# Loss (MSE, as configured in compile) on the held-out split.
lstm_test_mse = model.evaluate(X_Test, Y_Test, batch_size=1)
print('Test MSE: %f' % (lstm_test_mse,))

In [None]:
from sklearn.metrics import r2_score

# R2 on both splits, computed on the scaled target.
y_pred_test_lstm = model.predict(X_Test)
y_train_pred_lstm = model.predict(X_Train)
train_r2 = r2_score(Y_Train, y_train_pred_lstm)
test_r2 = r2_score(Y_Test, y_pred_test_lstm)
print("The R2 score on the Train set is:\t{:0.3f}".format(train_r2))
print("The R2 score on the Test set is:\t{:0.3f}".format(test_r2))


In [None]:
# Compare the scaled true values with the LSTM predictions on the held-out split.
lstm_y_pred_test = model.predict(X_Test)
plt.figure(figsize=(10, 6))
plt.plot(Y_Test, label='True')
# Plot the prediction computed in this cell rather than a variable left over
# from another cell, so the cell does not depend on hidden state.
plt.plot(lstm_y_pred_test, label='LSTM')
plt.title("LSTM's Prediction")
plt.xlabel('Observation')
plt.ylabel('Appliances scaled')
plt.legend()
plt.show();

In [None]:
# Make a prediction on the held-out split.
yhat = model.predict(X_Test)

# Flatten [samples, 1, features] to [samples, features] under NEW names. The
# original overwrote X_Test and Y_Test, so re-running this cell (or any earlier
# cell that expects the 3D X_Test) failed. Using -1 also removes the hard-coded
# feature count.
X_Test_2d = X_Test.reshape((X_Test.shape[0], -1))
Y_Test_col = np.reshape(Y_Test, (-1, 1))

# Invert scaling for the forecast: rebuild a row in the scaler's layout
# (column 0 followed by the feature columns) and keep column 0.
inv_yhat = scaler.inverse_transform(np.concatenate((yhat, X_Test_2d), axis=1))[:, 0]

# Invert scaling for the actual values in the same way.
inv_y = scaler.inverse_transform(np.concatenate((Y_Test_col, X_Test_2d), axis=1))[:, 0]

# RMSE in the original units.
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

In [None]:
# Load the competition test set.
data2 = pd.read_csv("/kaggle/input/dataquest2020/energy_test.csv")

# Fill gaps with the TRAINING column means. numeric_only=True is required:
# `data` still contains the string 'date' column, and DataFrame.mean() raises
# TypeError on non-numeric columns in pandas >= 2.0.
data2 = data2.fillna(data.mean(numeric_only=True))

# Keep the timestamp plus the same 21 features (same order) used for training.
data2 = data2[col_time + feature]

In [None]:
# Append a placeholder target column of ones as the LAST column (the test set has
# no target). Size it from the frame instead of hard-coding 4375 rows, and build
# a new frame with assign() rather than writing into a column selection.
data2 = data2.assign(Watthour=np.ones(len(data2)))

In [None]:
# Parse the timestamp and move it into the index, then preview the result.
parsed_dates = pd.to_datetime(data2["date"])
data2 = data2.assign(date=parsed_dates).set_index(['date'], drop=True)
data2.head()

In [None]:
# Raw NumPy matrix of the test frame (date is the index, so it is not a column).
values2=data2.values
# Show its dimensions.
values2.shape

In [None]:
# Scale the test matrix with the min/max learned from the TRAINING data.
# Calling fit_transform here would refit `scaler` on the test set, which changes
# the feature scaling the model was trained with and breaks the later
# inverse_transform of the predictions.
#
# Layouts differ: the scaler was fitted on [target, features...] while values2
# is [features..., placeholder target]. Move the placeholder to the front for
# the transform, then restore the original order so downstream slicing is
# unchanged.
values2_train_layout = np.column_stack((values2[:, -1], values2[:, :-1]))
scaled2_train_layout = scaler.transform(values2_train_layout)
scaled2 = np.column_stack((scaled2_train_layout[:, 1:], scaled2_train_layout[:, 0]))

In [None]:
# Dimensions of the scaled test matrix.
scaled2.shape

In [None]:
# Build the supervised table for the test set (one lag step, one output step).
# With dropnan left at its default, the first row is removed because its lagged
# values are NaN.
reframed2 = series_to_supervised(scaled2, 1, 1)

In [None]:
# Preview the supervised test table.
reframed2.head()

In [None]:
# Drop the columns at positions 22..43 (the var*(t) block), keeping only the
# var*(t-1) columns, exactly as was done for the training table.
reframed2.drop(columns=reframed2.columns[22:44], inplace=True)

In [None]:
# Rebind `values2` to the matrix of the reduced supervised test table.
values2 = reframed2.values

In [None]:
# Dimensions of the supervised test matrix.
values2.shape

In [None]:
# Keep the first 21 columns as model inputs. Given how data2 was assembled
# (21 features followed by the placeholder target), these are the feature columns.
X_Test_2 = values2[:,:21]

In [None]:
# Dimensions of the test input matrix.
X_Test_2.shape

In [None]:
# Reshape to [samples, 1, features] for the LSTM.
# NOTE: this rebinds X_Test, replacing the validation split created earlier.
X_Test = X_Test_2.reshape((X_Test_2.shape[0], 1, X_Test_2.shape[1]))

In [None]:
# Dimensions of the reshaped test inputs.
X_Test.shape

In [None]:
# Predict on the competition test inputs.
yhat = model.predict(X_Test)

# Flatten [samples, 1, features] to [samples, features] under a new name; the
# original overwrote X_Test, so the cell could not be re-run, and it hard-coded
# the feature count.
X_Test_flat = X_Test.reshape((X_Test.shape[0], -1))

# Invert scaling for the forecast: prediction in column 0 followed by the
# feature columns, then keep column 0.
inv_yhat = scaler.inverse_transform(np.concatenate((yhat, X_Test_flat), axis=1))[:, 0]

In [None]:
# Number of predictions produced.
np.shape(inv_yhat)

In [None]:
# One-element float array holding the fill-in value 90.
b = np.full((1,), 90.0)

In [None]:
# Append the fill-in value after the predictions, adding one element to make up
# for the row removed when the supervised table was built.
y=np.concatenate((inv_yhat, b))

In [None]:
# Truncate predictions to integers.
pred = y.astype(int)

sample = pd.read_csv("/kaggle/input/dataquest2020/sample_submission.csv")

# Snap each prediction to a multiple of 10: a last digit of 8 or 9 rounds up,
# anything else rounds down. Vectorised replacement for the row-by-row loop,
# which wrote through chained indexing (submission_df["WattHour"][i] = ...).
# That form raises SettingWithCopyWarning, does nothing under pandas
# copy-on-write, and hard-coded 4375 rows.
last_digit = pred % 10
pred_rounded = np.where(last_digit > 7, pred + (10 - last_digit), pred - last_digit)

# `pred` must have one value per row of the sample submission.
submission_df = pd.DataFrame({'id': sample['id'], 'WattHour': pred_rounded})

print(submission_df)
submission_df.to_csv('output.csv', header=True, index=False)

In [None]:
# Display the integer predictions (before rounding to multiples of 10).
pred