In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, gc, warnings
import random
import datetime

from tqdm.notebook import tqdm
# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# from plotly.offline import init_notebook_mode, iplot
# init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.offline as offline
# offline.init_notebook_mode()

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn

import lightgbm as lgb

import pickle

warnings.filterwarnings('ignore')

In [None]:
path = '../input/ashrae-energy-prediction'
# Show every file shipped with the competition data set, one full path per line.
for file_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk(path)
    for name in names
):
    print(file_path)

In [None]:
# Weather columns that are skipped while reading the weather table
# (unimportant features, see importance below).
unimportant_cols = ['wind_direction', 'wind_speed', 'sea_level_pressure']
target = 'meter_reading'

def load_data(source='train', path=path):
    """Read the three tables for one data split and join them into a single frame.

    Parameters
    ----------
    source : str
        Either 'train' or 'test'; selects `{source}.csv` and `weather_{source}.csv`.
    path : str
        Directory that holds the CSV files.

    Returns
    -------
    pandas.DataFrame
        The rows of `{source}.csv`, left-joined with the building metadata on
        `building_id` and then with the weather table on `site_id` and `timestamp`.
    """
    assert source in ('train', 'test')

    # Compact dtypes keep the merged frame small.
    building_dtypes = {'building_id': np.uint16, 'site_id': np.uint8}
    weather_dtypes = {
        'site_id': np.uint8,
        'air_temperature': np.float16,
        'cloud_coverage': np.float16,
        'dew_temperature': np.float16,
        'precip_depth_1_hr': np.float16,
    }
    reading_dtypes = {'building_id': np.uint16, 'meter': np.uint8}

    building = pd.read_csv(f'{path}/building_metadata.csv', dtype=building_dtypes)
    weather = pd.read_csv(
        f'{path}/weather_{source}.csv',
        parse_dates=['timestamp'],
        dtype=weather_dtypes,
        # Columns listed in `unimportant_cols` are never read.
        usecols=lambda name: name not in unimportant_cols,
    )
    readings = pd.read_csv(
        f'{path}/{source}.csv',
        dtype=reading_dtypes,
        parse_dates=['timestamp'],
    )

    with_building = readings.merge(building, on='building_id', how='left')
    return with_building.merge(weather, on=['site_id', 'timestamp'], how='left')

In [None]:
%%time
# Build the merged training frame via load_data and preview its first 7 rows.
train = load_data('train')
train.head(7)

In [None]:
# Summary of the merged training frame: column dtypes, non-null counts and memory use.
train.info()

In [None]:
%%time
# Build the merged test frame via load_data and show 7 randomly sampled rows
# (the sample is unseeded, so the displayed rows change between runs).
test = load_data('test')
test.sample(7)

In [None]:
# Summary of the merged test frame: column dtypes, non-null counts and memory use.
test.info()

#  FULL EDA

**Data Wrangling**

In [None]:
# Raw building metadata (default dtypes) for exploration. The file is located
# through the shared `path` variable so the data directory is defined in one place.
buildings = pd.read_csv(f'{path}/building_metadata.csv')
buildings.head()

In [None]:
# Per-column missing-value counts, then the number of fully duplicated rows.
print("NULL COUNT:", buildings.isnull().sum(), sep="\n")
print("DUPLICATES COUNT:", buildings.duplicated().sum(), sep="\n")

In [None]:
import plotly.express as px

# Interactive box plot: distribution of floor area for each building usage category.
fig = px.box(
    buildings,
    x="primary_use",
    y="square_feet",
)
fig.show()

# Meter readings vs. air temperature (training set)

In [None]:
# One wide scatter plot per meter type: meter reading (x) against air temperature (y).
meter_arr = train["meter"].unique()
for meter in meter_arr:
    mask = train["meter"] == meter
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.scatterplot(
        data=train[mask],
        x="meter_reading",
        y="air_temperature",
        ax=ax,
    )
    # Replace the default x label with the meter id so the figures can be told apart.
    ax.set_xlabel("Meter: {}".format(meter))
    plt.show()

**Handling outliers** (extreme readings are replaced by the mean reading of the same meter)

In [None]:
# Meter 0: readings above 40000 are treated as outliers and overwritten with
# the mean reading of meter 0 (the mean is taken before the overwrite).
mask1 = train["meter"] == 0
mask2 = train["meter_reading"] > 40000
mask = mask1 & mask2
print(train.shape)
# A single .loc call writes into `train` itself. The chained form
# train[mask]["meter_reading"] = ... assigns to a temporary copy and leaves
# `train` untouched.
train.loc[mask, "meter_reading"] = train.loc[mask1, "meter_reading"].mean()
print("replaced rows:", int(mask.sum()))
# Rows are overwritten, not dropped, so the shape is unchanged.
print(train.shape)

In [None]:
# Meter 3: readings above 140000 are treated as outliers and overwritten with
# the mean reading of meter 3 (the mean is taken before the overwrite).
mask1 = train["meter"] == 3
mask2 = train["meter_reading"] > 140000
mask = mask1 & mask2
print(train.shape)
# A single .loc call writes into `train` itself. The chained form
# train[mask]["meter_reading"] = ... assigns to a temporary copy and leaves
# `train` untouched.
train.loc[mask, "meter_reading"] = train.loc[mask1, "meter_reading"].mean()
print("replaced rows:", int(mask.sum()))
# Rows are overwritten, not dropped, so the shape is unchanged.
print(train.shape)

Meters 1 and 2 in the training set show no visible outliers, so their readings are left unchanged.

In [None]:
# Correlation of every numeric column with the meter type. At this point `train`
# still contains the text column `primary_use` and the datetime `timestamp`,
# which recent pandas versions refuse to correlate, so restrict to numeric columns.
train.select_dtypes(include="number").corr()["meter"]

In [None]:
# Summary of the test frame again (dtypes, non-null counts, memory use) for reference in this section.
test.info()

**Exploring the test file**

In [None]:
# The goal is to predict the meter_reading column accurately.
# Not much exploration of the test file is needed at this stage;
# it should be explored after the predictions have been made.

In [None]:
# Turn the building usage category into integer codes; `le` keeps the fitted classes.
le = LabelEncoder()
le.fit(train["primary_use"])
train["primary_use"] = le.transform(train["primary_use"])

In [None]:
# Expand the timestamp into small unsigned calendar features, then drop the raw column.
for part in ('hour', 'day', 'weekday', 'month'):
    train[part] = getattr(train['timestamp'].dt, part).astype(np.uint8)
# Store the year as an offset from 2000 so it fits into a uint8.
train['year'] = (train['timestamp'].dt.year - 2000).astype(np.uint8)
train = train.drop(columns="timestamp")

In [None]:
# Fill gaps with a sentinel just below each column's observed minimum, so that
# "missing" stays distinguishable from every real value.
# `emptylist` records which columns needed filling.
emptylist = []
for col in train.columns:
    if np.isfinite(train[col]).all():
        continue  # column is complete, nothing to fill
    min1 = train[col].min()
    emptylist.append(col)
    # Assign the result back to the frame: calling fillna(..., inplace=True) on
    # the `train[col]` selection may act on a temporary object and is a no-op
    # when pandas Copy-on-Write is active.
    train[col] = train[col].fillna(min1 - 1)

In [None]:
fig, ax = plt.subplots(figsize=(16,8))

# Spearman correlation on a 10,000-row sample, with `year` left out of the matrix.
# The sample is seeded so the heatmap is identical on every run, and the rows
# are drawn before the column is dropped so only the small sample is copied.
corr = (
    train.sample(10000, random_state=42)
    .drop(columns='year', errors='ignore')
    .corr(method='spearman')
)
# Draw on the axes created above instead of relying on the implicit current axes.
_ = sns.heatmap(corr, annot=True, xticklabels=corr.columns.values, yticklabels=corr.columns.values, ax=ax)

In [None]:
# The models are trained on log(1 + reading); the raw reading is then removed
# from the feature frame.
target = np.log1p(train["meter_reading"])
train = train.drop(columns="meter_reading")

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split of the full data.
xtrain, xval, ytrain, yval = train_test_split(
    train,
    target,
    test_size=0.3,
    random_state=42,
)

# Random-forest data, step 1: keep only 0.5% of the rows (test_size=0.995
# sends the remainder to the "test" side, which is overwritten in step 2).
RFRxtrain, RFRxtest, RFRytrain, RFRytest = train_test_split(
    train,
    target,
    test_size=0.995,
    random_state=42,
)
# Step 2: split that small subsample 70/30 into the final RFR train and test sets.
RFRxtrain, RFRxtest, RFRytrain, RFRytest = train_test_split(
    RFRxtrain,
    RFRytrain,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn import metrics
from sklearn.linear_model  import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,  mean_absolute_error
from sklearn.model_selection import cross_validate, KFold, RandomizedSearchCV, GridSearchCV, train_test_split


In [None]:
# Depth-limited regression tree. The seed is fixed because scikit-learn permutes
# the features at every split, so equally good splits can otherwise be resolved
# differently from run to run.
model = DecisionTreeRegressor(max_depth=30, random_state=42)
model.fit(xtrain, ytrain)
# NOTE: this fits on the whole 70% training split and is slow; the run time was not recorded.

In [None]:
# Decision-tree predictions for the hold-out split; values are on the log1p scale of the target.
ypred_val = model.predict(xval)

In [None]:
# Decision tree: predicted against actual target for the first 10,000 hold-out rows.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(yval[:10000], ypred_val[:10000], s=30)
ax.set_title('Predicted vs. Actual meter reading')
ax.set_xlabel('Actual meter reading')
ax.set_ylabel('Predicted meter reading')

In [None]:
from sklearn.pipeline import make_pipeline

# Stochastic gradient descent is sensitive to feature scale, and the raw
# features span very different ranges. Standardising inside a pipeline fixes
# that and guarantees the same scaling is applied again at predict time.
SGDreg = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same 1-D array.
SGDreg.fit(xtrain, ytrain.to_numpy())
y_pred = SGDreg.predict(xval)

In [None]:
# SGD regressor: predicted against actual target for the first 10,000 hold-out rows.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(yval[:10000], y_pred[:10000], s=30)
ax.set_title('Predicted vs. Actual meter reading')
ax.set_xlabel('Actual meter reading')
ax.set_ylabel('Predicted meter reading')

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that spans their values.

    The frame is modified in place and the same object is returned. Only columns
    whose dtype appears in ``numerics`` are touched; int8, unsigned, object,
    datetime and other dtypes are left as they are.

    Integer columns are cast to the first of int8/int16/int32/int64 whose range
    contains the column minimum and maximum (limits inclusive, so a column
    holding exactly -128..127 becomes int8). Float columns are cast to float16
    or float32 on the same range test and to float64 otherwise; float64 is also
    the result when min/max are NaN or infinite.

    NOTE: the float test checks the value range only, not precision, so a
    downcast to float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): leave as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast both random-forest feature frames; reduce_memory_usage changes them in
# place, so the return values are not stored (the second one is shown as the cell output).
reduce_memory_usage(RFRxtrain)
reduce_memory_usage(RFRxtest)

In [None]:
# Fit a 100-tree random forest on the small RFR training subsample; random_state fixes the seed.
RFRreg = RandomForestRegressor(n_estimators = 100, random_state = 0)
RFRreg.fit(RFRxtrain, RFRytrain)


In [None]:
# Random-forest predictions for the RFR test split; values are on the log1p scale of the target.
RFR_ypred = RFRreg.predict(RFRxtest)

In [None]:
# Random forest: predicted against actual target for up to the first 10,000 test rows.
fig, ax = plt.subplots()
ax.scatter(RFRytest[:10000], RFR_ypred[:10000], s=30)
ax.set_title('Predicted vs. Actual meter reading')
ax.set_xlabel('Actual meter reading')
ax.set_ylabel('Predicted meter reading')