In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import datetime
import gc
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error


import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb

%matplotlib inline
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso,LogisticRegression)
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Font sizes (in points) shared by every figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 10, 12, 16

# (rc group, settings) pairs: default text, axes title and x/y labels,
# tick labels, legend, and the figure-level title.
for rc_group, rc_settings in (
    ('font', {'size': SMALL_SIZE}),
    ('axes', {'titlesize': BIGGER_SIZE, 'labelsize': MEDIUM_SIZE}),
    ('xtick', {'labelsize': SMALL_SIZE}),
    ('ytick', {'labelsize': SMALL_SIZE}),
    ('legend', {'fontsize': SMALL_SIZE}),
    ('figure', {'titlesize': BIGGER_SIZE}),
):
    plt.rc(rc_group, **rc_settings)
del rc_group, rc_settings

**Note:** the following function is based on https://www.kaggle.com/gemartin/load-data-reduce-memory-usage?fbclid=IwAR2PdhpX6JywVbJ84gZTngvXORMhP0t2hMXJZlrzktANha4Xf2YDPQqI538

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned. Only plain NumPy
    signed-integer, unsigned-integer and float columns are touched; every
    other column (object/string, bool, datetime, categorical, pandas
    extension dtypes) is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose values fit
        the float16 range are stored as float16. float16 keeps only about
        three significant decimal digits, so pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy dtype kind rather than on the dtype's name:
        # a name test such as str(dtype)[:3] == 'int' misclassifies uint,
        # bool and datetime columns as floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing narrower fits; an all-NaN
            # column ends up here because comparisons with NaN are False.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # Integer columns keep their signedness. The bounds are inclusive
            # so that e.g. a maximum of exactly 127 still fits int8.
            target = None
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        # target stays None for an integer column without values (min/max are
        # NaN); such a column is left unchanged.
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        # Skipped for a frame that reports zero bytes, where the ratio is undefined.
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and downcast its numeric columns.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied to it.
    """
    # parse_dates=True only asks pandas to try parsing the *index* as dates; no
    # index column is selected here, so date-like columns are not converted by
    # this call. The former keep_date_col=True argument was dropped: it only
    # matters when several columns are combined into one date column, and the
    # keyword is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [None]:

def missing_statistics(df):
    """Summarise the missing values of ``df``, one output row per column.

    The returned frame has the columns 'COLUMN NAME', 'MISSING VALUES'
    (NaN count), 'TOTAL ROWS' (row count of ``df``) and '% MISSING'
    (share of NaNs in percent, rounded to two decimals).
    """
    null_counts = df.isnull().sum()
    summary = null_counts.to_frame().reset_index()
    summary.columns = ['COLUMN NAME', "MISSING VALUES"]
    summary['TOTAL ROWS'] = df.shape[0]
    missing_ratio = summary['MISSING VALUES'] / summary['TOTAL ROWS']
    summary['% MISSING'] = (missing_ratio * 100).round(2)
    return summary

# Data Importation


In [None]:
def _load_table(label, path):
    """Print a separator line and ``label``, then load ``path`` with import_data."""
    print('-' * 80)
    print(label)
    return import_data(path)


# The five input tables, loaded in the same order as before.
building = _load_table('building', '/kaggle/input/ashrae-energy-prediction/building_metadata.csv')

train = _load_table('train', '/kaggle/input/ashrae-energy-prediction/train.csv')

test = _load_table('test', '/kaggle/input/ashrae-energy-prediction/test.csv')

weather_train = _load_table('weather train', '/kaggle/input/ashrae-energy-prediction/weather_train.csv')

weather_test = _load_table('weather test', '/kaggle/input/ashrae-energy-prediction/weather_test.csv')


In [None]:
building.describe().transpose()

In [None]:
# check missing values in building data
missing_statistics(building)

In [None]:
boxplot1 = building.boxplot(column=['floor_count'], by=['site_id'],figsize= (15,7))


In [None]:
boxplot = building.boxplot(column=['year_built'], by=['site_id'],figsize= (10,7))

In [None]:
# Drop 'year_built' and 'floor_count' from the building metadata.
# NOTE(review): the stated reason is that more than 50% of their values are NaN —
# confirm against the missing_statistics(building) output.
building=building.drop(columns=['year_built','floor_count']);

In [None]:
del boxplot,boxplot1
gc.collect()

# Join tables 


In [None]:
# Build the training frame: meter readings, left-joined with the building
# metadata (on building_id) and then with the site weather (on site_id and
# timestamp).
data_train = (
    train
    .merge(building, on='building_id', how='left')
    .merge(weather_train, on=['site_id','timestamp'], how='left')
)
# The joins are done first; the timestamp column is parsed to datetime afterwards.
data_train['timestamp'] = pd.to_datetime(data_train.timestamp, format='%Y-%m-%d %H:%M:%S')

# The source tables are no longer needed once merged.
del weather_train,train
data_train.head(10)

In [None]:
# Merge dataset to create test dataframe: test rows, left-joined with the
# building metadata (on building_id) and the site weather (on site_id and timestamp).
test_buil = test.merge(building, on='building_id', how='left')
data_test = test_buil.merge(weather_test, on=['site_id','timestamp'], how='left')
# Parse the timestamp after the joins, exactly as the training cell does, so
# both frames carry the same datetime dtype instead of train being datetime
# and test being plain strings.
data_test['timestamp'] = pd.to_datetime(data_test.timestamp, format='%Y-%m-%d %H:%M:%S')

# NOTE(review): `test` itself is kept alive here, unlike `train` in the
# training cell — delete it as well if nothing else needs it.
del weather_test,test_buil

data_test.head(10)

## Dataframes description

In [None]:
data_train.info()

In [None]:
# Column dtypes and memory footprint of the test frame (the previous cell
# already shows the same summary for the training frame).
data_test.info()

In [None]:
# Per-column NaN counts and percentages for the merged training frame.
missing_statistics(data_train) 
# Author's observation on this output: the cloud_coverage column contains a lot of missing values.

In [None]:
# Per-column NaN counts and percentages for the merged test frame.
missing_statistics(data_test) 
# Author's observation on this output: the cloud_coverage column contains a lot of missing values.

In [None]:
O, C = data_test.shape
print(f'Dans test, il y a {O} observations et {C} colonnes.')

In [None]:
# O = number of rows, C = number of columns of the training frame at this point.
# Both names are read again by later cells (row-deletion count, column listing).
O, C = data_train.shape
print(f'Dans train, il y a {O} observations et {C} colonnes.') 

### Deleting cloud_coverage column 

In [None]:
data_train=data_train.drop(columns=['cloud_coverage']);
data_test=data_test.drop(columns=['cloud_coverage']);

# Rows processing


In [None]:
# Number of NaNs in each row of the training frame, kept as a Series so the
# comparisons below run vectorised instead of looping over a Python list.
liste = data_train.isnull().sum(axis=1)
n_rows = data_train.shape[0]

# For n = 0..7: percentage of rows with exactly n NaNs, and with more than n NaNs.
nan_levels = range(0, 8)
RowsEQUAL = [round((liste == n).sum() / n_rows * 100, 3) for n in nan_levels]
RowsANDHIGH = [round((liste > n).sum() / n_rows * 100, 3) for n in nan_levels]

Rows_Statistics = pd.DataFrame(RowsEQUAL, columns=["% Rows with NANs (=)"])
Rows_Statistics[" % Rows with NANs (>)"] = RowsANDHIGH
Rows_Statistics.index.name = 'Number of NANs'
print(Rows_Statistics);

# Release the temporaries, including the per-row NaN counts (one value per row).
del RowsEQUAL,RowsANDHIGH
del Rows_Statistics
del liste, n_rows, nan_levels

In [None]:
# Keep only the rows with at least 8 non-NaN values (dropna's `thresh` counts non-NaN cells per row).
# NOTE(review): the author equates this with removing rows that have 6 or more NaNs,
# which presumes the frame has 13 columns at this point — confirm.
data_train = data_train.dropna(thresh = 8) 

In [None]:
# Shape of the training frame after the dropna step; the message now names
# the frame that is actually measured (train, not test).
o, c = data_train.shape
print(f'Dans train apres suppression , il y a {o} observations et {c} colonnes.') # author's note: 20216100 (presumably a row count — verify)

In [None]:
# Number of rows removed by the dropna step: o is the row count measured after it;
# O is assumed to still hold the training row count captured before it (cells run in order).
print("Nombre de lignes supprimées : " + str(O-o))

In [None]:
# Column names of the cleaned training frame. The count is taken from the frame
# itself: C was captured before the cloud_coverage column was dropped, so it no
# longer matches the number of columns listed.
features = data_train.columns
print(f'Les {len(features)} colonnes sont: ', list(features))
 

In [None]:
# Distinct timestamps in the training data. `unique` has to be called; without
# the parentheses the cell only shows the bound method's repr.
data_train.timestamp.unique()  # author's note: train data is for 1 year


In [None]:
# Distinct timestamps in the test data. `unique` has to be called; without
# the parentheses the cell only shows the bound method's repr.
data_test.timestamp.unique()  # author's note: test data for 2 years

In [None]:
# Distinct clock times (as 'HH:MM:SS' strings) that occur in the training timestamps.
pd.to_datetime(data_train.timestamp, format='%Y-%m-%d %H:%M:%S').dt.time.astype(str).unique()
# Author's conclusion from this output: there is a meter reading for each hour.

# Weather boxplots


In [None]:
# Calendar features derived from the timestamp. The helper "datetime" column
# is kept because a later cell drops it by name.
data_train["datetime"] = pd.to_datetime(data_train["timestamp"])
data_train["hour"] = data_train["datetime"].dt.hour
data_train["day"] = data_train["datetime"].dt.day
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0. Use the ISO
# calendar week where available and fall back to the old accessor otherwise.
# The cast restores the plain int64 dtype of the old accessor (assumes no
# missing timestamps).
if hasattr(data_train["datetime"].dt, "isocalendar"):
    data_train["week"] = data_train["datetime"].dt.isocalendar().week.astype("int64")
else:
    data_train["week"] = data_train["datetime"].dt.week
data_train["month"] = data_train["datetime"].dt.month


In [None]:
# One boxplot per weather variable, each split by site_id, stacked vertically.
weather_columns = ['air_temperature','dew_temperature','wind_direction','wind_speed','precip_depth_1_hr', 'sea_level_pressure']
fig, axes = plt.subplots(len(weather_columns), 1, figsize=(13,20))

for panel, col in zip(axes.flatten(), weather_columns):
    plot = data_train.boxplot(col, by="site_id", ax=panel)

plt.tight_layout();


We notice that some sites have no data at all for 'precip_depth_1_hr' and 'sea_level_pressure'. We also notice a large number of outliers, especially in 'precip_depth_1_hr'.

# Filling in missing data using the mean
### in terms of 'site_id', 'hour' and 'month'

In [None]:
# Index the frame by (site_id, hour, month): the following cells compute per-group
# means over these keys and pass them to DataFrame.update, which aligns on the index.
data_train = data_train.set_index(['site_id','hour','month'])

In [None]:
# Fill missing air temperature with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN.
air_temperature_filler = (
    data_train
    .groupby(['site_id','hour','month'])['air_temperature']
    .mean()
    .to_frame()
)
data_train.update(air_temperature_filler, overwrite=False)
del air_temperature_filler

In [None]:
# Fill missing dew temperature with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN.
dew_temperature_filler = (
    data_train
    .groupby(['site_id','hour','month'])['dew_temperature']
    .mean()
    .to_frame()
)
data_train.update(dew_temperature_filler, overwrite=False)
del dew_temperature_filler


In [None]:
# Fill missing sea level pressure with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN;
# a group whose values are all NaN has a NaN mean and therefore stays unfilled.
sea_level_filler = (
    data_train
    .groupby(['site_id','hour','month'])['sea_level_pressure']
    .mean()
    .to_frame()
)
data_train.update(sea_level_filler, overwrite=False)
del sea_level_filler

In [None]:
gc.collect()

In [None]:
# Fill missing wind direction with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN.
# NOTE(review): this is a plain arithmetic mean; if wind_direction is an angle
# in degrees, values near 0 and 360 average to the opposite direction — confirm
# this is acceptable.
wind_direction_filler = (
    data_train
    .groupby(['site_id','hour','month'])['wind_direction']
    .mean()
    .to_frame()
)
data_train.update(wind_direction_filler, overwrite=False)
del wind_direction_filler

In [None]:
# Fill missing wind speed with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN.
wind_speed_filler = (
    data_train
    .groupby(['site_id','hour','month'])['wind_speed']
    .mean()
    .to_frame()
)
data_train.update(wind_speed_filler, overwrite=False)
del wind_speed_filler
gc.collect()

In [None]:
# Fill missing hourly precipitation depth with the mean of its (site_id, hour, month) group.
# With overwrite=False, update() only writes into cells that are currently NaN;
# a group whose values are all NaN has a NaN mean and therefore stays unfilled.
precip_depth_filler = (
    data_train
    .groupby(['site_id','hour','month'])['precip_depth_1_hr']
    .mean()
    .to_frame()
)
data_train.update(precip_depth_filler, overwrite=False)
del precip_depth_filler


In [None]:
gc.collect()

In [None]:
data_train.reset_index(inplace=True)

In [None]:
missing_statistics(data_train) 

In [None]:
gc.collect()

# Consumption measures

In [None]:
# Distinct meter codes present in the training data.
data_train['meter'].unique()
# Code meanings noted by the author: {0: 'electricity', 1: 'chilledwater', 2: 'steam', 3: 'hotwater'}

In [None]:
# Row counts of the individual meter codes (value_counts), reduced to the distinct count values.
data_train['meter'].value_counts().unique() # author's note: observations per meter code [0,1,2,3]

In [None]:
data_train[data_train['meter']== 3]["meter"].value_counts()

In [None]:
# Relabel the numeric meter codes with readable names for the count plot.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `data_train['meter']` acts on a temporary Series and is not guaranteed to
# reach the frame (it does not under pandas Copy-on-Write).
data_train['meter'] = data_train['meter'].replace({0:"electricité",1:"eau froide",2:"vapeur",3:"eau chaude"})
sns.countplot(x= "meter",data = data_train,palette ="Set2");

# Test data description 

In [None]:
data_test.describe().transpose() #observations

In [None]:
missing_statistics(data_test)


In [None]:
    # Bar chart of the share of missing values (in % of rows) for every test
    # column that has at least one NaN.
    # NOTE(review): every line of this cell is indented by four spaces; this
    # presumably relies on the notebook front end stripping the common indent
    # and would be an IndentationError in a plain .py script.
    nullseries = data_test.isnull().sum()
    onlynull= nullseries[nullseries > 0]
    plt.figure(figsize=(10,5));
    sns.barplot(x=onlynull.index , y=onlynull*100/len(data_test));
    plt.ylabel("PERCENTAGE NAN DATA in Test")
    plt.xlabel("COLUMN NAME");
    plt.xticks(rotation=90);

# Visualisations

In [None]:
fig, ax = plt.subplots(1, figsize=(15, 10))
lab1 = ['train data']
lab2 = ['test data']

# One unit-wide bin per integer site_id, shared by both histograms. A fixed
# bins=15 merges the two highest ids into the last bin as soon as there are 16
# distinct sites, because the last bin is closed on both ends.
site_lo = int(min(data_train.site_id.min(), data_test.site_id.min()))
site_hi = int(max(data_train.site_id.max(), data_test.site_id.max()))
site_bins = np.arange(site_lo, site_hi + 2)

n, bins, patches = ax.hist(data_test.site_id, bins=site_bins, color='r', edgecolor='white', alpha=0.6, label=lab2)
n1, bins1, patches1 = ax.hist(data_train.site_id, bins=site_bins, color='b', edgecolor='white', alpha=0.5, label=lab1)

# Annotate every bar of both histograms with its count, centred on the bar.
# NOTE(review): each count is the number of rows (meter readings) with that
# site_id, not the number of buildings — the axis label and title below say
# "buildings"; confirm the intended wording.
for counts, edges in ((n, bins), (n1, bins1)):
    for count, left_edge in zip(counts, edges[:-1]):
        ax.annotate(int(count),
                    xy=(left_edge + .5, count), xytext=(0, 1),  # 1 point vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=12)

ax.legend(prop ={'size': 10})
ax.set_xlabel('site_id')
ax.set_ylabel('number of buildings')
ax.set_title('Occurrence of buildings on each site', fontsize=16);


In [None]:
# Distinct primary-use categories in the training data (in order of first
# appearance) and how many there are. prim_use_list is reused by the
# time-series cell further down.
prim_use_list = data_train['primary_use'].unique()
len(prim_use_list)

In [None]:
# A notebook cell only renders its last expression, so both tables are passed
# to display() explicitly; otherwise the first one is computed and discarded.

# Number of buildings per (primary_use, site_id), shaded by value.
display(building.groupby(['primary_use','site_id']).size().unstack().fillna(0).astype(int).style.background_gradient(axis=None))
# Total number of buildings per primary use, shaded by value.
display(building.groupby(['primary_use']).size().to_frame('number_buildings').fillna(0).style.background_gradient(axis=None))

In [None]:
# Mean meter_reading per primary use, drawn as a horizontal bar chart.
# NOTE(review): the mean is taken over all rows regardless of the 'meter'
# column — confirm that mixing the meter types is intended.
use = data_train.groupby("primary_use").meter_reading.mean()
sns.barplot(y=use.index,x=use)
del use

In [None]:

# Group by primary use and plot time series profiles: one panel per primary use.
fig, axes = plt.subplots(8, 2, figsize=(20, 35))

# Every primary use goes through the same loop, including the first one. The
# first panel used to be built from the whole training frame without any
# primary_use filter and was left without a title.
for ax, use in zip(axes.flat, prim_use_list):
    prim_use_df = data_train[data_train['primary_use']==use]
    # Daily energy use of each building of this type ...
    prim_use_daily = prim_use_df.groupby(['building_id', prim_use_df['timestamp'].dt.date])['meter_reading'].sum()
    prim_use_daily = prim_use_daily.reset_index()
    # ... averaged over the buildings for every day.
    mean = prim_use_daily.groupby('timestamp')['meter_reading'].mean()

    ax.plot(mean.index, mean)
    ax.set_title(use)

# Invisible full-figure axes that only carry the shared axis labels and title.
fig.add_subplot(111, frameon=False)
# hide tick and tick label of the big axes
plt.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
plt.grid(False)
plt.xlabel('Time')
plt.ylabel('Meter Reading (Daily Sum)', labelpad=20)

plt.title('Time series profiles for different building types', pad=30)
plt.show()

In [None]:
boxplot = data_train.boxplot(column=['air_temperature','dew_temperature','wind_direction','wind_speed']);

In [None]:
del building , boxplot, fig, prim_use_daily 
gc.collect()

In [None]:
# Calendar features derived from the timestamp. The helper "datetime" column
# is kept because the next cell drops it by name.
data_test["datetime"] = pd.to_datetime(data_test["timestamp"])
data_test["hour"] = data_test["datetime"].dt.hour
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0. Use the ISO
# calendar week where available and fall back to the old accessor otherwise.
# The cast restores the plain int64 dtype of the old accessor (assumes no
# missing timestamps).
if hasattr(data_test["datetime"].dt, "isocalendar"):
    data_test["week"] = data_test["datetime"].dt.isocalendar().week.astype("int64")
else:
    data_test["week"] = data_test["datetime"].dt.week
data_test["month"] = data_test["datetime"].dt.month
data_test["day"] = data_test["datetime"].dt.day

In [None]:
# The helper "datetime" column is removed in place from both frames.
for frame in (data_train, data_test):
    frame.drop(["datetime"], axis=1, inplace=True)
del frame

In [None]:
# Map the readable meter names back to their numeric codes. The result is
# assigned back to the column: calling replace(..., inplace=True) on
# `data_train['meter']` acts on a temporary Series and is not guaranteed to
# reach the frame. The cast restores a compact integer dtype instead of
# leaving an object column (the codes are 0-3).
data_train['meter'] = data_train['meter'].replace({"electricité":0,"eau froide":1,"vapeur":2,"eau chaude":3}).astype('int8')


In [None]:
data_test.head()


In [None]:
gc.collect()

# Save processed data in the output 

In [None]:
data_train.to_csv('data_training.csv', index=False)


In [None]:
data_test.to_csv('test_data.csv', index=False)