# EDA for ASHRAE - Great Energy Predictor III Kaggle competition
https://www.kaggle.com/c/ashrae-energy-prediction

## Table of contents: <a class="anchor" id="contents"></a>
* [Data Import](#DataImport)
    * [Import](#Import)
    * [Merge](#Merge)
        
* [EDA](#EDA)
    * [Train](#Train)
        * [Overview](#OverviewTrain)   
        * [Missing](#MissingTrain) 
        * [meter_reading - target column](#meter_reading) 
        * [meter column](#Meter) 
    * [Test](#Test)
        * [Overview](#OverviewTest)   
        * [Missing](#MissingTest) 
    * [Train VS Test](#TrainTestNum)
    * [Weather](#Weather)
        * [Overview](#OverviewWeather)   
        * [Missing](#MissingWeather) 
        * [site_id key column](#site_idWeather) 
        * [air_temperature column](#air_temperature) 
        * [cloud_coverage column](#cloud_coverage) 
        * [dew_temperature column](#dew_temperature) 
        * [precip_depth_1_hr column](#precip_depth_1_hr)
        * [sea_level_pressure column](#sea_level_pressure) 
        * [wind_direction column](#wind_direction) 
        * [wind_speed column](#wind_speed)
    * [Buildings](#Buildings)
        * [Overview](#OverviewBuildings)   
        * [Missing](#MissingBuildings) 
        * [site_id key column](#site_idBuildings) 
        * [primary_use categorical column](#primary_use) 
        * [year_built column](#year_built) 
        * [floor_count column](#floor_count) 

In [None]:
import numpy as np
import pandas as pd

import time
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
def SecondsToStr(time_taken):
    """Format a duration given in seconds as an ``hours:minutes:seconds`` string.

    Hours and minutes are truncated to whole numbers, the remaining seconds
    are rounded to two decimal places. No zero padding is applied.
    """
    whole_hours, remainder = divmod(time_taken, 3600)
    whole_minutes, leftover_seconds = divmod(remainder, 60)

    return '{}:{}:{}'.format(
        math.trunc(whole_hours),
        math.trunc(whole_minutes),
        round(leftover_seconds, 2),
    )

In [None]:
def df_eda(df_, with_stat_=False):
    """Print a quick structural summary of a DataFrame.

    The column names, the shape and the dtype of every column are always
    printed. When ``with_stat_`` is true the output of ``describe()`` is
    printed as well.
    """
    basics = (
        ('columns:', df_.columns.to_list()),
        ('\nshape:', df_.shape),   # (rows, columns)
        ('\ntypes:', df_.dtypes),
    )
    for heading, value in basics:
        print(heading, value, sep='\n')

    if not with_stat_:
        return

    # Descriptive statistics as produced by DataFrame.describe()
    print('\nstat:', df_.describe(), sep='\n')
        
def column_info(df_, col_):
    """Print the column name, a blank line, and the column's summary statistics.

    The statistics are the result of ``describe()`` with the ``count`` entry
    removed, rounded to two decimals.
    """
    print(col_, '', sep='\n')
    summary = df_[col_].describe().drop(['count'])
    print(summary.round(2))
    
def column_visualizatin(df_, col_, target_, koef_, with_target_=False):
    """Plot a normalised histogram of a column, optionally overlaid with the target.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding ``col_`` (and ``target_`` when ``with_target_`` is true).
    col_ : str
        Column whose distribution is drawn as a histogram.
    target_ : str
        Target column; only read when ``with_target_`` is true.
    koef_ : number
        Divisor applied to the per-value mean of ``target_`` so that the line
        fits on the same axis as the density histogram. Only used when
        ``with_target_`` is true.
    with_target_ : bool, default False
        When true, the mean of ``target_`` for every distinct value of
        ``col_`` (divided by ``koef_``) is drawn as a red line.
    """
    fig, ax = plt.subplots()

    # `normed` was removed from Matplotlib's hist(); `density` is its
    # replacement. Missing values are dropped first because a finite bin
    # range cannot be derived from data that contains NaN.
    ax.hist(df_[col_].dropna(), color='g', alpha=0.5, density=True, label=col_)

    if with_target_:
        # Mean target per distinct column value, scaled down by koef_
        df_grouped_ = df_[[col_, target_]].groupby([col_]).mean().reset_index()
        df_grouped_[target_] = df_grouped_[target_].astype('float') / koef_
        ax.plot(df_grouped_[col_], df_grouped_[target_], color='r', label=target_)

    ax.set(title=col_)
    ax.legend(loc='best')
    plt.show()
    
def value_distribution(df_, col_, n_):
    """Show how the values of a column are distributed.

    Prints the percentage share (two decimals) of the first ``n_`` entries
    returned by ``value_counts`` and draws a seaborn count plot of the column
    with the x tick labels rotated by 90 degrees.
    """
    shares_pct = (df_[col_].value_counts(normalize=True) * 100).round(2)
    print(shares_pct.iloc[:n_])

    sns.countplot(data=df_, x=col_)
    plt.xticks(rotation=90)
    
def share_of_missing_per_column(df_, df_name_):
    """Report the percentage of missing values for every incomplete column.

    Prints a short header containing ``df_name_`` and returns a DataFrame
    with the columns ``Column`` and ``Missing_share``. Only columns that have
    at least one missing value are listed; the share is a string such as
    ``'12.5%'`` rounded to two decimals.
    """
    print('', df_name_, 'Share of missing per column:\n', sep='\n')

    rows = []
    for name in df_.columns.to_list():
        column = df_[name]
        n_missing = column.isnull().sum()
        if n_missing <= 0:
            continue
        share = round(100 * n_missing / column.shape[0], 2)
        rows.append([name, '{}%'.format(share)])

    return pd.DataFrame.from_records(rows, columns=['Column', 'Missing_share'])

# Data Import <a class="anchor" id="DataImport"></a>
[Table of contents](#contents)

https://www.kaggle.com/c/ashrae-energy-prediction/data

## Import <a class="anchor" id="Import"></a>
[Table of contents](#contents)


In [None]:
# Load the training-side tables (meter readings, building metadata and
# weather) and print the wall-clock time the three reads took.
start_time = time.time()

train = pd.read_csv('../input/ashrae-energy-prediction/train.csv')
building_metadata = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv')
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv')

print('Total time: {}'.format(SecondsToStr(time.time() - start_time)))

In [None]:
# Load the test-side tables (test rows, test-period weather and the sample
# submission file) and print the wall-clock time the three reads took.
start_time = time.time()

test = pd.read_csv('../input/ashrae-energy-prediction/test.csv')
weather_test = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv')
sample_submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

print('Total time: {}'.format(SecondsToStr(time.time() - start_time)))

## Merge <a class="anchor" id="Merge"></a>
[Table of contents](#contents)

In [None]:
# Merge the datasets and print how long it took
start_time = time.time()

# Temporary merges for EDA: attach the building metadata (by building_id) and
# then the weather readings (by site_id and timestamp) to every train/test
# row. Left joins keep every original row even when no match exists.
train_df = (train.merge(building_metadata, on='building_id', how='left')).merge(weather_train, on=['site_id', 'timestamp'], how='left')
test_df = (test.merge(building_metadata, on='building_id', how='left')).merge(weather_test, on=['site_id', 'timestamp'], how='left')

# Temporary stack of the train and test weather tables for EDA.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same frame with a fresh 0..n-1 index.
weather_df = pd.concat([weather_train, weather_test], ignore_index=True)
weather_columns = weather_df.columns.to_list()

print('Total time: {}'.format(SecondsToStr(time.time() - start_time)))

# EDA <a class="anchor" id="EDA"></a>
[Table of contents](#contents)

In [None]:
# Optional structural dumps of the merged frames (disabled):
# df_eda(train_df, False)
# df_eda(test_df, False)

# Split the merged train columns by dtype: object columns are counted as
# categorical, every other dtype as numerical.
cat_cols, num_cols = [], []

for col in train_df.columns:
    (cat_cols if train_df[col].dtype == object else num_cols).append(col)

print('Numerical columns {}, categorical columns {}'.format(len(num_cols), len(cat_cols)))

## Train <a class="anchor" id="Train"></a>
[Table of contents](#contents)

### Overview <a class="anchor" id="OverviewTrain"></a>
[Table of contents](#contents)

In [None]:
# Size and time span of the raw train table, followed by a two-row preview.
print('Train shape:', train.shape)
# NOTE(review): train.csv is read without date parsing, so min/max here are
# presumably plain string comparisons — confirm the timestamp format sorts
# chronologically.
print('Train TimeBorder:', train['timestamp'].min(), ':', train['timestamp'].max())
print('Train timestamp. Number of missing values:', train['timestamp'].isnull().sum())
train.head(2)

### Missing <a class="anchor" id="MissingTrain"></a>
[Table of contents](#contents)

In [None]:
share_of_missing_per_column(train, 'Train')

### meter_reading target numerical column <a class="anchor" id="meter_reading"></a>
[Table of contents](#contents)

In [None]:
target = 'meter_reading'

In [None]:
# Summary statistics of the target column, then a line plot of the raw
# readings against the row index of the train table.
# Disabled alternative: percentage share of every distinct target value.
# print(round(train[target].value_counts(normalize=True)*100, 2))
column_info(train, target)
train[target].plot()
plt.show()

### Meter column <a class="anchor" id="Meter"></a>
[Table of contents](#contents)

In [None]:
# Check the share of meter
value_distribution(train, 'meter', 4)

## Test <a class="anchor" id="Test"></a>
[Table of contents](#contents)

### Overview <a class="anchor" id="OverviewTest"></a>
[Table of contents](#contents)

In [None]:
# Size and time span of the raw test table, followed by a two-row preview.
print('Test shape:', test.shape)
# NOTE(review): test.csv is read without date parsing, so min/max here are
# presumably plain string comparisons — confirm the timestamp format sorts
# chronologically.
print('Test TimeBorder:', test['timestamp'].min(), ':', test['timestamp'].max())
print('Test timestamp. Number of missing values:', test['timestamp'].isnull().sum())
test.head(2)

### Missing <a class="anchor" id="MissingTest"></a>
[Table of contents](#contents)

In [None]:
share_of_missing_per_column(test, 'Test')

## Train VS Test (numerical statistics per column) <a class="anchor" id="TrainTestNum"></a>
[Table of contents](#contents)

In [None]:
# Compare median, mean and variance of every numerical column between the
# merged train and test frames. The target column is left out of the
# comparison.
data = []

lst = train_df.columns.to_list()
lst.remove(target)

for col in lst:
    if col not in num_cols:
        continue

    # Each median is computed once and reused for the equality flag
    train_median = train_df[col].median()
    test_median = test_df[col].median()

    data.append([
        col,
        train_median,
        test_median,
        bool(train_median == test_median),
        train_df[col].mean(),
        test_df[col].mean(),
        train_df[col].var(),
        test_df[col].var(),
    ])

pd.DataFrame.from_records(
    data,
    columns=[
        'Column',
        'Train_median', 'Test_median',
        'Equal_medians',
        'Train_mean', 'Test_mean',
        'Train_var', 'Test_var',
    ],
)

## Weather <a class="anchor" id="Weather"></a>
[Table of contents](#contents)

### Overview <a class="anchor" id="OverviewWeather"></a>
[Table of contents](#contents)

In [None]:
# Size and time span of the weather table, followed by a two-row preview.
print('Weather shape:', weather_df.shape)
# NOTE(review): min/max presumably compare timestamps as strings, because no
# date parsing is visible for this column — confirm the format sorts
# chronologically.
print('Weather TimeBorder:', weather_df['timestamp'].min(), ':', weather_df['timestamp'].max())
print('Weather timestamp. Number of missing values:', weather_df['timestamp'].isnull().sum())

weather_df.head(2)

In [None]:
weather_df.columns

In [None]:
weather_df.dtypes

In [None]:
weather_df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric weather
# columns. Non-numeric columns are filtered out explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead.
plt.figure(figsize=(8, 8))
sns.heatmap(weather_df.select_dtypes(include='number').corr(), square=True, annot=True)

### Missing <a class="anchor" id="MissingWeather"></a>
[Table of contents](#contents)

In [None]:
share_of_missing_per_column(weather_df, 'Weather')

### site_id key column <a class="anchor" id="site_idWeather"></a>
[Table of contents](#contents)

In [None]:
# Check the share of site_id
value_distribution(weather_df, 'site_id', 3)

### air_temperature numerical column <a class="anchor" id="air_temperature"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'air_temperature', target, 100000, True)

In [None]:
weather_df['air_temperature'].hist(color='salmon', alpha=0.5) 

### cloud_coverage numerical column <a class="anchor" id="cloud_coverage"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'cloud_coverage', target, 100000, True)

### dew_temperature numerical column <a class="anchor" id="dew_temperature"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'dew_temperature', target, 100000, True)

In [None]:
weather_df['dew_temperature'].hist(color='salmon', alpha=0.5) 

### precip_depth_1_hr numerical column <a class="anchor" id="precip_depth_1_hr"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'precip_depth_1_hr', target, 1000000, True)

### sea_level_pressure numerical column <a class="anchor" id="sea_level_pressure"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'sea_level_pressure', target, 100000, True)

### wind_direction numerical column <a class="anchor" id="wind_direction"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'wind_direction', target, 1000000, True)

### wind_speed numerical column <a class="anchor" id="wind_speed"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'wind_speed', target, 100000, True)

## Buildings <a class="anchor" id="Buildings"></a>
[Table of contents](#contents)

### Overview <a class="anchor" id="OverviewBuildings"></a>
[Table of contents](#contents)

In [None]:
print(building_metadata.shape)
building_metadata.head(2)

In [None]:
building_metadata.dtypes

In [None]:
building_metadata.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric building
# metadata columns. Non-numeric columns are filtered out explicitly: since
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises instead.
plt.figure(figsize=(8, 8))
sns.heatmap(building_metadata.select_dtypes(include='number').corr(), square=True, annot=True)

### Missing <a class="anchor" id="MissingBuildings"></a>
[Table of contents](#contents)

In [None]:
share_of_missing_per_column(building_metadata, 'Buildings')

### site_id key column <a class="anchor" id="site_idBuildings"></a>
[Table of contents](#contents)

In [None]:
# Check the share of site_id
value_distribution(building_metadata, 'site_id', 3)

### primary_use categorical column <a class="anchor" id="primary_use"></a>
[Table of contents](#contents)

In [None]:
# Check the share of primary_use
value_distribution(building_metadata, 'primary_use', 10)

### year_built numerical column <a class="anchor" id="year_built"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'year_built', target, 100000, True)

### floor_count numerical column <a class="anchor" id="floor_count"></a>
[Table of contents](#contents)

In [None]:
column_visualizatin(train_df, 'floor_count', target, 10000, True)