# Persiapan Data

In [None]:
import numpy as np #linear algebra 
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

#input data files are available in the read-only "../input/" directory
#for example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the input directory. The walk uses the same "../input"
# root as the read_csv calls below; the previous relative path 'kaggle/input'
# does not match it, and os.walk prints nothing for a missing directory.
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the five competition tables into module-level DataFrames.
test_data = pd.read_csv("../input/ashrae-energy-prediction/test.csv")
train_data = pd.read_csv("../input/ashrae-energy-prediction/train.csv")
building = pd.read_csv("../input/ashrae-energy-prediction/building_metadata.csv")
weathertest = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")
weathertrain = pd.read_csv("../input/ashrae-energy-prediction/weather_train.csv")
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/seaborn deprecation warnings.
warnings.filterwarnings('ignore')

In [None]:
#reduce the memory footprint of a DataFrame, adapted from: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, df_name):
    """Downcast the columns of ``df`` in place and print the memory saving.

    Column handling:
      * object / pandas string columns are converted to ``category``;
      * signed and unsigned NumPy integer columns are cast to the smallest
        integer dtype of the same signedness whose range contains the
        column's min and max (bounds inclusive);
      * NumPy float columns are cast to the smallest of float16/32/64 whose
        finite range contains the column's min and max. This can lose
        precision, exactly as the original implementation did;
      * every other dtype (bool, datetime, category, nullable extension
        dtypes, ...) is left untouched, so the function can safely be called
        again on a frame it has already processed.

    Columns whose min/max cannot be compared (all-NaN, empty, or containing
    infinities) keep their current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    df_name : str
        Label used in the printed report.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32, np.float64)

    def smallest_fitting(candidates, limits_of, low, high):
        # Candidates are ordered from narrowest to widest; NaN bounds never
        # satisfy the comparisons, so such columns yield None.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= low and high <= limits.max:
                return candidate
        return None

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/unsigned/float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            target = smallest_fitting(floats, np.finfo, c_min, c_max)
        elif col_type.kind == 'u':
            target = smallest_fitting(unsigned_ints, np.iinfo, c_min, c_max)
        else:
            target = smallest_fitting(signed_ints, np.iinfo, c_min, c_max)

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    # Guard the percentage against a zero-sized starting footprint.
    reduction = (100 * (start_mem - end_mem) / start_mem) if start_mem else 0.0

    print('Memory usage of {} is reduced by {:.2f} %. Usage dropped from {:.2f} MB to {:.2f} MB.'.format(df_name, reduction, start_mem, end_mem))

    return df

In [None]:
# Downcast every loaded table; one memory report is printed per table, in
# the order train, test, building, weather train, weather test.
train_data, test_data = (
    reduce_mem_usage(train_data, 'Train Data'),
    reduce_mem_usage(test_data, 'Test Data'),
)

building = reduce_mem_usage(building, 'Building Data')

weathertrain, weathertest = (
    reduce_mem_usage(weathertrain, 'Weather Train Data'),
    reduce_mem_usage(weathertest, 'Weather Test Data'),
)

In [None]:
#join building_metadata with test, then attach weather_test by site and timestamp
test = (
    building
    .merge(test_data, how='left', on='building_id')
    .merge(weathertest, how='left', on=['site_id', 'timestamp'])
)
test.head()

In [None]:
#join building_metadata with train, then attach weather_train by site and timestamp
train = (
    building
    .merge(train_data, how='left', on='building_id')
    .merge(weathertrain, how='left', on=['site_id', 'timestamp'])
)
train.head()

# Preprocessing Pada Tabel Train

In [None]:
#change the dtype of the following columns: identifiers become categories, timestamp becomes datetime
for id_column in ('site_id', 'building_id', 'meter'):
    train[id_column] = train[id_column].astype('category')
train['timestamp'] = pd.to_datetime(train['timestamp'], format='%Y-%m-%d %H:%M:%S')

In [None]:
#preview the first rows of the merged training table
train.head()

In [None]:
#inspect the column info and summary statistics of the merged table
train.info()
train.describe(include="all")

Pada kolom precip_depth_1_hr, terlihat bahwa nilai minimumnya adalah -1, padahal variabel precip_depth_1_hr merupakan ukuran panjang yang tidak mungkin bernilai < 0. Nilai -1 akan diubah menjadi 0 dengan asumsi bahwa terjadi kesalahan input.

In [None]:
# Map the -1 entries of precip_depth_1_hr to 0, then summarise the column.
train['precip_depth_1_hr'] = train['precip_depth_1_hr'].replace({-1: 0})
train['precip_depth_1_hr'].describe()

## Mencari Outlier

Karena 'meter_reading' merupakan variabel target, kita akan memeriksa apakah 'meter_reading' memiliki outlier atau tidak.

In [None]:
# Distribution of the target variable: density histogram with a KDE overlay.
# sns.distplot is deprecated in favour of histplot/displot. bins=50 keeps
# distplot's upper bound on the bin count, because histplot's automatic
# binning can produce an enormous number of bins on heavily skewed data.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(train['meter_reading'], bins=50, stat='density', kde=True, color='blue', ax=ax)
ax.set_title("Tabel Distribusi meter_reading", fontsize=16)
ax.set_xlabel('meter_reading', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
plt.show()

# Box plot of the same column, drawn on a fresh implicit figure.
sns.boxplot(x=train['meter_reading'])

Data akan ditransformasi dengan transformasi logaritma, log(1 + x), untuk mengurangi efek outlier.

In [None]:
# Replace meter_reading with log(1 + meter_reading).
# NOTE: this overwrites the column, so the cell is not idempotent. Re-running
# it without rebuilding `train` applies the transform a second time.
train['meter_reading'] = np.log1p(train['meter_reading'])

# Distribution after the transform. sns.distplot is deprecated in favour of
# histplot/displot; bins=50 keeps distplot's upper bound on the bin count.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(train['meter_reading'], bins=50, stat='density', kde=True, color='blue', ax=ax)
ax.set_title("Tabel Distribusi meter_reading", fontsize=16)
ax.set_xlabel('meter_reading', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
plt.show()

# Box plot of the transformed column, drawn on a fresh implicit figure.
sns.boxplot(x=train['meter_reading'])

Setelah transformasi, efek outlier berkurang.

## Missing Values

In [None]:
train.isnull().sum() #number of missing values in each column

In [None]:
#percentage of missing values relative to the number of rows
missing_pct = train.isnull().sum() * 100 / len(train)

# Keep only columns that have missing values, largest share first.
missing_values = (
    missing_pct[missing_pct != 0]
    .sort_values(ascending=False)
    .to_frame(name="Missing Values")
)

missing_values

In [None]:
# Horizontal bar chart: one bar per column with missing values.
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=missing_values["Missing Values"], y=missing_values.index, ax=ax)
ax.set_title("Persentase Missing Values", fontsize=16)
ax.set_xlabel("Persen(%)", fontsize=12)
ax.set_ylabel("Variabel", fontsize=12)
plt.show()

Dari tabel dan visualisasi tersebut, dapat dilihat bahwa kolom 'floor_count' dan 'year_built' memiliki persentase missing values > 50%. Kedua variabel tersebut juga dianggap kurang penting, sehingga akan dihapus.

In [None]:
#drop the columns whose share of missing values is > 50%
# errors='ignore' keeps the cell re-runnable: the previous in-place drops
# raised KeyError on a second run because the columns were already gone.
train = train.drop(columns=['floor_count', 'year_built'], errors='ignore')

In [None]:
#histograms to inspect the distribution of each variable
train.hist(figsize=(24, 15), bins=50)
plt.gcf().suptitle('Histogram', fontsize=16)
plt.show()

Variabel dengan distribusi yang cukup skewed akan diimputasi dengan median, sedangkan variabel dengan distribusi yang tidak skewed/mendekati normal akan diimputasi dengan mean.

Karena semua variabel cukup skewed, semuanya akan diimputasi dengan median.

In [None]:
#median imputation
# Assign the filled column back to the frame. The previous form,
# train[col].fillna(..., inplace=True), is chained in-place assignment: under
# pandas Copy-on-Write it modifies a temporary Series and leaves `train`
# unchanged.
median_imputed_columns = [
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'wind_direction',
    'wind_speed',
    'air_temperature',
    'sea_level_pressure',
]
for column in median_imputed_columns:
    train[column] = train[column].fillna(train[column].median())

In [None]:
#data after preprocessing
train 

In [None]:
#column info of the table after preprocessing
train.info()

In [None]:
#number of missing values per column after preprocessing
train.isnull().sum()

# Analisis Data

In [None]:
#check summary statistics after preprocessing
train.describe(include="all")

In [None]:
# Per-column histograms of the preprocessed table.
train.hist(figsize=(24, 15), bins=50)
plt.gcf().suptitle('Histogram Setelah Preprocessing', fontsize=16)
plt.show()

In [None]:
# The title and y-label describe the number of buildings per site. `train`
# has one row per meter reading, so counting its rows would plot readings,
# not buildings. Reduce to one row per (site, building) pair first.
site_buildings = train[['site_id', 'building_id']].drop_duplicates()

fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='site_id', data=site_buildings, ax=ax)
ax.set_title("Jumlah bangunan per site id", fontsize=12)
ax.set_xlabel('Site id')
ax.set_ylabel('Jumlah Bangunan')
plt.show()

In [None]:
# Row count of `train` for each primary_use category.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='primary_use', data=train, ax=ax)
ax.set_title("Frekuensi kategori primary_use", fontsize=12)
# Category names are long, so the tick labels are drawn vertically.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Primary_use')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Mean meter_reading for each building_id, as a two-column frame.
building_groups = train.groupby(['building_id'])['meter_reading'].mean().reset_index()

fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='building_id', y='meter_reading', data=building_groups, ax=ax)
ax.set_title("Rata-Rata Meter Reading berdasarkan building id", fontsize=16)
ax.set_xlabel('Building id', fontsize=12)
ax.set_ylabel('Rata-Rata Meter Reading', fontsize=12)
plt.show()

In [None]:
# Mean meter_reading for each timestamp. The result is stored under the
# existing `building_groups` name even though it is grouped by timestamp.
building_groups = train.groupby(['timestamp'])['meter_reading'].mean().reset_index()

fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='timestamp', y='meter_reading', data=building_groups, ax=ax)
ax.set_title("Rata-Rata Meter Reading berdasarkan timestamp", fontsize=16)
ax.set_xlabel('Time Stamp', fontsize=12)
ax.set_ylabel('Rata-Rata Meter Reading', fontsize=12)
plt.show()

In [None]:
#correlation between variables
# Restrict to numeric columns explicitly. Since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns by default and raises on the
# category/datetime columns of `train`. select_dtypes also works on older
# pandas versions that lack the numeric_only argument.
correlation_matrix = train.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(correlation_matrix, annot=True, vmin=-1, vmax=1, cmap="Blues", ax=ax)
ax.set_title('Heatmap Korelasi')
plt.show()

In [None]:
#write the preprocessed training table to CSV without the index column
train.to_csv('EDA_Kelompok_F.csv',index=False)