## Electricity Cleaned Dataset

**Summary**
* Converted timestamp object to datetime 
    * All other types are float64
* Deleted columns with more than 50% missing values 
* Used interpolation to fill in missing values 
    * According to the dataset's GitHub documentation, electricity usage is approximately linear between readings, so `slinear` (piecewise-linear) interpolation was used
* Filled in remaining missing values with backward propagation fill and then forward propagation fill 


In [None]:
import os
# Print the full path of every file available under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Make the dataset directory the working directory so that later cells can
# load its files by bare filename.
os.chdir('/kaggle/input/buildingdatagenomeproject2')
# List the directory contents to see which files are available.
os.listdir()

In [None]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

In [None]:
#load dataset
# The path is relative, so it is resolved against the working directory set
# by the earlier os.chdir() cell.
electricity = pd.read_csv("electricity_cleaned.csv")

In [None]:
electricity.info()

In [None]:
electricity.head()

In [None]:
electricity.shape

In [None]:
#show types of the values 
electricity.dtypes

In [None]:
#change to DateTime format
# read_csv loads the timestamp column as plain strings (object dtype); parse
# it with an explicit format so non-missing values that do not match
# "%Y-%m-%d %H:%M:%S" raise an error instead of being guessed at.
electricity["timestamp"] = pd.to_datetime(electricity["timestamp"], format = "%Y-%m-%d %H:%M:%S")

In [None]:
#show types of the values 
#check that changing to DateTime format worked
electricity.dtypes

In [None]:
#check for missing values
electricity.isnull().sum()

In [None]:
#to visualize missing values 
msno.matrix(electricity)

In [None]:
#function shows the percentage of missing values and type of the values
def missing_data(data):
    """Summarise the missing-value share and dtype of every column of *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A frame with two rows and one column per column of ``data``:
        ``'Percentage_of_Missing_Values'`` holds the share of null entries
        as a fraction between 0.0 and 1.0 (not 0-100), and ``'Data Type'``
        holds the dtype name as a string.
    """
    # The mean of the boolean null mask equals null count / row count.
    missing_fraction = data.isnull().mean()
    summary = missing_fraction.to_frame('Percentage_of_Missing_Values')

    # Iterate data.dtypes rather than data[col].dtype: the values are assigned
    # positionally and this keeps working when column labels are duplicated.
    summary['Data Type'] = [str(dtype) for dtype in data.dtypes]

    # Transposed so the original columns stay as columns of the summary.
    return summary.T

In [None]:
missing_data(electricity)

In [None]:
# Build the per-column summary, flip it back to one row per column, and keep
# the labels of the columns whose missing fraction exceeds one half.
temp = missing_data(electricity)
per_column = temp.T
is_sparse = per_column['Percentage_of_Missing_Values'] > 0.5
col_names = per_column.loc[is_sparse].index

In [None]:
electricity[col_names]

In [None]:
#removed the columns/locations with more than 50% missing values
# Pass the labels straight to drop(): it only needs the column names, so there
# is no reason to slice the columns out of the frame first.
electricity_cleaned = electricity.drop(columns=col_names)

In [None]:
electricity_cleaned.head()

In [None]:
electricity_cleaned.shape

In [None]:
#to visualize missing values 
msno.matrix(electricity_cleaned)

In [None]:
#interpolate
# 'slinear' is the SciPy-backed first-order spline, i.e. straight-line
# interpolation between neighbouring known readings. Whatever it leaves
# unfilled is handled by the bfill/ffill cells further down.
electricity_cleaned = electricity_cleaned.interpolate(method='slinear')

In [None]:
electricity_cleaned.isnull().sum()

In [None]:
#to visualize missing values 
msno.matrix(electricity_cleaned)

In [None]:
#shows the number of non-zero values per column (timestamp excluded)
# NOTE(review): NaN casts to True under astype(bool), so any value that is
# still missing at this point is counted as non-zero — confirm this is intended.
meter_readings = electricity_cleaned.drop(columns='timestamp', errors='ignore')
meter_readings.astype(bool).sum()

In [None]:
#backward fill: each remaining NaN takes the next valid value below it
# DataFrame.bfill() is the supported spelling; fillna(method='bfill') is
# deprecated in recent pandas releases.
electricity_cleaned = electricity_cleaned.bfill()

In [None]:
#to visualize missing values 
msno.matrix(electricity_cleaned)

In [None]:
#forward fill: NaNs with no later value (left over after the backward fill)
#take the last valid value above them
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
electricity_cleaned = electricity_cleaned.ffill()

In [None]:
#to visualize missing values 
msno.matrix(electricity_cleaned)

In [None]:
electricity_cleaned.isnull().sum()

In [None]:
#write the cleaned frame to the Kaggle working directory as CSV
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column; pass index=False if downstream readers do not
# expect it — confirm before changing.
output_path = '/kaggle/working/electricity_cleaned_new.csv'
electricity_cleaned.to_csv(output_path)