# Data cleaning

### Imports
Import libraries and write settings here.

In [50]:
%load_ext autoreload
%autoreload
%load_ext dotenv
%dotenv

import pandas as pd
import os
import qgrid
import ipywidgets
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [51]:
# StatsCan table id; it is substituted into every input and output file name.
file_id = "25100061"
filepath = "data/{}_MetaData-dirty.df".format(file_id)

# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# produced by a trusted step of this project.
df_dictionary = pd.read_pickle(filepath)
df = pd.read_pickle('data/{}-dirty.df'.format(file_id))

# Turn on qgrid rendering (the DataFrame outputs in this notebook show up as QgridWidget).
qgrid.enable()

###### Guide to StatsCan datasets
See https://www.statcan.gc.ca/eng/developers/csv/user-guide for description of column names.  Re-summarized here for reference:

Comments on columns:

|Column | StatsCan Notes | Our Notes |
|:--| :--| :--|
|UOM ID (Units of measure ID) | | Seems to exist only to give each unit of measurement a unique identifier across the English/French translations, so ignore this for now|
|SCALAR_FACTOR | The scalar factor associated with a data series, displayed as text. | Disregard this if all entries are 'units' (scalar factor of 1)|
| Scalar_ID | | See above |
| Vector |Unique variable length reference code time-series identifier, consisting of the letter 'V', followed by up to 10 digits. (i.e. V1234567890, V1, etc.) | Don't understand this | 
| Status | Shows various states of a data value using symbols. These symbols are described in the [symbol legend](https://www.statcan.gc.ca/eng/concepts/definitions/guide-symbol) and notes contained in the metadata file. Some symbols accompany a data value while others replace a data value. i.e. – A, B, C, D, E, F,.., X, 0s | It's at row 98 of the data dictionary here|
| Symbol | Describes data points that are preliminary or revised, displayed using the symbols p and r. These symbols accompany a data value.| Can drop this | 
| DECIMALS | This field displays the decimal precision for a given value. | We should append this to VALUE before converting it to numeric |






In [52]:
# Take the first row of the data dictionary and hide the cube-level columns
# so that the remaining fields are easier to read.
symbol_legend = (
    df_dictionary
    .iloc[:1]
    .copy()
    .drop(columns=['Cube Notes', 'Total number of dimensions', 'Frequency', 'CANSIM Id', 'Archive Status'])
)
symbol_legend

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

### Rename and drop columns

In [53]:
# Drop columns whose only value is NaN.
# dropna returns a new frame and leaves df untouched, so the result must be
# assigned back; otherwise the columns are only hidden in this cell's output.
df = df.dropna(axis=1, how='all')
df

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [81]:
def columns_with_unique_val(df):
    """Find the columns of ``df`` that contain exactly one distinct value.

    Distinct values are taken from ``Series.unique()``, so a column filled
    entirely with NaN also counts as having a single value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        The names of the single-valued columns, in column order, and the
        corresponding value of each of those columns.
    """
    distinct_per_column = ((name, df[name].unique()) for name in df.columns)
    constant_columns = [
        (name, distinct[0])
        for name, distinct in distinct_per_column
        if len(distinct) == 1
    ]
    names = [name for name, _ in constant_columns]
    values = [value for _, value in constant_columns]
    return names, values

# Friendlier names for the StatsCan columns used in the rest of the notebook.
column_map = {
    "REF_DATE": "Year",
    "GEO": "Region",
    "UOM": "Unit of measurement",
}
df = df.rename(columns=column_map)

# Columns holding a single value carry no per-row information. Unless we
# combine multiple tables where these values differ (for instance, multiple
# units of measurement), keep them in a side dictionary instead of in df.
drop_cols, vals = columns_with_unique_val(df)
table_metadata = dict(zip(drop_cols, vals))

df = df.drop(columns=drop_cols)

In [82]:
# Show the single-valued columns (and their values) that were moved out of df.
# Only the last expression of a cell is rendered, so the earlier bare
# expressions and the commented-out line were removed as dead code.
table_metadata

{}

### Set datatypes

In [83]:
# Show the dtypes as loaded, before any conversion.
display(df.dtypes)

# Convert the year and the measured value to numeric dtypes.
# With its default settings, to_numeric raises on values it cannot parse.
for numeric_col in ('Year', 'VALUE'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

# Exploration

# Export data

In [87]:
# Export most recent year (2015) and only consumption with Gigajoule units,
# restricted to the national ("Canada") rows.
export_columns = ['Year', 'Energy type', 'Type of dwelling', 'VALUE', 'Unit of measurement']
is_canada = df["Region"] == "Canada"
is_2015 = df["Year"] == 2015
is_gigajoules = df["Energy consumption"] == "Gigajoules"

export_df = df.loc[is_canada & is_2015 & is_gigajoules, export_columns]

# Written to the current working directory, without the index column.
export_df.to_csv("{}_clean_minimal.csv".format(file_id), index=False)
export_df

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…