# Data cleaning

### Imports
Import libraries and write settings here.

In [139]:
%load_ext autoreload
%autoreload
%load_ext dotenv
%dotenv

import pandas as pd
import os
import qgrid
import ipywidgets
import numpy as np
import re

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [140]:
# Identifier shared by the two pickle file names read below.
file_id = "25100025"

# Data dictionary (metadata) pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
filepath = "data/{}_MetaData-dirty.df".format(file_id)
df_dictionary = pd.read_pickle(filepath)

# The uncleaned data table itself.
df = pd.read_pickle('data/{}-dirty.df'.format(file_id))

# Presumably switches DataFrame display to qgrid widgets (the cell outputs
# further down are QgridWidget objects) -- TODO confirm.
qgrid.enable()

## Guide to StatsCan datasets
See https://www.statcan.gc.ca/eng/developers/csv/user-guide for a description of the column names. The relevant columns are summarized here for reference:

Comments on columns:

|Column | StatsCan Notes | Our Notes |
|:--| :--| :--|
|UOM ID (Units of measure ID) | | Seems to be needed only as a unique identifier for handling the English/French translation of units of measurement, so ignore this for now|
|SCALAR_FACTOR | The scalar factor associated with a data series, displayed as text. | Disregard this if all entries are 'units' (scalar factor of 1)|
| Scalar_ID | | See above |
| Vector |Unique variable length reference code time-series identifier, consisting of the letter 'V', followed by up to 10 digits. (i.e. V1234567890, V1, etc.) | Don't understand this | 
| Status | Shows various states of a data value using symbols. These symbols are described in the [symbol legend](https://www.statcan.gc.ca/eng/concepts/definitions/guide-symbol) and notes contained in the metadata file. Some symbols accompany a data value while others replace a data value. i.e. – A, B, C, D, E, F,.., X, 0s | The symbol legend is in the data dictionary; the cell below extracts it from rows 127–140|
| Symbol | Describes data points that are preliminary or revised, displayed using the symbols p and r. These symbols accompany a data value.| Can drop this | 
| DECIMALS | This field displays the decimal precision for a given value. | We should append this to VALUE before converting it to numeric |






In [141]:
# Data-dictionary columns kept for the legend: original name -> legend name.
col_rename_dict = {"Cube Title": "Description", "Product Id": "Symbol"}

# NOTE(review): the row positions 127..140 are hard-coded; they presumably
# depend on the layout of this particular table's metadata -- verify when
# switching to another file_id.
symbol_legend = (
    df_dictionary.iloc[127:141]
    .copy()
    .rename(columns=col_rename_dict)
)
symbol_legend = symbol_legend[col_rename_dict.values()]
# Renumber the rows from 0; otherwise they keep the labels of the sliced rows.
symbol_legend = symbol_legend.reset_index(drop=True)

symbol_legend

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

### Rename and drop columns

In [142]:
def columns_with_unique_val(df):
    """Find the columns of ``df`` that hold one single distinct value.

    A column whose entries are all NaN counts as well, because
    ``Series.unique()`` reports NaN as a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    tuple of (list, list)
        The names of the constant columns, in column order, and the single
        value each of them holds (same order).
    """
    distinct_by_col = ((name, df[name].unique()) for name in df.columns)
    constant = [
        (name, distinct[0])
        for name, distinct in distinct_by_col
        if len(distinct) == 1
    ]
    unique = [name for name, _ in constant]
    val = [value for _, value in constant]
    return unique, val

# Drop columns whose only value is nan.
# dropna returns a new frame rather than modifying df, so the result has to
# be assigned back for the columns to actually be removed.
df = df.dropna(axis=1, how='all')

# Shorter, more readable names for the columns used below.
column_map={"REF_DATE": "Year", "GEO": "Region", "UOM": "Unit of measurement"}
df = df.rename(columns=column_map)

def get_naics_code(val):
    """Extract the numeric code in square brackets that ends a NAICS label.

    Parameters
    ----------
    val : str
        Category label such as ``"Food manufacturing [311]"``.

    Returns
    -------
    str or float
        The digits between the trailing brackets, as a string. ``np.nan``
        when ``val`` is not a string (e.g. a missing value) or does not end
        with ``" [<digits>]"`` -- this includes empty brackets and
        non-numeric codes such as ``"[31-33]"``.
    """
    # Missing values arrive as float NaN; re.match would raise on them.
    if not isinstance(val, str):
        return np.nan
    # \d+ (not \d*) so that empty brackets are not reported as a code "".
    matches = re.match(r".* \[(\d+)\]$", val)
    return matches.group(1) if matches is not None else np.nan

# Split the combined NAICS column into a numeric code column and a label
# column (the text before " ["), then discard the combined column.
naics_raw = df['North American Industry Classification System (NAICS)']
df['NAICS Code'] = pd.to_numeric(naics_raw.apply(get_naics_code), downcast="integer")
df['North American Industry Classification'] = naics_raw.apply(lambda label: label.split(' [')[0])
df = df.drop(columns=['North American Industry Classification System (NAICS)'])

# Columns that hold one single value (for instance a common unit of
# measurement) are kept in a dict next to the frame instead of inside it --
# unless tables in which these values differ get combined later on.
drop_cols, vals = columns_with_unique_val(df)
table_metadata = dict(zip(drop_cols, vals))

# Cast to the intended dtypes; with errors='ignore' a column whose cast fails
# is left as it is instead of raising.
df = df.astype(
    {
        'Year': 'uint32',
        'VALUE': 'float64',
        'STATUS': 'category',
        'NAICS Code': 'int64',
    },
    errors='ignore',
)

# The constant columns are now recorded in table_metadata.
df = df.drop(columns=drop_cols)

In [143]:
# Put the columns of most interest first, followed by every remaining column
# in its existing order. The remainder is built with a list comprehension
# rather than a set difference, because set iteration order for strings can
# change between kernel sessions and would make the column order vary.
priority_cols = ['Year', 'Fuel type', 'North American Industry Classification', 'VALUE']
rest_of_cols = [col for col in df.columns if col not in priority_cols]
df = df.reindex(columns=(priority_cols + rest_of_cols))
df = df.set_index(['Year'])

# Exploration

In [144]:
def search(term, df):
    """Return the rows of ``df`` in which any object-dtype column contains ``term``.

    Parameters
    ----------
    term : str
        Pattern handed to ``Series.str.contains`` with that method's default
        options.
    df : pandas.DataFrame
        Frame to search. Only columns whose dtype is ``object`` are looked
        at; missing values never match (``na=False``).

    Returns
    -------
    pandas.DataFrame
        The matching rows with all of their columns. Empty (same columns,
        no rows) when nothing matches or when ``df`` has no object-dtype
        column.
    """
    text_cols = [col for col in df if df[col].dtype == 'object']
    if not text_cols:
        # np.column_stack raises on an empty list, so answer "no matches"
        # directly when there is nothing to search.
        return df.iloc[0:0]
    mask = np.column_stack([df[col].str.contains(term, na=False) for col in text_cols])
    return df.loc[mask.any(axis=1)]
    
# Show the data-dictionary rows in which an object-dtype column contains "beverage".
search("beverage", df_dictionary)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

# Export data

In [145]:
# Export the rows of one year, keeping the priority columns other than
# 'Year' (which is the index at this point).
YEAR = 2017
# One .loc call with a list of row labels always yields a DataFrame, even when
# a single row matches; the chained form df.loc[YEAR][cols] returns a Series
# in that case and would write a differently shaped CSV.
export_df = df.loc[[YEAR], priority_cols[1:]]
export_df.to_csv("{}_clean_minimal.csv".format(file_id), index=False)