## Instructions:

## Todo:

### Inputs:

### Outputs:


<a name='index'></a>

## Notebook Index

1. <a href=#imports>Imports</a>


2. <a href=#read>Data Loading</a>


3. <a href=#functions>Define data generator functions and default parameters</a>


4. <a href=#analyses>Analyses</a>

In [1]:
from IPython.display import HTML, Image, IFrame, Markdown

# Render an HTML/JS snippet that adds a "toggle raw code" button to the notebook page.
# `code_toggle` shows or hides every `div.input` element depending on `code_show`
# and then flips the flag. It is also registered to run once when the document is
# ready, which shows the inputs and leaves `code_show` false, so the first click on
# the button hides the code. The script calls `$(...)`, so it presumably needs
# jQuery to be available on the rendering page -- TODO confirm for the target viewer.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').show();
 } else {
 $('div.input').hide();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

---

<a name='imports'></a>
## Imports
Imports for the functions used in this notebook.

<a href=#index>index</a>

### Data Wrangling

In [2]:
import pandas as pd
import numpy as np

### Utils

In [3]:
import glob
from functools import reduce
from pathlib import Path
from pickle import dump, load

from tqdm.auto import tqdm

---

<a name='read'></a>
## Data Loading
All the data loaded from disk and used in this notebook.

<a href=#index>index</a>

In [4]:
# Load every processed CSV into `data_dict`, grouped by split.
#   * A file whose last "_"-separated name token is "train" or "test" is stored
#     under data_dict["train"][<stem>] or data_dict["test"][<stem>].
#   * Any other file is stored directly under data_dict[<stem>].
PROCESSED_CSV_GLOB = "../data/processed/*.csv"

# Date columns parsed for the files whose name starts with "NYC".
NYC_DATE_COLS = [
    'Design_Start',
    'Final_Change_Date',
    'Schedule_Start',
    'Schedule_End',
]

# Files that must be present in the train/test groups for this notebook to proceed.
REQUIRED_FILES = {
    'NYC_capital_projects_3yr_test',
    'NYC_capital_projects_3yr_train',
    'UMAP_embeddings_NYC_capital_projects_3yr_test',
    'UMAP_embeddings_NYC_capital_projects_3yr_train',
}

data_dict = {"train": {}, "test": {}}
for file in sorted(glob.glob(PROCESSED_CSV_GLOB)):
    # `Path.stem` is separator-agnostic and tolerates extra dots in the file name,
    # unlike splitting the raw path on "/" and ".".
    file_name = Path(file).stem

    if file_name.startswith("NYC"):
        # These files were saved with their index, which read_csv surfaces as
        # "Unnamed: 0". errors="ignore" keeps the load working if a file was
        # saved without that column.
        df = (
            pd.read_csv(file, parse_dates=NYC_DATE_COLS)
            .drop(columns="Unnamed: 0", errors="ignore")
        )
    else:
        df = pd.read_csv(file)

    split = file_name.split("_")[-1]
    if split in ("train", "test"):
        data_dict[split][file_name] = df
    else:
        data_dict[file_name] = df

# TODO: could also use great expectations to check file format/content.

# Set of required file stems that were not found on disk (empty when all are present).
is_missing_file = REQUIRED_FILES - (set(data_dict["train"]) | set(data_dict["test"]))

if is_missing_file:
    # Sorted so the message is deterministic across runs.
    missing_names = sorted(f"{name}.csv" for name in is_missing_file)
    display(Markdown("[Click here to go to Google Drive folder](https://drive.google.com/drive/folders/1I2EJtiYyLfK5DNrtIGBA2n8pTStr5lWG)"))
    display(Markdown(f"You seem to be missing the files {missing_names}. Please download them from the Google drive."))
    raise FileNotFoundError(f"Missing required processed data files: {missing_names}")

In [5]:
# Display the names of the files that were loaded into the train group.
data_dict["train"].keys()

dict_keys(['NYC_capital_projects_3yr_final_train', 'NYC_capital_projects_3yr_train', 'UMAP_embeddings_NYC_capital_projects_3yr_train', 'ae_pca_encoded_embed_train'])

In [6]:
# Display the names of the files that were loaded into the test group.
data_dict["test"].keys()

dict_keys(['NYC_capital_projects_3yr_final_test', 'NYC_capital_projects_3yr_test', 'UMAP_embeddings_NYC_capital_projects_3yr_test', 'ae_pca_encoded_embed_test'])

In [7]:
# Trim the train-split UMAP frame down to its 2D embedding columns
# ("umap_attributes_2D*" and "umap_descr_2D*") plus the "PID" column.
# NOTE(review): only the train-split frame is trimmed; no equivalent step for the
# test split is visible in this part of the notebook -- confirm that is intended.
umap_train_key = 'UMAP_embeddings_NYC_capital_projects_3yr_train'
umap_df = data_dict["train"][umap_train_key]

is_attributes_2d = umap_df.columns.str.startswith("umap_attributes_2D")
is_descr_2d = umap_df.columns.str.startswith("umap_descr_2D")
kept_columns = umap_df.columns[is_attributes_2d | is_descr_2d].tolist() + ["PID"]

data_dict["train"][umap_train_key] = umap_df[kept_columns]

In [8]:
# Left-join all train-split frames on "PID" and save the result.
#
# The CSV written at the end of this cell lives in the same folder the loader
# globs, so on a re-run it is itself part of data_dict["train"]. It must be left
# out of the merge; otherwise the previous output is joined with its own inputs
# and every column comes back duplicated with _x/_y suffixes.
train_output_name = "NYC_capital_projects_3yr_final_train"
train_frames = [
    frame
    for name, frame in data_dict["train"].items()
    if name != train_output_name
]

# pd.merge returns a new frame and does not modify its inputs, so no copies are needed.
df_train_merged = reduce(
    lambda left, right: pd.merge(left, right, on='PID', how='left'),
    train_frames,
)

# A left join leaves NaNs for any PID missing from a right-hand frame.
assert not df_train_merged.isnull().values.any(), \
    "df_train_merged contains missing values after merging on PID"

# The index is written on purpose: the loader drops the resulting "Unnamed: 0"
# column when it reads files whose name starts with "NYC".
df_train_merged.to_csv(f"../data/processed/{train_output_name}.csv")


In [9]:
# Left-join all test-split frames on "PID" and save the result.
#
# The CSV written at the end of this cell lives in the same folder the loader
# globs, so on a re-run it is itself part of data_dict["test"]. It must be left
# out of the merge; otherwise the previous output is joined with its own inputs
# and every column comes back duplicated with _x/_y suffixes.
test_output_name = "NYC_capital_projects_3yr_final_test"
test_frames = [
    frame
    for name, frame in data_dict["test"].items()
    if name != test_output_name
]

# pd.merge returns a new frame and does not modify its inputs, so no copies are needed.
df_test_merged = reduce(
    lambda left, right: pd.merge(left, right, on='PID', how='left'),
    test_frames,
)

# Validate the test frame itself: a left join leaves NaNs for any PID missing
# from a right-hand frame.
assert not df_test_merged.isnull().values.any(), \
    "df_test_merged contains missing values after merging on PID"

# The index is written on purpose: the loader drops the resulting "Unnamed: 0"
# column when it reads files whose name starts with "NYC".
df_test_merged.to_csv(f"../data/processed/{test_output_name}.csv")
