## Instructions:

## Todo:

### Inputs:

### Outputs:


<a name='index'></a>

## Notebook Index

1. <a href=#imports>Imports</a>


2. <a href=#read>Read Dataset</a>


3. <a href=#functions>Define data generator functions and default parameters</a>


4. <a href=#analyses>Analyses</a>

In [1]:
from IPython.display import HTML, Image, IFrame, Markdown

# Render a small script plus a button that toggles the visibility of the
# notebook's input (code) cells, i.e. every `div.input` element on the page.
# `code_toggle` runs once when the document is ready: `code_show` starts as
# true, so that first call shows the inputs and flips the flag, and each
# click on the button then alternates between hiding and showing them.
# NOTE(review): relies on `$` (jQuery) being available in the rendered page — confirm.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').show();
 } else {
 $('div.input').hide();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

---

<a name='imports'></a>
## Imports
Imports for the functions used in this notebook.

<a href=#index>index</a>

### Data Wrangling

In [2]:
import pandas as pd
import numpy as np

### Utils

In [3]:
from pickle import dump, load
from tqdm.auto import tqdm
import glob
from functools import reduce

---

<a name='read'></a>
## Data Loading
All the data that is loaded from disk and used in this notebook.

<a href=#index>index</a>

In [4]:
# Loaded frames grouped by split:
#   data_dict["train"][<file stem>] / data_dict["test"][<file stem>]
data_dict = {"train":{}, "test":{}}

# Stems (file name without directory or extension) of every CSV this notebook needs.
files_needed = {
    'NYC_capital_projects_3yr_test',
    'NYC_capital_projects_3yr_train',
    'ae_pca_encoded_embed_test',
    'ae_pca_encoded_embed_train',
    'UMAP_embeddings_NYC_capital_projects_3yr_test',
    'UMAP_embeddings_NYC_capital_projects_3yr_train',
    'kmeans3_attribute_labels_test',
    'kmeans3_attribute_labels_train',
}

files_needed_paths = [f"../data/processed/{file}.csv" for file in files_needed]

# Columns parsed as datetimes when reading the "NYC..." project files.
nyc_date_cols = [
    'Design_Start',
    'Final_Change_Date',
    'Schedule_Start',
    'Schedule_End',
]

for file in sorted(files_needed_paths):
    # Split on the last "." only, so a stem containing a dot stays intact.
    file_name, extension = file.split("/")[-1].rsplit(".", 1)
    print(file_name)
    if file_name.startswith("NYC"):
        date_cols = nyc_date_cols
        # Presumably a leftover index column from an earlier `to_csv` — TODO confirm.
        drop_col = "Unnamed: 0"
    else:
        # Embedding / cluster-label files: nothing to parse as dates, nothing to drop.
        date_cols = []
        drop_col = []

    try:
        df = pd.read_csv(file, parse_dates=date_cols).drop(columns=drop_col)
    except FileNotFoundError:
        # Do not stop at the first absent file: skip it here so that the check
        # below can report every missing file at once, with the download link.
        continue

    # The last "_"-separated token of the stem decides the split.
    split_name = file_name.split("_")[-1]
    if split_name == "train":
        data_dict["train"][file_name] = df
    elif split_name == "test":
        data_dict["test"][file_name] = df
    else:
        data_dict[file_name] = df

# Could also use Great Expectations to check the file format/content.

# Every needed stem that did not end up in the train or test dictionaries.
is_missing_file = files_needed\
                - (set(data_dict["train"].keys()) | set(data_dict["test"].keys()) )

if is_missing_file:
    display(
        Markdown(
            "[Click here to go to Google Drive folder](https://drive.google.com/drive/folders/1I2EJtiYyLfK5DNrtIGBA2n8pTStr5lWG)"
        )
    )
    display(
        Markdown(
            f"You seem to be missing the files {[f'{name}.csv' for name in is_missing_file]}. Please download them from the Google drive."
        )
    )
    # Same exception type as before, now carrying the list of missing files.
    raise FileNotFoundError(
        f"Missing required data files: {sorted(f'{name}.csv' for name in is_missing_file)}"
    )

NYC_capital_projects_3yr_test
NYC_capital_projects_3yr_train
UMAP_embeddings_NYC_capital_projects_3yr_test
UMAP_embeddings_NYC_capital_projects_3yr_train
ae_pca_encoded_embed_test
ae_pca_encoded_embed_train
kmeans3_attribute_labels_test
kmeans3_attribute_labels_train


In [5]:
# List the file stems loaded into the train split.
data_dict["train"].keys()

dict_keys(['NYC_capital_projects_3yr_train', 'UMAP_embeddings_NYC_capital_projects_3yr_train', 'ae_pca_encoded_embed_train', 'kmeans3_attribute_labels_train'])

In [6]:
# List the file stems loaded into the test split.
data_dict["test"].keys()

dict_keys(['NYC_capital_projects_3yr_test', 'UMAP_embeddings_NYC_capital_projects_3yr_test', 'ae_pca_encoded_embed_test', 'kmeans3_attribute_labels_test'])

In [7]:
# Show which columns of the train UMAP frame have "label" in their name.
umap_train_columns = data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train'].columns
has_label_in_name = umap_train_columns.str.contains("label")
umap_train_columns[has_label_in_name]

Index(['attribute_clustering_label'], dtype='object')

In [8]:
def _keep_2d_umap_features(umap_df):
    """Cast the cluster label to ``str`` and keep only PID, 2D UMAP features and the label.

    Parameters
    ----------
    umap_df : pandas.DataFrame
        UMAP embedding frame; must contain the ``PID`` and
        ``attribute_clustering_label`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding, in this order: ``PID``, every column
        whose name starts with ``umap_attributes_2D`` or ``umap_descr_2D``,
        and ``attribute_clustering_label``.

    Notes
    -----
    The ``attribute_clustering_label`` column of ``umap_df`` itself is
    converted to ``str`` in place, so the caller's frame is modified too.
    """
    umap_df['attribute_clustering_label'] = umap_df['attribute_clustering_label'].astype("str")
    is_2d_feature = (
        umap_df.columns.str.startswith("umap_attributes_2D")
        | umap_df.columns.str.startswith("umap_descr_2D")
    )
    kept_columns = ["PID"] + list(umap_df.columns[is_2d_feature]) + ['attribute_clustering_label']
    # Explicit copy: the stored frame is independent of `umap_df`, so later
    # column assignments on it never touch (or warn about) the source frame.
    return umap_df[kept_columns].copy()


# Keep only the 2D features (plus PID and the cluster label) for the train split.
umap_df_train = data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train']
data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train'] = _keep_2d_umap_features(umap_df_train)

# Same filtering for the test split.
umap_df_test = data_dict["test"]['UMAP_embeddings_NYC_capital_projects_3yr_test']
data_dict["test"]['UMAP_embeddings_NYC_capital_projects_3yr_test'] = _keep_2d_umap_features(umap_df_test)


In [9]:
# Chain left joins on PID across all train frames, in the order they are
# stored in data_dict["train"]; each join works on copies of its two inputs.
train_frames = data_dict["train"].values()
df_train_merged = reduce(
    lambda merged_so_far, next_frame: merged_so_far.copy().merge(
        next_frame.copy(), on='PID', how='left'
    ),
    train_frames,
)

# Guard rails: no missing value anywhere and the expected (rows, columns) shape.
train_missing_total = df_train_merged.isna().sum().sum()
assert train_missing_total == 0
assert df_train_merged.shape == (134,53)


In [10]:
# Chain left joins on PID across all test frames, in the order they are
# stored in data_dict["test"]; each join works on copies of its two inputs.
test_frames = data_dict["test"].values()
df_test_merged = reduce(
    lambda merged_so_far, next_frame: merged_so_far.copy().merge(
        next_frame.copy(), on='PID', how='left'
    ),
    test_frames,
)

# Guard rails: no missing value anywhere and the expected (rows, columns) shape.
test_missing_total = df_test_merged.isna().sum().sum()
assert test_missing_total == 0
assert df_test_merged.shape == (15,53)


In [11]:
# Print the column names, non-null counts and dtypes of the merged train frame.
df_train_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134 entries, 0 to 133
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   PID                         134 non-null    int64         
 1   Project_Name                134 non-null    object        
 2   Description                 134 non-null    object        
 3   Category                    134 non-null    object        
 4   Borough                     134 non-null    object        
 5   Managing_Agency             134 non-null    object        
 6   Client_Agency               134 non-null    object        
 7   Phase_Start                 134 non-null    object        
 8   Current_Project_Years       134 non-null    float64       
 9   Current_Project_Year        134 non-null    int64         
 10  Design_Start                134 non-null    datetime64[ns]
 11  Budget_Start                134 non-null    float64       

In [12]:
# Print the column names, non-null counts and dtypes of the merged test frame.
df_test_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   PID                         15 non-null     int64         
 1   Project_Name                15 non-null     object        
 2   Description                 15 non-null     object        
 3   Category                    15 non-null     object        
 4   Borough                     15 non-null     object        
 5   Managing_Agency             15 non-null     object        
 6   Client_Agency               15 non-null     object        
 7   Phase_Start                 15 non-null     object        
 8   Current_Project_Years       15 non-null     float64       
 9   Current_Project_Year        15 non-null     int64         
 10  Design_Start                15 non-null     datetime64[ns]
 11  Budget_Start                15 non-null     float64       
 

In [13]:
# Persist the merged train and test frames as CSV. `to_csv` is called with
# its defaults, so the row index is written as well.
final_outputs = {
    "../data/processed/NYC_capital_projects_3yr_final_train.csv": df_train_merged,
    "../data/processed/NYC_capital_projects_3yr_final_test.csv": df_test_merged,
}
for output_path, merged_frame in final_outputs.items():
    merged_frame.to_csv(output_path)