# Notebook 04: Merge Final Features for Modeling

## Instructions:

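### Inputs:

Eight CSV files read from `../data/interim/` (one train and one test file for each source):

- `NYC_capital_projects_3yr_{train,test}.csv`
- `ae_pca_encoded_embed_{train,test}.csv`
- `UMAP_embeddings_NYC_capital_projects_3yr_{train,test}.csv`
- `kmeans3_attribute_labels_{train,test}.csv`

### Outputs:

- `../data/processed/NYC_capital_projects_3yr_final_train.csv`
- `../data/processed/NYC_capital_projects_3yr_final_test.csv`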


---

<a name='imports'></a>
## Imports
Imports for the functions used in this notebook.

<a href=#index>index</a>

### Data Wrangling

In [2]:
import pandas as pd
import numpy as np

### Utils

In [3]:
import os
from pickle import dump, load
from tqdm.auto import tqdm
import glob
from functools import reduce

---

<a name='read'></a>
## Data Loading
All data loaded from disk and used in this notebook.

<a href=#index>index</a>

In [4]:
# Base names (no extension) of every interim CSV this notebook merges.
files_needed = {
    'NYC_capital_projects_3yr_test',
    'NYC_capital_projects_3yr_train',
    'ae_pca_encoded_embed_test',
    'ae_pca_encoded_embed_train',
    'UMAP_embeddings_NYC_capital_projects_3yr_test',
    'UMAP_embeddings_NYC_capital_projects_3yr_train',
    'kmeans3_attribute_labels_test',
    'kmeans3_attribute_labels_train',
}

# Relative paths to the interim CSVs. Set iteration order is arbitrary, so
# consumers that care about order must sort this list themselves.
files_needed_paths = [
    "../data/interim/{}.csv".format(stem) for stem in files_needed
]

# Destinations for the merged train / test feature tables.
savepath_train = "../data/processed/NYC_capital_projects_3yr_final_train.csv"
savepath_test = "../data/processed/NYC_capital_projects_3yr_final_test.csv"

In [5]:
# Fail fast: collect every required path that is neither an existing file nor
# an existing directory, and abort before any loading starts if there are any.
path_errors = [
    candidate
    for candidate in files_needed_paths
    if not (os.path.isfile(candidate) or os.path.isdir(candidate))
]

if path_errors:
    raise ValueError(
        "The following target paths do not exist...\n\n\t{}\n".format(path_errors)
    )

print("OK - all 'files_needed_paths' point to existing files!")

OK - all 'files_needed_paths' point to existing files!


In [6]:
# Load every required CSV and register it in `data_dict`, grouped by split
# ("train" / "test") and keyed by the file's base name without extension.
data_dict = {"train": {}, "test": {}}

# Date columns parsed for the base project tables (file names starting "NYC").
project_date_cols = [
    'Design_Start',
    'Final_Change_Date',
    'Schedule_Start',
    'Schedule_End',
]

print('Dataframes added to data dictionary:\n')

# Sorting fixes the insertion order of each split's dict. The merge cells
# later in this notebook reduce over the dict values in that order, so the
# first entry of each split becomes the left-most table of the join.
for file in sorted(files_needed_paths):
    # basename/splitext cope with extra dots in a file name and with
    # OS-specific path separators, unlike manual "/" and "." splitting.
    file_name = os.path.splitext(os.path.basename(file))[0]

    if file_name.startswith("NYC"):
        date_cols = project_date_cols
        # presumably a saved index column from an upstream to_csv -- TODO confirm
        drop_col = "Unnamed: 0"
    else:
        # All other tables (UMAP, autoencoder/PCA, k-means labels):
        # no date parsing and no column to drop.
        date_cols = []
        drop_col = []

    df = pd.read_csv(file, parse_dates=date_cols).drop(columns=drop_col)

    # The trailing "_train" / "_test" token decides the split; anything else
    # is stored at the top level of the dictionary.
    split = file_name.split("_")[-1]
    if split in ("train", "test"):
        data_dict[split][file_name] = df
    else:
        data_dict[file_name] = df

    print(f'\t{file_name}')

Dataframes added to data dictionary:

	NYC_capital_projects_3yr_test
	NYC_capital_projects_3yr_train
	UMAP_embeddings_NYC_capital_projects_3yr_test
	UMAP_embeddings_NYC_capital_projects_3yr_train
	ae_pca_encoded_embed_test
	ae_pca_encoded_embed_train
	kmeans3_attribute_labels_test
	kmeans3_attribute_labels_train


In [7]:
# Show which frames were registered under the "train" split.
data_dict["train"].keys()

dict_keys(['NYC_capital_projects_3yr_train', 'UMAP_embeddings_NYC_capital_projects_3yr_train', 'ae_pca_encoded_embed_train', 'kmeans3_attribute_labels_train'])

In [8]:
# Show which frames were registered under the "test" split.
data_dict["test"].keys()

dict_keys(['NYC_capital_projects_3yr_test', 'UMAP_embeddings_NYC_capital_projects_3yr_test', 'ae_pca_encoded_embed_test', 'kmeans3_attribute_labels_test'])

In [9]:
# List the columns of the train UMAP table whose name contains "label".
umap_train_columns = data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train'].columns
umap_train_columns[umap_train_columns.str.contains("label")]

Index(['attribute_clustering_label'], dtype='object')

In [10]:
def _select_umap_2d_features(umap_df):
    """Return a new frame holding only the 2D UMAP features of ``umap_df``.

    Keeps ``PID``, every column whose name starts with ``umap_attributes_2D``
    or ``umap_descr_2D`` (in their original order), and
    ``attribute_clustering_label`` cast to ``str``.

    The input frame is not modified. The result is a fresh frame rather than
    a slice that is written to afterwards, so re-running the cell does not
    assign into a slice of an earlier frame.

    Raises:
        KeyError: if ``PID`` or ``attribute_clustering_label`` is missing.
    """
    columns = umap_df.columns
    is_2d_feature = (
        columns.str.startswith("umap_attributes_2D")
        | columns.str.startswith("umap_descr_2D")
    )
    keep = ["PID"] + list(columns[is_2d_feature]) + ['attribute_clustering_label']
    return umap_df[keep].assign(
        attribute_clustering_label=lambda frame: frame['attribute_clustering_label'].astype("str")
    )


# Replace each split's UMAP table with its 2D-only version. The same helper
# is applied to train and test so the two cannot drift apart.
umap_df_train = data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train']
data_dict["train"]['UMAP_embeddings_NYC_capital_projects_3yr_train'] = (
    _select_umap_2d_features(umap_df_train)
)

umap_df_test = data_dict["test"]['UMAP_embeddings_NYC_capital_projects_3yr_test']
data_dict["test"]['UMAP_embeddings_NYC_capital_projects_3yr_test'] = (
    _select_umap_2d_features(umap_df_test)
)


In [11]:
# Fold all train frames into one table: each frame is left-joined on PID onto
# the running result, starting from the first frame in the split's dict.
df_train_merged = reduce(
    lambda merged, incoming: pd.merge(
        left=merged.copy(),
        right=incoming.copy(),
        how='left',
        on='PID',
    ),
    data_dict["train"].values(),
)

# Sanity checks: no missing values anywhere, and the expected table shape.
assert not df_train_merged.isnull().values.any()
assert df_train_merged.shape == (134, 53)


In [12]:
# Fold all test frames into one table: each frame is left-joined on PID onto
# the running result, starting from the first frame in the split's dict.
df_test_merged = reduce(
    lambda merged, incoming: pd.merge(
        left=merged.copy(),
        right=incoming.copy(),
        how='left',
        on='PID',
    ),
    data_dict["test"].values(),
)

# Sanity checks: no missing values anywhere, and the expected table shape.
assert not df_test_merged.isnull().values.any()
assert df_test_merged.shape == (15, 53)


In [13]:
# Print column dtypes and non-null counts for the merged train table.
df_train_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 134 entries, 0 to 133
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   PID                         134 non-null    int64         
 1   Project_Name                134 non-null    object        
 2   Description                 134 non-null    object        
 3   Category                    134 non-null    object        
 4   Borough                     134 non-null    object        
 5   Managing_Agency             134 non-null    object        
 6   Client_Agency               134 non-null    object        
 7   Phase_Start                 134 non-null    object        
 8   Current_Project_Years       134 non-null    float64       
 9   Current_Project_Year        134 non-null    int64         
 10  Design_Start                134 non-null    datetime64[ns]
 11  Budget_Start                134 non-null    float64       

In [14]:
# Print column dtypes and non-null counts for the merged test table.
df_test_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   PID                         15 non-null     int64         
 1   Project_Name                15 non-null     object        
 2   Description                 15 non-null     object        
 3   Category                    15 non-null     object        
 4   Borough                     15 non-null     object        
 5   Managing_Agency             15 non-null     object        
 6   Client_Agency               15 non-null     object        
 7   Phase_Start                 15 non-null     object        
 8   Current_Project_Years       15 non-null     float64       
 9   Current_Project_Year        15 non-null     int64         
 10  Design_Start                15 non-null     datetime64[ns]
 11  Budget_Start                15 non-null     float64       
 

In [15]:
# Write the merged train / test tables to disk.
# The output directory is created first so a fresh checkout without
# ../data/processed/ does not fail inside to_csv.
# NOTE(review): to_csv writes the row index by default; that is kept here so
# the file layout stays unchanged for existing readers -- confirm whether
# downstream notebooks rely on that extra leading column.
for merged_frame, savepath in (
    (df_train_merged, savepath_train),
    (df_test_merged, savepath_test),
):
    output_dir = os.path.dirname(savepath)
    if output_dir:  # an empty dirname means "current directory"; nothing to create
        os.makedirs(output_dir, exist_ok=True)
    merged_frame.to_csv(savepath)