In [1]:
!pip install psycopg2 sqlalchemy
!pip install azure-storage-blob



In [2]:
import pandas as pd
import numpy as np
import json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine
from io import StringIO
from io import BytesIO

In [3]:
# Read the JSON settings file from the working directory into the `config` dict.
config_file_path = 'config.json'

with open(config_file_path, 'r') as settings_handle:
    config = json.loads(settings_handle.read())

In [4]:
# Azure Blob Storage settings: the container name is fixed here and the connection
# string comes from the loaded config under the "connectionString" key.
CONTAINER_AZURE = 'affordablehousing'
CONNECTION_STRING_AZURE_STORAGE = config["connectionString"]

# Account-level client first, then a client scoped to the container.
blob_service_client = BlobServiceClient.from_connection_string(
    CONNECTION_STRING_AZURE_STORAGE
)
container_client = blob_service_client.get_container_client(CONTAINER_AZURE)


In [5]:
def azure_download_blob(connect_str, container_name, blob_name):
    """Download one blob from Azure Blob Storage and return its full content.

    Args:
        connect_str: Storage-account connection string.
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob inside the container.

    Returns:
        Whatever ``download_blob().readall()`` yields for the blob (the caller
        in this notebook wraps it in ``BytesIO``).
    """
    # The service client is used as a context manager so it is closed once the
    # content has been read, instead of being left open after the call.
    with BlobServiceClient.from_connection_string(connect_str) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(
            container=container_name, blob=blob_name
        )
        # Read everything while the client is still open.
        return blob_client.download_blob().readall()

In [6]:
# Fetch the cleaned affordable-housing CSV from the configured Azure container.
download_data = azure_download_blob(
    connect_str=CONNECTION_STRING_AZURE_STORAGE,
    container_name=CONTAINER_AZURE,
    blob_name='cleaned_affordablehousingdata.csv',
)

In [7]:
#converting csv download into df
# `download_data` holds the raw download when this cell first runs and is then
# rebound to the parsed DataFrame. The type guard keeps the cell re-runnable:
# without it a second run would pass the DataFrame itself to BytesIO and raise.
if isinstance(download_data, (bytes, bytearray)):
    download_data = pd.read_csv(BytesIO(download_data))
download_data

Unnamed: 0,project_id,project_name,building_id,house_number,street_name,borough,postcode,latitude,longitude,reporting_construction_type,...,_3_br_units,_4_br_units,_5_br_units,_6_br_units,counted_rental_units,counted_homeownership_units,all_counted_units,total_units,total_low_income_units,total_non_low_income_units
0,44218,MEC E. 125TH ST. PARCEL B WEST,987329,2319,3 AVENUE,Manhattan,10035.0,40.804262,-73.935383,New Construction,...,15,0,0,0,297,0,297,404,202,93
1,44225,Brook Avenue Apartments,927748,469,EAST 147 STREET,Bronx,10455.0,40.813854,-73.916400,New Construction,...,0,0,0,0,11,0,11,11,11,0
2,44225,Brook Avenue Apartments,955261,455,EAST 147 STREET,Bronx,10455.0,40.813948,-73.916682,New Construction,...,1,0,0,0,55,0,55,55,54,0
3,44230,Mermaid / West. 16th Street,336111,2427,MERMAID AVENUE,Brooklyn,11224.0,40.575896,-73.991490,New Construction,...,2,0,0,0,2,1,3,3,0,3
4,44230,Mermaid / West. 16th Street,336151,3216,MERMAID AVENUE,Brooklyn,11224.0,40.575062,-73.998834,New Construction,...,2,0,0,0,2,1,3,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4053,74669,731 LIBERTY AVENUE APARTMENTS,989464,735,LIBERTY AVENUE,Brooklyn,11208.0,40.676466,-73.882637,New Construction,...,0,0,0,0,3,0,3,10,0,3
4054,74671,729 731 EAST 32ND ST,1006551,729,EAST 32 STREET,Brooklyn,11210.0,40.633299,-73.945770,New Construction,...,0,0,0,0,2,0,2,6,0,2
4055,74671,729 731 EAST 32ND ST,1006620,731,EAST 32 STREET,Brooklyn,11210.0,40.633267,-73.945766,New Construction,...,0,0,0,0,2,0,2,6,0,2
4056,74673,61 EAST 53 ST,1009851,61,EAST 53 STREET,Brooklyn,11203.0,40.658937,-73.928211,New Construction,...,10,0,0,0,20,0,20,20,0,20


In [8]:
# Show the column labels; as the cell's last expression the Index is displayed directly.
download_data.columns

Index(['project_id', 'project_name', 'building_id', 'house_number',
       'street_name', 'borough', 'postcode', 'latitude', 'longitude',
       'reporting_construction_type', 'extremely_low_income_units',
       'very_low_income_units', 'low_income_units', 'moderate_income_units',
       'middle_income_units', 'studio_units', '_1_br_units', '_2_br_units',
       '_3_br_units', '_4_br_units', '_5_br_units', '_6_br_units',
       'counted_rental_units', 'counted_homeownership_units',
       'all_counted_units', 'total_units', 'total_low_income_units',
       'total_non_low_income_units'],
      dtype='object')


In [10]:
# Work on an independent copy so the downloaded frame itself is left untouched.
df_ah = download_data.copy(deep=True)

In [31]:
# Turn on pandas' copy-on-write mode for the rest of the session.
pd.set_option("mode.copy_on_write", True)

In [25]:
# Borough dimension: one row per distinct value of df_ah['borough'], keyed by a
# 1-based surrogate id assigned in order of first appearance.

borough = df_ah['borough'].unique()
dim_borough = pd.DataFrame({'borough_name': borough}).assign(
    borough_id=range(1, len(borough) + 1)
)
dim_borough

Unnamed: 0,borough_name,borough_id
0,Manhattan,1
1,Bronx,2
2,Brooklyn,3
3,Queens,4
4,Staten Island,5


In [45]:
# Project dimension: the distinct (project_id, project_name) pairs found in df_ah,
# keeping the first occurrence of each pair and its original row label.
dim_project = df_ah.loc[:, ['project_id', 'project_name']].drop_duplicates(keep='first')
dim_project

Unnamed: 0,project_id,project_name
0,44218,MEC E. 125TH ST. PARCEL B WEST
1,44225,Brook Avenue Apartments
3,44230,Mermaid / West. 16th Street
10,44239,Home for Harlem Dowling
11,44256,WSFSSH. 145 WEST 108TH ST. VALLEY LODGE
...,...,...
4052,74668,179 MARTENSE STREET
4053,74669,731 LIBERTY AVENUE APARTMENTS
4054,74671,729 731 EAST 32ND ST
4056,74673,61 EAST 53 ST


In [47]:
#creating building dimension
# De-duplicated the same way as dim_project, so a building row that is repeated in
# df_ah appears only once in the dimension.
dim_building = df_ah[['building_id', 'house_number', 'street_name', 'postcode',
                      'latitude', 'longitude',
                      'reporting_construction_type']].drop_duplicates()
dim_building

Unnamed: 0,building_id,house_number,street_name,postcode,latitude,longitude,reporting_construction_type
0,987329,2319,3 AVENUE,10035.0,40.804262,-73.935383,New Construction
1,927748,469,EAST 147 STREET,10455.0,40.813854,-73.916400,New Construction
2,955261,455,EAST 147 STREET,10455.0,40.813948,-73.916682,New Construction
3,336111,2427,MERMAID AVENUE,11224.0,40.575896,-73.991490,New Construction
4,336151,3216,MERMAID AVENUE,11224.0,40.575062,-73.998834,New Construction
...,...,...,...,...,...,...,...
4053,989464,735,LIBERTY AVENUE,11208.0,40.676466,-73.882637,New Construction
4054,1006551,729,EAST 32 STREET,11210.0,40.633299,-73.945770,New Construction
4055,1006620,731,EAST 32 STREET,11210.0,40.633267,-73.945766,New Construction
4056,1009851,61,EAST 53 STREET,11203.0,40.658937,-73.928211,New Construction


In [53]:
#creating fact table
# NOTE(review): 'extremely_low_income_units' is among the source columns but is not
# selected here -- confirm whether leaving it out of the fact table is intentional.
#
# .assign() returns a new frame, so the 1-based surrogate key `fact_id` is never
# written onto a column selection of df_ah (the pattern that raises
# SettingWithCopyWarning when copy-on-write is not enabled).
fact_table = df_ah[['very_low_income_units', 'low_income_units', 'moderate_income_units',
                    'middle_income_units', 'studio_units', '_1_br_units', '_2_br_units',
                    '_3_br_units', '_4_br_units', '_5_br_units', '_6_br_units',
                    'counted_rental_units', 'counted_homeownership_units',
                    'all_counted_units', 'total_units', 'total_low_income_units',
                    'total_non_low_income_units', 'project_id', 'building_id',
                    'borough']].assign(fact_id=range(1, len(df_ah) + 1))
fact_table

Unnamed: 0,very_low_income_units,low_income_units,moderate_income_units,middle_income_units,studio_units,_1_br_units,_2_br_units,_3_br_units,_4_br_units,_5_br_units,...,counted_rental_units,counted_homeownership_units,all_counted_units,total_units,total_low_income_units,total_non_low_income_units,project_id,building_id,borough,fact_id
0,101,101,0,93,83,70,129,15,0,0,...,297,0,297,404,202,93,44218,987329,Manhattan,1
1,0,11,0,0,0,1,10,0,0,0,...,11,0,11,11,11,0,44225,927748,Bronx,2
2,14,40,0,0,7,35,12,1,0,0,...,55,0,55,55,54,0,44225,955261,Bronx,3
3,0,0,3,0,0,0,1,2,0,0,...,2,1,3,3,0,3,44230,336111,Brooklyn,4
4,0,0,3,0,0,0,1,2,0,0,...,2,1,3,3,0,3,44230,336151,Brooklyn,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4053,0,0,0,3,3,0,0,0,0,0,...,3,0,3,10,0,3,74669,989464,Brooklyn,4054
4054,0,0,0,2,0,2,0,0,0,0,...,2,0,2,6,0,2,74671,1006551,Brooklyn,4055
4055,0,0,0,2,0,2,0,0,0,0,...,2,0,2,6,0,2,74671,1006620,Brooklyn,4056
4056,0,0,0,20,5,5,0,10,0,0,...,20,0,20,20,0,20,74673,1009851,Brooklyn,4057


In [56]:
# Lookup from borough name to its surrogate id, built from the borough dimension.
dict_borough = {
    name: borough_key
    for name, borough_key in zip(dim_borough['borough_name'], dim_borough['borough_id'])
}
print(dict_borough)

{'Manhattan': 1, 'Bronx': 2, 'Brooklyn': 3, 'Queens': 4, 'Staten Island': 5}


In [57]:
#mapping borough dimension to add borough id
# The names are looked up from df_ah (fact_table was selected from it and shares its
# row index) rather than from fact_table['borough'], so this cell still runs after the
# 'borough' column has been dropped from fact_table. .assign() returns a new frame
# instead of writing a column onto a selection of df_ah.
fact_table = fact_table.assign(borough_id=df_ah['borough'].map(dict_borough))
fact_table

Unnamed: 0,very_low_income_units,low_income_units,moderate_income_units,middle_income_units,studio_units,_1_br_units,_2_br_units,_3_br_units,_4_br_units,_5_br_units,...,counted_homeownership_units,all_counted_units,total_units,total_low_income_units,total_non_low_income_units,project_id,building_id,borough,fact_id,borough_id
0,101,101,0,93,83,70,129,15,0,0,...,0,297,404,202,93,44218,987329,Manhattan,1,1
1,0,11,0,0,0,1,10,0,0,0,...,0,11,11,11,0,44225,927748,Bronx,2,2
2,14,40,0,0,7,35,12,1,0,0,...,0,55,55,54,0,44225,955261,Bronx,3,2
3,0,0,3,0,0,0,1,2,0,0,...,1,3,3,0,3,44230,336111,Brooklyn,4,3
4,0,0,3,0,0,0,1,2,0,0,...,1,3,3,0,3,44230,336151,Brooklyn,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4053,0,0,0,3,3,0,0,0,0,0,...,0,3,10,0,3,74669,989464,Brooklyn,4054,3
4054,0,0,0,2,0,2,0,0,0,0,...,0,2,6,0,2,74671,1006551,Brooklyn,4055,3
4055,0,0,0,2,0,2,0,0,0,0,...,0,2,6,0,2,74671,1006620,Brooklyn,4056,3
4056,0,0,0,20,5,5,0,10,0,0,...,0,20,20,0,20,74673,1009851,Brooklyn,4057,3


In [58]:
#dropping borough column
# Rebind instead of inplace=True, and ignore an already-missing column, so running
# this cell a second time does not raise KeyError.
fact_table = fact_table.drop(columns=['borough'], errors='ignore')
fact_table

Unnamed: 0,very_low_income_units,low_income_units,moderate_income_units,middle_income_units,studio_units,_1_br_units,_2_br_units,_3_br_units,_4_br_units,_5_br_units,...,counted_rental_units,counted_homeownership_units,all_counted_units,total_units,total_low_income_units,total_non_low_income_units,project_id,building_id,fact_id,borough_id
0,101,101,0,93,83,70,129,15,0,0,...,297,0,297,404,202,93,44218,987329,1,1
1,0,11,0,0,0,1,10,0,0,0,...,11,0,11,11,11,0,44225,927748,2,2
2,14,40,0,0,7,35,12,1,0,0,...,55,0,55,55,54,0,44225,955261,3,2
3,0,0,3,0,0,0,1,2,0,0,...,2,1,3,3,0,3,44230,336111,4,3
4,0,0,3,0,0,0,1,2,0,0,...,2,1,3,3,0,3,44230,336151,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4053,0,0,0,3,3,0,0,0,0,0,...,3,0,3,10,0,3,74669,989464,4054,3
4054,0,0,0,2,0,2,0,0,0,0,...,2,0,2,6,0,2,74671,1006551,4055,3
4055,0,0,0,2,0,2,0,0,0,0,...,2,0,2,6,0,2,74671,1006620,4056,3
4056,0,0,0,20,5,5,0,10,0,0,...,20,0,20,20,0,20,74673,1009851,4057,3


In [60]:
# Final column layout of the fact table: keys first, then the unit counts.
new_order = (
    # surrogate / foreign keys
    ['fact_id', 'borough_id', 'project_id', 'building_id']
    # low-income vs. non-low-income totals
    + ['total_low_income_units', 'total_non_low_income_units']
    # units by income band
    + ['very_low_income_units', 'low_income_units',
       'moderate_income_units', 'middle_income_units']
    # units by bedroom count
    + ['studio_units', '_1_br_units', '_2_br_units', '_3_br_units',
       '_4_br_units', '_5_br_units', '_6_br_units']
    # counted and overall totals
    + ['counted_rental_units', 'counted_homeownership_units',
       'all_counted_units', 'total_units']
)

fact_table = fact_table.loc[:, new_order]
fact_table.head()

Unnamed: 0,fact_id,borough_id,project_id,building_id,total_low_income_units,total_non_low_income_units,very_low_income_units,low_income_units,moderate_income_units,middle_income_units,...,_1_br_units,_2_br_units,_3_br_units,_4_br_units,_5_br_units,_6_br_units,counted_rental_units,counted_homeownership_units,all_counted_units,total_units
0,1,1,44218,987329,202,93,101,101,0,93,...,70,129,15,0,0,0,297,0,297,404
1,2,2,44225,927748,11,0,0,11,0,0,...,1,10,0,0,0,0,11,0,11,11
2,3,2,44225,955261,54,0,14,40,0,0,...,35,12,1,0,0,0,55,0,55,55
3,4,3,44230,336111,0,3,0,0,3,0,...,0,1,2,0,0,0,2,1,3,3
4,5,3,44230,336151,0,3,0,0,3,0,...,0,1,2,0,0,0,2,1,3,3
