# Data Augmentation
In this notebook i will perform data augmentation needed for the big data project

In [1]:
import warnings; warnings.simplefilter(action='ignore', category=FutureWarning)
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv
from pyarrow import dataset as ds
import os
import numpy as np
import pandas as pd


import dask_jobqueue
import dask
import dask.dataframe as dd
import time

from distributed import Client
from contextlib import suppress 
import platform



In [2]:
DATA_FOLDER = "./data/"
DATA_FILES = os.listdir(DATA_FOLDER)
PARQUET_PATH = "/d/hpc/projects/FRI/bigdata/students/mfmt/entire_1.parquet"
print(DATA_FILES)

['weather.csv', 'school.csv', 'attraction.csv']


In [3]:
with suppress(Exception):
    client.shutdown()

cluster = dask_jobqueue.SLURMCluster(
            queue = 'all', 
            processes=1,
            cores=32, 
            memory='8GB',
            scheduler_options={'dashboard_address': ':21722'},
            death_timeout=180, # seconds
            walltime="01:30:00",
          )

client = Client(cluster, timeout="180")#, memory_limit='8GB')
display(client.cluster)

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [4]:
client.cluster.scale(16)
display(client.cluster)

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [96]:
client.shutdown()

2022-06-09 15:09:38,899 - distributed.client - ERROR - Failed to reconnect to scheduler after 180.00 seconds, closing client


In [None]:
df = dd.read_parquet(PARQUET_FILE, engine = "pyarrow", ignore_metadata = True)

## Get Location for street names

So that we are able to connect street names with Longitude and Latidue (and later to the original dataset) we will create a dataframe containing the correct info.

In [31]:
df_loc = pd.read_csv(DATA_FOLDER + "geo_location.csv", low_memory=False)

In [32]:
df_loc.head(5)

Unnamed: 0,the_geom,PHYSICALID,L_LOW_HN,L_HIGH_HN,R_LOW_HN,R_HIGH_HN,L_ZIP,R_ZIP,L_BLKFC_ID,R_BLKFC_ID,...,PRE_MODIFI,PRE_DIRECT,PRE_TYPE,POST_TYPE,POST_DIREC,POST_MODIF,FULL_STREE,ST_NAME,BIKE_TRAFD,SHAPE_Leng
0,MULTILINESTRING ((-74.01793061274537 40.706174...,3,50.0,64.0,51.0,63.0,10280.0,10280.0,212262587,1222601917,...,,,,PL,,,BATTERY PL,BATTERY,,262.777812
1,MULTILINESTRING ((-74.01757437049282 40.706839...,5,66.0,82.0,65.0,81.0,10280.0,10280.0,1222600642,1222604562,...,,,,PL,,,BATTERY PL,BATTERY,,259.415989
2,MULTILINESTRING ((-74.01712051881411 40.707462...,6,84.0,98.0,83.0,101.0,10280.0,10280.0,1222600640,1222600933,...,,,,PL,,,BATTERY PL,BATTERY,,280.444781
3,MULTILINESTRING ((-74.01693786186667 40.704813...,8,,,,,10280.0,10280.0,212262587,1222600931,...,,,,PL,,,BATTERY PL,BATTERY,,32.070139
4,MULTILINESTRING ((-74.01428922948978 40.704549...,14,,,,,10004.0,10004.0,212262587,212262395,...,,,,PL,,,BATTERY PL,BATTERY,,206.27185


In [33]:
lats = []
longs = []
seg_ids = []
names = []
ct = 0

def str_mean(l):
    n = len(l)
    s = 0
    for i in range(n):
        s = s +  float(l[i])
    return s / n

for i, k in df_loc.iterrows():
    loc = k["the_geom"].replace("MULTILINESTRING ((","").replace(")","").replace(",","").split(" ")
    lts = loc[1::2]
    lgs = loc[0::2]
    lats.append(str_mean(lts))
    longs.append(str_mean(lgs))
    #seg_ids.append(int(k["SEGMENTID"]))
    names.append(k["ST_NAME"])

    
data = {"Lattitude": lats, "Longitude": longs, "Name":names}
#print(data)
df_loc = pd.DataFrame(data=data)
df_loc.head(5)

Unnamed: 0,Lattitude,Longitude,Name
0,40.706513,-74.017771,BATTERY
1,40.707151,-74.017347,BATTERY
2,40.707798,-74.016873,BATTERY
3,40.704825,-74.016994,BATTERY
4,40.704642,-74.01464,BATTERY


In [94]:
#Use dask partitions to get faster results
def get_closest_street(row):
    df = df_loc
    street_name = ""
    df["dist"] = (row["Lattitude"] - df["Lattitude"])**2 + (row["Longitude"] - df["Longitude"])**2
    r = df[df["dist"] == min(df["dist"])]
    return r["Name"]


## Attraction / PoI dataset augmentation

Before we can add data for the attraction dataset we have to extract the relavant info so that we can more easily join both dataframes

In [36]:
df_poi = pd.read_csv(DATA_FOLDER + "attraction.csv")

In [37]:
df_poi.head(5)

Unnamed: 0,the_geom,SEGMENTID,COMPLEXID,SAFTYPE,SOS,PLACEID,FACI_DOM,BIN,BOROUGH,CREATED,MODIFIED,FACILITY_T,SOURCE,B7SC,PRI_ADD,NAME
0,POINT (-74.00701717096757 40.724634757833414),31895,0,N,1.0,567,9,0,1.0,05/14/2009 12:00:00 AM,11/18/2011 12:00:00 AM,6,DoITT,19743001.0,0,HOLLAND
1,POINT (-73.82661642130311 40.797182526598505),306303,3378,N,2.0,568,8,0,4.0,05/14/2009 12:00:00 AM,01/09/2017 12:00:00 AM,6,DoITT,49731001.0,0,WHITESTONE
2,POINT (-73.99395441100663 40.70384707235758),144842,3960,N,2.0,576,8,0,3.0,05/14/2009 12:00:00 AM,01/22/2018 12:00:00 AM,6,DoITT,39734001.0,0,BROOKLYN
3,POINT (-73.9919414213091 40.70960010711745),162664,0,N,1.0,580,8,0,1.0,05/14/2009 12:00:00 AM,05/11/2011 12:00:00 AM,6,DoITT,19795001.0,0,MANHATTAN
4,POINT (-73.9526609766105 40.73906602249743),157362,0,N,1.0,582,8,0,3.0,05/14/2009 12:00:00 AM,03/03/2017 12:00:00 AM,6,DoITT,39740001.0,0,PULASKI


In [38]:
df_poi["the_geom"]

0        POINT (-74.00701717096757 40.724634757833414)
1        POINT (-73.82661642130311 40.797182526598505)
2         POINT (-73.99395441100663 40.70384707235758)
3          POINT (-73.9919414213091 40.70960010711745)
4          POINT (-73.9526609766105 40.73906602249743)
                             ...                      
20566     POINT (-73.99235660627777 40.76311099568828)
20567     POINT (-73.9737709593231 40.699189360095815)
20568    POINT (-74.20232608033247 40.533084962118224)
20569     POINT (-73.9925833710639 40.762968178359465)
20570    POINT (-74.02061564309312 40.653082401053034)
Name: the_geom, Length: 20571, dtype: object

In [102]:
lats = []
longs = []
seg_ids = []
names = []
for i, k in df_poi.iterrows():
    loc = k["the_geom"].replace("POINT (","").replace(")","").split(" ")
    lats.append(float(loc[1]))
    longs.append(float(loc[0]))
    seg_ids.append(int(k["SEGMENTID"]))
    names.append(k["NAME"])

    
data = {"Lattitude": lats, "Longitude": longs, "Segment ID":seg_ids, "Name":names}
#print(data)
df_new = pd.DataFrame(data=data)
df_new.head(5)

Unnamed: 0,Lattitude,Longitude,Segment ID,Name
0,40.724635,-74.007017,31895,HOLLAND
1,40.797183,-73.826616,306303,WHITESTONE
2,40.703847,-73.993954,144842,BROOKLYN
3,40.7096,-73.991941,162664,MANHATTAN
4,40.739066,-73.952661,157362,PULASKI


In [83]:
df_debug = df_new.iloc[1:5]
print(df_debug)

   Lattitude  Longitude  Segment ID        Name
1  40.797183 -73.826616      306303  WHITESTONE
2  40.703847 -73.993954      144842    BROOKLYN
3  40.709600 -73.991941      162664   MANHATTAN
4  40.739066 -73.952661      157362     PULASKI


In [None]:
#from dask.multiprocessing import get
#def apply_f_to_df(df):
#    return df.apply(get_closest_street, axis=1, result_type = "expand")
#df_new = dd.from_pandas(df_new,npartitions = 30).compute()
#res = df_new.map_partitions(apply_f_to_df).compute(get = get)
#print(res)
df_new["Street Name"] = df_new.apply(get_closest_street, axis=1, result_type = "expand")

In [None]:
df_new.head(5)

## Weather dataset augmentation


In [6]:
df_weather = pd.read_csv(DATA_FOLDER + "weather.csv")

In [7]:
df_weather.head(5)

Unnamed: 0,Sensor.ID,AirTemp,Day,Hour,Latitude,Longitude,Year,Install.Type,Borough,ntacode
0,Bk-BR_01,71.189,06/15/2018,1,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
1,Bk-BR_01,70.243333,06/15/2018,2,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
2,Bk-BR_01,69.392667,06/15/2018,3,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
3,Bk-BR_01,68.263167,06/15/2018,4,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81
4,Bk-BR_01,67.114,06/15/2018,5,40.666205,-73.91691,2018,Street Tree,Brooklyn,BK81


In [None]:
df_weather["Street Name"] = df_weather.apply(get_closest_street, axis=1, result_type = "expand")

## School dataset augmentation

In [15]:
df_school = pd.read_csv(DATA_FOLDER + "school.csv")
df_school.head(5)

Unnamed: 0,FISCAL_YEAR,ATS SYSTEM CODE,LOCATION_CODE,LOCATION_NAME,BEDS NUMBER,MANAGED_BY_NAME,LOCATION_TYPE_DESCRIPTION,LOCATION_CATEGORY_DESCRIPTION,GRADES_TEXT,GRADES_FINAL_TEXT,...,ADMINISTRATIVE_DISTRICT_NAME,COMMUNITY_SCHOOL_SUP_NAME,FIELD_SUPPORT_CENTER_NAME,FIELD_SUPPORT_CENTER_LEADER_NAME,SCHOOL_SUPPORT_TEAM_NAME,SCHOOL_SUPPORT_TEAM_LEADER_NAME,HIGHSCHOOL_NETWORK_LOCATION_CODE,HIGHSCHOOL_NETWORK_NAME,HIGHSCHOOL_NETWORK_SUPERINTENDENT,Location 1
0,2018,01M015,M015,P.S. 015 Roberto Clemente,310100010015,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",Field Support Center - Manhattan,"CHU, YUET",School Support Team 3- Manhattan,,,,,"333 EAST 4 STREET\nMANHATTAN, NY 10009\n(40.72..."
1,2018,01M019,M019,P.S. 019 Asher Levy,310100010019,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",Field Support Center - Manhattan,"CHU, YUET",School Support Team 3- Manhattan,,,,,"185 1 AVENUE\nMANHATTAN, NY 10003\n(40.730009,..."
2,2018,01M020,M020,P.S. 020 Anna Silver,310100010020,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",Field Support Center - Manhattan,"CHU, YUET",School Support Team 3- Manhattan,,,,,"166 ESSEX STREET\nMANHATTAN, NY 10002\n(40.721..."
3,2018,01M034,M034,P.S. 034 Franklin D. Roosevelt,310100010034,DOE,General Academic,K-8,"PK,0K,01,02,03,04,05,06,07,08,SE","PK,0K,01,02,03,04,05,06,07,08",...,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",Field Support Center - Manhattan,"CHU, YUET",School Support Team 3- Manhattan,,,,,"730 EAST 12 STREET\nMANHATTAN, NY 10009\n(40.7..."
4,2018,01M063,M063,The STAR Academy - P.S.63,310100010063,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",...,COMMUNITY SCHOOL DISTRICT 01,"PHILLIPS, DANIELLA",Field Support Center - Manhattan,"CHU, YUET",School Support Team 3- Manhattan,,,,,"121 EAST 3 STREET\nMANHATTAN, NY 10009\n(40.72..."


In [16]:
lats = []
longs = []
descs = []
names = []
for i, k in df_poi.iterrows():
    loc = k["the_geom"].replace("POINT (","").replace(")","").split(" ")
    lats.append(float(loc[1]))
    longs.append(float(loc[0]))
    seg_ids.append(int(k["SEGMENTID"]))
    names.append(k["LOCATION_CATEGORY_DESCRIPTION"])

    
data = {"Lattitude": lats, "Longitude": longs, "Segment ID":seg_ids, "Name":names}
#print(data)
df_new = pd.DataFrame(data=data)
df_new.head(5)

0       333 EAST 4 STREET\nMANHATTAN, NY 10009\n(40.72...
1       185 1 AVENUE\nMANHATTAN, NY 10003\n(40.730009,...
2       166 ESSEX STREET\nMANHATTAN, NY 10002\n(40.721...
3       730 EAST 12 STREET\nMANHATTAN, NY 10009\n(40.7...
4       121 EAST 3 STREET\nMANHATTAN, NY 10009\n(40.72...
                              ...                        
1818    14 WEST 170 STREET\nBRONX, NY 10452\n(40.84026...
1819    3565 THIRD AVENUE\nBRONX, NY 10456\n(40.832328...
1820    1506-30 BROOK AVENUE\nBRONX, NY 10457\n(40.838...
1821    3740 BAYCHESTER AVENUE\nBRONX, NY 10466\n(40.8...
1822    950 LONGFELLOW AVENUE\nBRONX, NY 10474\n(40.82...
Name: Location 1, Length: 1823, dtype: object