# Notebook with test code

---

## Imports

### Standard library imports

In [1]:
import yaml

import sys

from datetime import (date, datetime)

import os

import pickle

import re

### Third party libraries

In [2]:
import boto3

import pandas as pd

### Local application imports

In [3]:
# Put the parent directory on the import path so the `src.*` imports below resolve.
# Assumes the kernel's working directory sits one level below the repository root.
sys.path.append("../")

In [4]:
%load_ext autoreload
%autoreload 2

from src.etl.ingesta_almacenamiento import (
    
    ## Functions
    get_client,
    ingesta_inicial,
    ingesta_consecutiva,
    get_s3_resource,
    guardar_ingesta,
    get_s3_credentials,
    
)


from src.utils.utils import (

    get_api_token

)

---

## AWS base examples

### S3

#### Initial configuration

##### Resource

##### Client

In [6]:
# Load the AWS key pair from the local credentials file and open a session with it.
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

session = boto3.Session(
    aws_access_key_id=s3_creds['aws_access_key_id'],
    aws_secret_access_key=s3_creds['aws_secret_access_key']
)
# NOTE: `s3` is a low-level client (session.client), not a resource, so it exposes
# client methods such as list_objects_v2 / get_object / put_object.
s3 = session.client('s3')

#### Connecting to existing bucket and viewing contents

In [61]:
# Bucket and key prefix inspected in the listing cells that follow.
bucket, key = "data-product-architecture-equipo-9", "ingestion/consecutive"

In [62]:
# list_objects_v2 omits the "Contents" key when nothing matches the prefix, so fall
# back to an empty list instead of raising KeyError.
# NOTE: a single call returns at most 1,000 keys; switch to a paginator if the prefix grows.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get("Contents", [])
objects

[{'Key': 'ingestion/consecutive/',
  'LastModified': datetime.datetime(2021, 3, 18, 15, 36, 25, tzinfo=tzutc()),
  'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
  'Size': 0,
  'StorageClass': 'STANDARD'},
 {'Key': 'ingestion/consecutive/YEAR=2021/MONTH=03/consecutive_inspections_2021-03-18.pkl',
  'LastModified': datetime.datetime(2021, 3, 19, 0, 16, 32, tzinfo=tzutc()),
  'ETag': '"410a1b7bbf74d80e1c9e669ee727f7dd"',
  'Size': 1047385,
  'StorageClass': 'STANDARD'},
 {'Key': 'ingestion/consecutive/YEAR=2021/MONTH=04/consecutive_inspections_2021-04-10.pkl',
  'LastModified': datetime.datetime(2021, 4, 11, 0, 55, 2, tzinfo=tzutc()),
  'ETag': '"006092603520a926fef44b8e29a344b5"',
  'Size': 946582,
  'StorageClass': 'STANDARD'},
 {'Key': 'ingestion/consecutive/YEAR=2021/MONTH=04/consecutive_inspections_2021-04-12.pkl',
  'LastModified': datetime.datetime(2021, 4, 12, 20, 7, 4, tzinfo=tzutc()),
  'ETag': '"7b7070289f8b712d0c47c86c26cf6652"',
  'Size': 13,
  'StorageClass': 'STANDARD'},
 {

#### Reading pickle from S3

In [63]:
# Keep the keys whose name carries the 2021-03-18 date stamp and take the first one.
matching_keys = [entry["Key"] for entry in objects if "2021-03-18" in entry["Key"]]
obj_path = matching_keys[0]
obj_path

'ingestion/consecutive/YEAR=2021/MONTH=03/consecutive_inspections_2021-03-18.pkl'

In [64]:
# Fetch the selected object; its payload is exposed through the "Body" entry of the response.
response = s3.get_object(Bucket=bucket, Key=obj_path)
response

{'ResponseMetadata': {'RequestId': 'E86A0YYMF4A0X9CT',
  'HostId': '1jaZMPS90cBuG9MyKMtf8Vc05wAofxaHM19GlIR0vrlou9F/5teRcXoPfJI3dYeWSWQybshI1mA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '1jaZMPS90cBuG9MyKMtf8Vc05wAofxaHM19GlIR0vrlou9F/5teRcXoPfJI3dYeWSWQybshI1mA=',
   'x-amz-request-id': 'E86A0YYMF4A0X9CT',
   'date': 'Sun, 18 Apr 2021 15:39:48 GMT',
   'last-modified': 'Fri, 19 Mar 2021 00:16:32 GMT',
   'etag': '"410a1b7bbf74d80e1c9e669ee727f7dd"',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '1047385',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2021, 3, 19, 0, 16, 32, tzinfo=tzutc()),
 'ContentLength': 1047385,
 'ETag': '"410a1b7bbf74d80e1c9e669ee727f7dd"',
 'ContentType': 'binary/octet-stream',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x16ce68580>}

In [65]:
# Read the streamed payload of the S3 response fully into memory.
body = response["Body"].read()

---

In [75]:
# The payload is unpickled twice: the outer loads must yield bytes that are themselves a pickle.
# NOTE(review): presumably the ingestion step pickled an already-pickled object — confirm upstream.
# SECURITY: pickle.loads executes arbitrary code; only use it on objects from a trusted bucket.
pickle.loads(pickle.loads(body))

[{'inspection_id': '2484957',
  'dba_name': 'SAWYER  (ANNEX)',
  'aka_name': 'SAWYER',
  'license_': '25231',
  'facility_type': 'School',
  'risk': 'Risk 1 (High)',
  'address': '5247 S Spaulding ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60632',
  'inspection_date': '2021-02-19T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Pass',
  'latitude': '41.79756783709009',
  'longitude': '-87.70597566195573',
  'location': {'latitude': '-87.70597566195573',
   'longitude': '41.79756783709009'}},
 {'inspection_id': '2484950',
  'dba_name': 'PHLOUR BAKERY AND CAFE',
  'aka_name': 'PHLOUR BAKERY AND CAFE',
  'license_': '2522179',
  'facility_type': 'Bakery',
  'risk': 'Risk 1 (High)',
  'address': '1138 W BRYN MAWR AVE ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60660',
  'inspection_date': '2021-02-19T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Pass',
  'violations': '40. PERSONAL CLEANLINESS - Comments: FOODHANDLERS NOT WEARING HAIR RESTRAINTS. M

#### Creating bucket

In [27]:
# Name of the scratch bucket used by the upload / download / delete cells below.
bucket_name = "comdline-test-bucket-rob"

# Create the bucket as private in us-west-2.
# NOTE(review): re-running this cell once the bucket exists presumably errors — confirm
# before relying on Restart & Run All.
s3.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    ACL="private"
)

{'ResponseMetadata': {'RequestId': 'C3D49DWEQ321XZ1W',
  'HostId': 'IScnbV3aGQc8F+esmQVlkA7KlCOaJF0nfhRbyrYsTGjfjf/6L4x4xIEwVzpeLQa21ihbVNblyW8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'IScnbV3aGQc8F+esmQVlkA7KlCOaJF0nfhRbyrYsTGjfjf/6L4x4xIEwVzpeLQa21ihbVNblyW8=',
   'x-amz-request-id': 'C3D49DWEQ321XZ1W',
   'date': 'Sun, 18 Apr 2021 15:14:12 GMT',
   'location': 'http://comdline-test-bucket-rob.s3.amazonaws.com/',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Location': 'http://comdline-test-bucket-rob.s3.amazonaws.com/'}

#### Uploading file to bucket

In [None]:
# Local file to upload; the S3 key is simply its base name.
file_to_upload = "../../admin/test_file3_for_s3.txt"
# os.path.basename is the standard-library way to take the last path component.
file_name = os.path.basename(file_to_upload)

In [None]:
# `s3` is created with session.client('s3'), and a client has no `.meta.client`
# attribute (that exists only on resources), so call upload_file on the client itself.
s3.upload_file(file_to_upload, bucket_name, file_name)

In [None]:
# `s3` is a client, so `s3.Bucket(...)` (a resource-only API) is unavailable; list the
# bucket through list_objects_v2 and tolerate an empty bucket (no "Contents" key).
s3.list_objects_v2(Bucket=bucket_name).get("Contents", [])

#### Downloading files from bucket

In [None]:
# Download into the current working directory instead of a hard-coded, user-specific
# absolute path, so the cell runs on any machine.
path_to_download = os.path.join(os.getcwd(), file_name)

# `s3` is a client, so download_file is called on it directly (no `.meta.client`).
s3.download_file(bucket_name, file_name, path_to_download)

#### Deleting bucket files

In [None]:
# `s3` is a client, so the resource-style `s3.Object(...).delete()` is unavailable;
# delete_object is the client equivalent.
s3.delete_object(Bucket=bucket_name, Key=file_name)

---

#### Deleting bucket

## Downloading and storing Chicago data

#### Interacting with API

In [48]:
dataset_id = "4ijn-s7e5"

In [49]:
# Load the API token from the local credentials file.
token = get_api_token("../conf/local/credentials.yaml")
# Confirm a token was loaded without echoing the secret into the saved notebook output.
f"token loaded ({len(token)} characters)"

'<redacted API token>'

In [50]:
# Build the API client from the token; the cell output shows it is a sodapy Socrata client.
client = get_client(token)
client

<sodapy.socrata.Socrata at 0x16ce68700>

In [51]:
most_rec_date = '2021-04-10'

In [52]:
# Filter clause selecting inspections on or after the most recent ingested date.
soql_query = f"inspection_date >= '{most_rec_date}'"
soql_query

"inspection_date >= '2021-04-10'"

In [53]:
# Request at most 10 rows of the dataset that satisfy the filter clause.
x = client.get(dataset_id, where=soql_query, limit=10)

In [54]:
x

[{'inspection_id': '2498114',
  'dba_name': 'CELTIC CROWN',
  'aka_name': 'CELTIC CROWN',
  'license_': '404',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '4301 N WESTERN AVE ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60618',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Non-Inspection',
  'results': 'No Entry',
  'latitude': '41.959539275814876',
  'longitude': '-87.68848380383307',
  'location': {'latitude': '-87.68848380383307',
   'longitude': '41.959539275814876'}},
 {'inspection_id': '2498106',
  'dba_name': 'ASIAN CUISINE EXPRESS',
  'aka_name': 'ASIAN CUISINE EXPRESS',
  'license_': '2327198',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '3823 W 31ST ST ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60623',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Pass',
  'latitude': '41.836838656288656',
  'longitude': '-87.72053342728425',
  'locat

##### Using "dumps"

In [None]:
xx = pickle.dumps(x)

In [None]:
xx

In [None]:
pickle.loads(xx)

##### Using "dump"

In [8]:
# Build the pickle path from the current working directory in pure Python; the `!pwd`
# shell magic only works where a `pwd` command exists and is not valid Python.
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")

In [10]:
pkl_store_loc

'/Users/rp_mbp/Documents/ReposRob_RobPer/ITAMmcd/semestre_2/Arquitectura_Prod_Dat/Venv_ArqPD/repos/dpa_2021/_robdir/prueba_pickle.pkl'

In [None]:
# Write the records to disk; the context manager flushes and closes the file handle,
# which the bare open() call left dangling.
with open(pkl_store_loc, "wb") as pkl_file:
    pickle.dump(x, pkl_file)

In [11]:
# Read the pickle back; the context manager closes the handle that open() used to leak.
with open(pkl_store_loc, "rb") as pkl_file:
    loaded_records = pickle.load(pkl_file)
# Display outside the `with` block so the notebook still renders the value.
loaded_records

[{'inspection_id': '2498114',
  'dba_name': 'CELTIC CROWN',
  'aka_name': 'CELTIC CROWN',
  'license_': '404',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '4301 N WESTERN AVE ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60618',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Non-Inspection',
  'results': 'No Entry',
  'latitude': '41.959539275814876',
  'longitude': '-87.68848380383307',
  'location': {'latitude': '-87.68848380383307',
   'longitude': '41.959539275814876'}},
 {'inspection_id': '2498106',
  'dba_name': 'ASIAN CUISINE EXPRESS',
  'aka_name': 'ASIAN CUISINE EXPRESS',
  'license_': '2327198',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '3823 W 31ST ST ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60623',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Pass',
  'latitude': '41.836838656288656',
  'longitude': '-87.72053342728425',
  'locat

##### Storing data in S3

In [29]:
# Build the pickle path from the current working directory in pure Python; the `!pwd`
# shell magic only works where a `pwd` command exists and is not valid Python.
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")
pkl_store_loc

'/Users/rp_mbp/Documents/ReposRob_RobPer/ITAMmcd/semestre_2/Arquitectura_Prod_Dat/Venv_ArqPD/repos/dpa_2021/_robdir/prueba_pickle.pkl'

In [55]:
## Storing file locally
# The context manager guarantees the file is flushed and closed after the dump.
with open(pkl_store_loc, "wb") as pkl_file:
    pickle.dump(x, pkl_file)

In [56]:
## Loading file from local and saving as variable
# Re-serialise the loaded object to bytes for upload; the context manager closes the
# file handle that the bare open() call leaked.
with open(pkl_store_loc, "rb") as pkl_file:
    xx = pickle.dumps(pickle.load(pkl_file))

In [57]:
## Saving pickle in s3
# Upload the pickled bytes under a fixed test key in the scratch bucket.
s3.put_object(Bucket=bucket_name, Key="test_pickle.pkl", Body=xx)

{'ResponseMetadata': {'RequestId': '9XXVB722VK42KZJN',
  'HostId': '00tEbVt/wGh7lRGtbJYFwhz+9OQaCs60kTl8AW3k6nnTK0sPX+8Zg4/vOfwWLTJc+kwHOyEDlJA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '00tEbVt/wGh7lRGtbJYFwhz+9OQaCs60kTl8AW3k6nnTK0sPX+8Zg4/vOfwWLTJc+kwHOyEDlJA=',
   'x-amz-request-id': '9XXVB722VK42KZJN',
   'date': 'Sun, 18 Apr 2021 15:38:14 GMT',
   'etag': '"c3e99b11cd9ecf2d0fbe64122c32a476"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"c3e99b11cd9ecf2d0fbe64122c32a476"'}

##### Downloading data from s3 and unpickling

In [58]:
# Fetch the test pickle that was just uploaded.
res_xx = s3.get_object(Bucket=bucket_name, Key="test_pickle.pkl")
res_xx

{'ResponseMetadata': {'RequestId': '1XRWXCT66KF3AZCT',
  'HostId': 'u59a8VUy9a2oMgX0MU6Y08J6AMWTpThiupWcxbw+DGNPS7geSnU4U5Hl3chcGsZgnBRTlQXawPQ=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'u59a8VUy9a2oMgX0MU6Y08J6AMWTpThiupWcxbw+DGNPS7geSnU4U5Hl3chcGsZgnBRTlQXawPQ=',
   'x-amz-request-id': '1XRWXCT66KF3AZCT',
   'date': 'Sun, 18 Apr 2021 15:38:29 GMT',
   'last-modified': 'Sun, 18 Apr 2021 15:38:14 GMT',
   'etag': '"c3e99b11cd9ecf2d0fbe64122c32a476"',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '8742',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2021, 4, 18, 15, 38, 14, tzinfo=tzutc()),
 'ContentLength': 8742,
 'ETag': '"c3e99b11cd9ecf2d0fbe64122c32a476"',
 'ContentType': 'binary/octet-stream',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x16ce6ce20>}

In [59]:
# Read the streamed payload fully into memory (rebinds `body` from the earlier S3 read).
body = res_xx["Body"].read()

In [60]:
# A single loads is enough here because the object was pickled once before upload.
# SECURITY: pickle.loads executes arbitrary code; only use it on objects from a trusted bucket.
pickle.loads(body)

[{'inspection_id': '2498114',
  'dba_name': 'CELTIC CROWN',
  'aka_name': 'CELTIC CROWN',
  'license_': '404',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '4301 N WESTERN AVE ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60618',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Non-Inspection',
  'results': 'No Entry',
  'latitude': '41.959539275814876',
  'longitude': '-87.68848380383307',
  'location': {'latitude': '-87.68848380383307',
   'longitude': '41.959539275814876'}},
 {'inspection_id': '2498106',
  'dba_name': 'ASIAN CUISINE EXPRESS',
  'aka_name': 'ASIAN CUISINE EXPRESS',
  'license_': '2327198',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '3823 W 31ST ST ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60623',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Pass',
  'latitude': '41.836838656288656',
  'longitude': '-87.72053342728425',
  'locat

#### Uploading info

#### Checking aws s3 contents

In [5]:
# NOTE(review): the saved output of this cell is a FileNotFoundError, and on success it
# would rebind `s3` (created earlier with session.client). The cells below call
# list_objects_v2 / get_object on `s3`, i.e. client methods — confirm what
# get_s3_resource returns before keeping this cell.
s3 = get_s3_resource()

FileNotFoundError: Couldnt load the file

In [None]:
cont_ingest_path

In [None]:
# list_objects_v2 omits the "Contents" key when nothing matches the prefix, so fall
# back to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=cont_ingest_path).get("Contents", [])
objects

In [None]:
# Add three hand-made listing entries that differ only in the date inside the key.
# "LastModified" is a plain string in these entries, not a datetime object.
objects.extend(
    {
        'Key': 'ingestion/consecutive/consecutive-inspections-' + day + '.pkl',
        'LastModified': "datetime.datetime(2021, 2, 22, 2, 53, 54, tzinfo=tzutc())",
        'ETag': '"79cff7864a646f1dfd4d51b4e732a226"',
        'Size': 1152660,
        'StorageClass': 'STANDARD',
    }
    for day in ("2021-02-22", "2021-02-15", "2021-02-10")
)

In [None]:
# Extract the date embedded in every matching key and report the newest one.
# NOTE(review): `cont_dat_prefix` is only assigned further down in this notebook, so this
# cell fails on a fresh kernel unless that assignment has run first.
# re.escape and "\.pkl$" make the prefix and the extension match literally rather than
# as regex wildcards.
rex = re.escape(str(cont_dat_prefix)) + r"(.*)\.pkl$"

# Keys that do not match the pattern are skipped instead of crashing on None.group().
key_matches = (re.search(rex, obj["Key"]) for obj in objects)
lx = [datetime.strptime(found.group(1), '%Y-%m-%d') for found in key_matches if found is not None]
most_rec_date = max(lx).strftime('%Y-%m-%d')
most_rec_date

In [None]:
# Download one specific consecutive-ingestion pickle and deserialise it.
s3x_key = cont_ingest_path + "consecutive-inspections-2021-02-21.pkl"
s3x_payload = s3.get_object(Bucket=bucket_name, Key=s3x_key)['Body'].read()
s3x = pickle.loads(s3x_payload)

In [None]:
dfx = pd.DataFrame(s3x)
dfx

In [None]:
# Parse the inspection_date column into pandas datetimes.
dfx = dfx.assign(inspection_date=pd.to_datetime(dfx["inspection_date"]))

In [None]:
# Series.max()/min() are vectorised and skip NaT, whereas the Python built-ins iterate
# element by element and give order-dependent results when NaT is present.
print("max date: ", dfx["inspection_date"].max())
print("min date: ", dfx["inspection_date"].min())

In [None]:
# Count the non-null inspection_id values for each distinct inspection_date.
dfx.groupby(pd.Grouper(key="inspection_date")).count()[["inspection_id"]]

---

## Creating local directories for temporary data

In [None]:
base_path = "../src/pipeline/luigi/ingestion_tmp/"

### Find most recent ingestion

#### Case 1: There are previous consecutive downloads

In [None]:
# Collect the 4-character year suffix of every "YEAR=..." entry under the consecutive folder.
consecutive_dir = f"{base_path}consecutive"
lyrs = [entry[-4:] for entry in os.listdir(consecutive_dir) if "YEAR=" in entry]
lyrs

In [None]:
mr_yr = max(lyrs)
mr_yr

In [None]:
# Collect the 2-character month suffix of every "MONTH=..." entry inside the latest year folder.
latest_year_dir = f"{base_path}consecutive/YEAR={mr_yr}"
lmths = [entry[-2:] for entry in os.listdir(latest_year_dir) if "MONTH=" in entry]
lmths

In [None]:
mr_mth = max(lmths)
mr_mth

In [None]:
# List every entry stored in the latest year/month folder.
latest_month_dir = f"{base_path}consecutive/YEAR={mr_yr}/MONTH={mr_mth}"
lings = list(os.listdir(latest_month_dir))
lings

In [None]:
regex = "consecutive_inspections_" + "(.*).pkl"

In [None]:
# Smallest date string among the files of the latest year/month folder.
# Entries that do not match the naming pattern (e.g. ".DS_Store") make re.search return
# None, so they are skipped rather than crashing on None.group(1).
# NOTE(review): this section is about the most recent ingestion, yet min() picks the
# earliest date — confirm whether max() was intended.
min(
    found.group(1)
    for found in (re.search(regex, ing) for ing in lings)
    if found is not None
)

In [None]:
lx = ['.DS_Store', 'consecutive_inspections_2021-03-05.pkl', 'consecutive_inspections_2021-03-10.pkl']
lx

In [None]:
cont_dat_prefix = "consecutive_inspections_"

In [None]:
regex = cont_dat_prefix + "(.*).pkl"

In [None]:
lx

In [None]:
lings

In [None]:
# Latest date string among the sample file names.
# Filtering on the regex match itself (instead of on ".pkl" in the name) also skips
# .pkl files that do not follow the naming pattern, which would otherwise raise on
# None.group(1).
most_recent_ing = max(
    found.group(1)
    for found in (re.search(regex, ing) for ing in lx)
    if found is not None
)
most_recent_ing

#### Additional notes

In [None]:
# Today's date as a YYYY-MM-DD string.
x = date.today().isoformat()

In [None]:
x[5:7]

---

## Creating .csv files with metadata

### Loading previous metadata pickle

In [22]:
pkl_meta = "../results/metadata/transform_metadata.pkl"

In [23]:
# Load the stored transformation metadata; the context manager closes the file handle
# that the bare open() call leaked.
with open(pkl_meta, "rb") as meta_file:
    previous_metadata = pickle.load(meta_file)
# Display outside the `with` block so the notebook still renders the value.
previous_metadata

Unnamed: 0_level_0,trans_count,new_cols
ing_time_exec,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-04-12 18:11:08.664824,2,serious_violations


### Creating dummy metadata

#### Line 1

In [34]:
# Current timestamp rendered as text, with a space between the date and the time.
meta_1 = datetime.now().isoformat(sep=" ")

In [35]:
meta_2 = str(2)

In [36]:
meta_3 = "serious_violations"

In [44]:
# Assemble the three metadata fields into one comma-separated line.
meta_comp = f"{meta_1},{meta_2},{meta_3}"
meta_comp

'2021-04-18 11:24:44.839666,2,serious_violations'

In [46]:
import csv

# Write the metadata fields as a single CSV row (mode "w" overwrites any existing file).
# newline="" lets the csv module control line endings; without it, platforms that
# translate "\n" on write end up with extra blank rows.
with open("transformation_metadata.csv", mode="w", newline="") as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    metadata_writer.writerow([meta_1, meta_2, meta_3])

---
---