# Notebook with test code

---

## Imports

### Standard library imports

In [1]:
import yaml

import sys

from datetime import (date, datetime)

import os

import pickle

import re

### Third-party libraries

In [2]:
import boto3

import pandas as pd

### Local application imports

In [3]:
# Make the parent directory importable so the `src.*` imports below can resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [4]:
%load_ext autoreload
%autoreload 2

from src.etl.ingesta_almacenamiento import (
    
    ## Functions
    get_client,
    ingesta_inicial,
    ingesta_consecutiva,
    get_s3_resource,
    guardar_ingesta,
    get_s3_credentials,
    
)

from src.utils.general import (
    get_api_token
)

ModuleNotFoundError: No module named 'src.utils.general'

---

## AWS base examples

### S3

#### Initial configuration

In [None]:
# Open a boto3 session using the named local AWS profile "robper_dpa".
ses = boto3.session.Session(profile_name="robper_dpa")

In [None]:
# Create a resource-style S3 handle from the profile-based session.
s3 = ses.resource("s3")

In [None]:
# List the name of every bucket the handle can enumerate.
[buck.name for buck in s3.buckets.all()]

In [8]:
# Load the AWS key pair from the local credentials file via the project helper.
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

# Build a session from the explicit key pair (no named profile is used here).
session = boto3.Session(
    aws_access_key_id=s3_creds['aws_access_key_id'],
    aws_secret_access_key=s3_creds['aws_secret_access_key']
)
# NOTE(review): this rebinds `s3` from the resource created earlier to a client.
# Later cells use resource-style calls (`s3.Bucket(...)`, `s3.Object(...)`,
# `s3.meta.client`) — confirm which handle each cell expects.
s3 = session.client('s3')

#### Connecting to existing bucket and viewing contents

In [11]:
# Target bucket and the key prefix used to list the initial-ingestion objects.
bucket, key = "data-product-architecture-equipo-9", "ingestion/initial"

In [12]:
# List the objects stored under the prefix.
# The response has no 'Contents' entry when nothing matches the prefix, so fall
# back to an empty list instead of raising KeyError.
# NOTE(review): a single call returns at most one page of keys; use a paginator
# if the prefix can hold more.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get('Contents', [])
objects

[{'Key': 'ingestion/initial/YEAR=2021/MONTH=03/ingesta.pkl',
  'LastModified': datetime.datetime(2021, 3, 15, 23, 37, 15, tzinfo=tzutc()),
  'ETag': '"d72a24388342ada956ed95095bbee96e"',
  'Size': 293826470,
  'StorageClass': 'STANDARD'},
 {'Key': 'ingestion/initial/historic-inspections-2021-02-19.pkl',
  'LastModified': datetime.datetime(2021, 2, 20, 1, 53, 32, tzinfo=tzutc()),
  'ETag': '"3414a3b19529ec3e91de6838c3c9585f-31"',
  'Size': 257245095,
  'StorageClass': 'STANDARD'}]

#### Creating bucket

In [None]:
# Name of the scratch bucket used by the create/upload/download/delete cells.
bucket_name = "comdline-test-bucket-rob"

# Create the bucket with a private ACL in the us-west-2 region.
s3.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    ACL="private"
)

#### Uploading file to bucket

In [None]:
# Local file to push to S3; the object key is the text after the last "/".
file_to_upload = "../../admin/test_file3_for_s3.txt"
file_name = file_to_upload.rpartition("/")[-1]

In [None]:
# Alternative call form, kept commented out for reference:
# s3.upload_file(file_to_upload, bucket_name, file_name)

# Upload the local file to the bucket, using the bare file name as the object key.
# This goes through the low-level client exposed on the handle's `meta`.
s3.meta.client.upload_file(file_to_upload, bucket_name, file_name)

In [None]:
# Materialize every object summary in the bucket to confirm the upload landed.
[obj for obj in s3.Bucket(bucket_name).objects.all()]

#### Downloading files from bucket

In [None]:
# NOTE(review): absolute, machine-specific destination — this cell only works on
# a machine with this exact directory; consider a path relative to the repo.
path_to_download = "/Users/rp_mbp/Documents/ReposRob_RobPer/ITAMmcd/semestre_2/Arquitectura_Prod_Dat/Venv_ArqPD/repos/dpa_2021/_robdir/" + file_name

# Download the object keyed by `file_name` from the bucket to that local path.
s3.meta.client.download_file(bucket_name, file_name, path_to_download)

#### Deleting bucket files

In [None]:
# Delete the object keyed by `file_name` from the bucket (resource-style call).
s3.Object(bucket_name, file_name).delete()

#### Deleting bucket

---

## Downloading and storing Chicago data

#### Interacting with API

In [None]:
# Identifier of the dataset passed to `client.get(...)` further down.
dataset_id = "4ijn-s7e5"

In [None]:
# Read the API token from the local credentials file via the project helper.
# The token is deliberately not echoed as cell output: a displayed secret is
# stored in the saved notebook and leaks when the file is shared or committed.
token = get_api_token("../conf/local/credentials.yaml")

In [None]:
# Build the API client from the token using the project helper, then display it.
client = get_client(token)
client

In [None]:
# Lower bound (YYYY-MM-DD) used by the SoQL filter built in the next cell.
most_rec_date = '2021-01-15'

In [None]:
# SoQL `where` clause: keep inspections dated on or after `most_rec_date`.
soql_query = f"inspection_date >= '{most_rec_date}'"
soql_query

In [None]:
# Fetch a small sample (10 records) that satisfies the date filter.
x = client.get(dataset_id, where=soql_query, limit=10)

In [None]:
# Display the API response fetched in the previous cell.
x

In [None]:
# Number of items returned (the request asked for at most 10).
len(x)

#### Uploading info

#### Checking AWS S3 contents

In [None]:
# Rebind `s3` to whatever the project helper returns.
# NOTE(review): the next cells call `list_objects_v2` / `get_object` on it, which
# are client-style methods — confirm the helper's return type.
s3 = get_s3_resource()

FileNotFoundError: Couldnt load the file

In [None]:
# NOTE(review): `cont_ingest_path` is never assigned in the visible notebook; it
# presumably came from a removed cell or import — define it before this cell.
cont_ingest_path

In [None]:
# List the objects stored under the consecutive-ingestion prefix.
# The response has no 'Contents' entry when nothing matches the prefix, so fall
# back to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=cont_ingest_path).get('Contents', [])
objects

In [None]:
# Add three hand-written listing entries (they differ only in 'Key') so the
# date-extraction cell below has several candidates to choose from.
# 'LastModified' is kept as a plain string, exactly as written originally.
objects.extend(
    {
        'Key': fake_key,
        'LastModified': "datetime.datetime(2021, 2, 22, 2, 53, 54, tzinfo=tzutc())",
        'ETag': '"79cff7864a646f1dfd4d51b4e732a226"',
        'Size': 1152660,
        'StorageClass': 'STANDARD',
    }
    for fake_key in (
        'ingestion/consecutive/consecutive-inspections-2021-02-22.pkl',
        'ingestion/consecutive/consecutive-inspections-2021-02-15.pkl',
        'ingestion/consecutive/consecutive-inspections-2021-02-10.pkl',
    )
)

In [None]:
# Pattern "<prefix><date>.pkl": the prefix is escaped and the extension is
# anchored so that "." and any other metacharacter are matched literally.
rex = re.escape(str(cont_dat_prefix)) + r"(.*)\.pkl$"

# Parse the date embedded in every key that follows the pattern; keys that do
# not match are skipped instead of failing on `None.group(1)`.
lx = [
    datetime.strptime(found.group(1), '%Y-%m-%d')
    for found in (re.search(rex, obj["Key"]) for obj in objects)
    if found is not None
]
if not lx:
    raise ValueError("No object key matches the pattern " + repr(rex))

# Most recent ingestion date, formatted back to YYYY-MM-DD.
most_rec_date = max(lx).strftime('%Y-%m-%d')
most_rec_date

In [None]:
# Fetch one consecutive-ingestion pickle from S3 and deserialize it in memory.
# NOTE(review): `pickle.loads` can execute arbitrary code embedded in the
# payload — only use it on objects written by a trusted pipeline.
s3x = pickle.loads(s3.get_object(Bucket=bucket_name, Key=(cont_ingest_path + "consecutive-inspections-2021-02-21.pkl"))['Body'].read())

In [None]:
# Build a DataFrame from the unpickled payload and display it.
dfx = pd.DataFrame(s3x)
dfx

In [None]:
# Convert the `inspection_date` column to datetimes (overwrites the column in place).
dfx["inspection_date"] = pd.to_datetime(dfx["inspection_date"])

In [None]:
# Date range covered by the file.
# `Series.max()` / `Series.min()` skip missing dates; the Python builtins compare
# element by element, and comparisons against NaT are always False, which can
# produce an order-dependent answer.
print("max date: ", dfx["inspection_date"].max())
print("min date: ", dfx["inspection_date"].min())

In [None]:
# Count non-null `inspection_id` values for each distinct `inspection_date`.
dfx.groupby(pd.Grouper(key="inspection_date")).count()[["inspection_id"]]

---

## Creating local directories for temporary data

In [None]:
# Root of the local temporary ingestion tree; the trailing "/" matters because
# later cells build sub-paths by plain string concatenation.
base_path = "../src/pipeline/luigi/ingestion_tmp/"

### Find most recent ingestion

#### Case 1: There are previous consecutive downloads

In [None]:
# For every entry under `consecutive/` whose name contains "YEAR=", keep its
# last four characters (the year part of a "YEAR=YYYY" directory name).
lyrs = [ydir[-4:] for ydir in os.listdir(base_path + "consecutive") if "YEAR=" in ydir]
lyrs

In [None]:
# Most recent year. The values are strings, so `max` compares lexicographically;
# that matches numeric order here because every slice has four characters.
mr_yr = max(lyrs)
mr_yr

In [None]:
# Inside the most recent year, keep the last two characters of every entry whose
# name contains "MONTH=" (the month part of a "MONTH=MM" directory name).
lmths = [mdir[-2:] for mdir in os.listdir(base_path + "consecutive" + "/" + "YEAR=" + mr_yr) if "MONTH=" in mdir]
lmths

In [None]:
# Most recent month (string comparison over two-character slices).
mr_mth = max(lmths)
mr_mth

In [None]:
# Every entry in the most recent YEAR=/MONTH= directory; `os.listdir` already
# returns a fresh list, so no comprehension is needed.
lings = os.listdir(base_path + "consecutive/YEAR=" + mr_yr + "/MONTH=" + mr_mth)
lings

In [None]:
# Capture the text between the file-name prefix and the ".pkl" extension.
# The dot is escaped so that only a literal "." before "pkl" matches.
regex = "consecutive_inspections_" + r"(.*)\.pkl"

In [None]:
# Smallest date string among the ingestion files in the directory.
# Entries that do not match the pattern (e.g. ".DS_Store") are skipped; without
# the filter `re.search` returns None for them and `.group(1)` raises.
# NOTE(review): the section is titled "most recent ingestion" but this takes the
# minimum — confirm whether `max` was intended.
min(
    found.group(1)
    for found in (re.search(regex, ing) for ing in lings)
    if found is not None
)

In [None]:
# Hand-written sample directory listing, including a non-data entry (".DS_Store").
# NOTE(review): `lx` was used earlier for a list of datetimes; this rebinds it to
# a list of file names.
lx = ['.DS_Store', 'consecutive_inspections_2021-03-05.pkl', 'consecutive_inspections_2021-03-10.pkl']
lx

In [None]:
# File-name prefix that precedes the date in consecutive-ingestion pickles.
# NOTE(review): an earlier cell already reads `cont_dat_prefix`; on a fresh
# kernel this assignment has to run first.
cont_dat_prefix = "consecutive_inspections_"

In [None]:
# Capture the text between the prefix and the ".pkl" extension. The prefix is
# escaped and the dot is matched literally so no character acts as a wildcard.
regex = re.escape(cont_dat_prefix) + r"(.*)\.pkl"

In [None]:
# Load and display the pickled ingestion metadata.
pd.read_pickle("../results/metadata/ingesta_metadata.pkl")

['.DS_Store',
 'consecutive_inspections_2021-03-05.pkl',
 'consecutive_inspections_2021-03-10.pkl']

#### Transformation

In [92]:
# Load and display the pickled transformation metadata.
pd.read_pickle("../results/metadata/transform_metadata.pkl")

['consecutive_inspections_2021-03-05.pkl',
 'consecutive_inspections_2021-03-10.pkl',
 'consecutive_inspections_2021-03-16.pkl',
 'consecutive_inspections_2021-02-11.pkl',
 'consecutive_inspections_2021-02-04.pkl',
 'consecutive_inspections_2020-12-25.pkl']

#### Feature engineering

In [93]:
# Load and display the pickled feature-engineering metadata.
pd.read_pickle("../results/metadata/fe_metadata.pkl")

'2021-03-10'

## Notes

In [5]:
# Scratch dict filled by the following cells.
# NOTE(review): `x` held the API response earlier in the notebook; this rebinds it.
x = {}

In [6]:
# Example value stored under the "cols_elim" key in the next cell.
cols_elim = 23

In [7]:
# Store the value in the scratch dict under "cols_elim".
x["cols_elim"] = cols_elim

In [8]:
# Example value stored under the "cols_live" key in the next cell.
cols_live = 10

In [9]:
# Store the value in the scratch dict under "cols_live".
x["cols_live"] = cols_live

In [10]:
# Record the current local time, stringified, under "exec_time".
x["exec_time"] = str(datetime.now())

In [11]:
# `x` is a dict, so slicing it directly raises TypeError. Slice the stored
# timestamp string instead: characters 5-6 of "YYYY-MM-DD HH:MM:SS..." are the
# two-digit month.
x["exec_time"][5:7]

'03'

---
---