# Notebook with test code

---

## Imports

### Standard library imports

In [1]:
import csv
import os
import pickle
import re
import sys
from datetime import (date, datetime)

import yaml

### Third party libraries

In [2]:
import boto3

import pandas as pd

### Local application imports

In [3]:
sys.path.append("../")

In [4]:
%load_ext autoreload
%autoreload 2

from src.etl.ingesta_almacenamiento import (
    
    ## Functions
    get_client,
    ingesta_inicial,
    ingesta_consecutiva,
    get_s3_resource,
    request_data_to_API,
    get_s3_credentials,
    
)


from src.utils.utils import (

    get_api_token

)


from src.utils.data_dict import (
    data_dict
)

from src.utils.params_ml import(
    models_dict
)

---

## AWS base examples

### S3

#### Initial configuration

##### Resource

##### Client

In [6]:
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

session = boto3.Session(
    aws_access_key_id=s3_creds['aws_access_key_id'],
    aws_secret_access_key=s3_creds['aws_secret_access_key']
)
s3 = session.client('s3')

#### Connecting to existing bucket and viewing contents

In [None]:
bucket = "data-product-architecture-equipo-9"
key ="ingestion/consecutive"

In [None]:
# list_objects_v2 omits the "Contents" key when no object matches the prefix,
# so default to an empty list instead of raising KeyError.
# NOTE: a single call returns at most 1000 keys; use a paginator for larger prefixes.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get('Contents', [])
objects

#### Reading pickle from S3

In [None]:
obj_path = [file["Key"] for file in objects if "2021-03-18" in file["Key"]][0]
obj_path

In [None]:
response = s3.get_object(
    Bucket=bucket,
    Key=obj_path
)
response

In [None]:
body = response["Body"].read()

---

In [None]:
pickle.loads(pickle.loads(body))

#### Creating bucket

In [None]:
bucket_name = "comdline-test-bucket-rob"

s3.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    ACL="private"
)

#### Uploading file to bucket

In [None]:
# Local file used for the upload demo; its base name is reused as the S3 object key.
file_to_upload = "../../admin/test_file3_for_s3.txt"
file_name = os.path.basename(file_to_upload)

In [None]:
# `s3` is a boto3 *client* here (created with session.client('s3')), and a client
# exposes upload_file directly; `.meta.client` only exists on a boto3 resource.
s3.upload_file(file_to_upload, bucket_name, file_name)

In [None]:
# Client-side listing: the `s3.Bucket(...)` accessor belongs to the resource API,
# while `s3` is a client. "Contents" is absent for an empty bucket, hence the default.
s3.list_objects_v2(Bucket=bucket_name).get("Contents", [])

#### Downloading files from bucket

In [None]:
# Download into the notebook's working directory rather than a user-specific
# absolute path, so the cell runs on any machine.
path_to_download = os.path.join(os.getcwd(), file_name)

# `s3` is a boto3 client, which exposes download_file directly
# (`.meta.client` only exists on a boto3 resource).
s3.download_file(bucket_name, file_name, path_to_download)

#### Deleting bucket files

In [None]:
# Client equivalent of the resource call `s3.Object(bucket, key).delete()`;
# `s3` is a client in this notebook, so the resource accessor is not available.
s3.delete_object(Bucket=bucket_name, Key=file_name)

---

#### Deleting bucket

## Downloading and storing Chicago data

#### Interacting with API

In [74]:
dataset_id = "4ijn-s7e5"

In [75]:
token = get_api_token("../conf/local/credentials.yaml")
# The token is deliberately not echoed: cell outputs are saved with the notebook,
# so displaying it would leak the credential to anyone who can read the file.
# NOTE(review): a token value was previously stored in this cell's output; rotate it.

In [76]:
client = get_client(token)
client

<sodapy.socrata.Socrata at 0x1713eb7c0>

In [77]:
most_rec_date = '2021-04-10'

In [78]:
soql_query = "inspection_date >= '{}'".format(most_rec_date)
soql_query

"inspection_date >= '2021-04-10'"

In [79]:
x = client.get(dataset_id, 
               limit=10,
               where=soql_query
              )

In [80]:
x

[{'inspection_id': '2498101',
  'dba_name': 'LEXINGTON BETTY SMOKEHOUSE',
  'aka_name': 'LEXINGTON BETTY SMOKEHOUSE',
  'license_': '2717939',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '756 E 111TH ST ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60628',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Non-Inspection',
  'results': 'No Entry',
  'latitude': '41.69297593328423',
  'longitude': '-87.60266451724692',
  'location': {'latitude': '-87.60266451724692',
   'longitude': '41.69297593328423'}},
 {'inspection_id': '2498054',
  'dba_name': 'KINGS  GYROS #2',
  'aka_name': 'KINGS  GYROS #2',
  'license_': '1985223',
  'facility_type': 'Restaurant',
  'risk': 'Risk 1 (High)',
  'address': '5233 N MILWAUKEE AVE ',
  'city': 'CHICAGO',
  'state': 'IL',
  'zip': '60630',
  'inspection_date': '2021-04-12T00:00:00.000',
  'inspection_type': 'Canvass',
  'results': 'Out of Business',
  'latitude': '41.97547236947101',
  'longitude': 

In [81]:
len(x)

10

In [82]:
xx = pd.DataFrame(x)
xx

Unnamed: 0,inspection_id,dba_name,aka_name,license_,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,latitude,longitude,location,violations
0,2498101,LEXINGTON BETTY SMOKEHOUSE,LEXINGTON BETTY SMOKEHOUSE,2717939,Restaurant,Risk 1 (High),756 E 111TH ST,CHICAGO,IL,60628,2021-04-12T00:00:00.000,Non-Inspection,No Entry,41.69297593328423,-87.60266451724692,"{'latitude': '-87.60266451724692', 'longitude'...",
1,2498054,KINGS GYROS #2,KINGS GYROS #2,1985223,Restaurant,Risk 1 (High),5233 N MILWAUKEE AVE,CHICAGO,IL,60630,2021-04-12T00:00:00.000,Canvass,Out of Business,41.97547236947101,-87.76738848299047,"{'latitude': '-87.76738848299047', 'longitude'...",
2,2498080,RUBYS SOULFOOD EXPRESS,RUBYS SOULFOOD EXPRESS,2723468,Restaurant,Risk 1 (High),11028 S HALSTED ST,CHICAGO,IL,60628,2021-04-12T00:00:00.000,Canvass,Fail,41.6932926013791,-87.64255435412615,"{'latitude': '-87.64255435412615', 'longitude'...","1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW..."
3,2498091,"Colemon, Johnnie","Colemon, Johnnie",26751,School,Risk 1 (High),1441 W 119th St (11900S),CHICAGO,IL,60643,2021-04-12T00:00:00.000,Canvass,Pass,41.6774778561333,-87.6581201090055,"{'latitude': '-87.6581201090055', 'longitude':...","53. TOILET FACILITIES: PROPERLY CONSTRUCTED, S..."
4,2498093,CHAPPELL SCHOOL,CHAPPELL SCHOOL,22681,School,Risk 1 (High),2135 W Foster AVE,CHICAGO,IL,60625,2021-04-12T00:00:00.000,Canvass Re-Inspection,Fail,41.97586700298142,-87.68325437820378,"{'latitude': '-87.68325437820378', 'longitude'...",10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...
5,2498133,HONEYBAKED HAM AND CAFE,HONEYBAKED HAM AND CAFE,2060329,Restaurant,Risk 1 (High),2815 N ASHLAND AVE,CHICAGO,IL,60657,2021-04-12T00:00:00.000,Canvass,Pass,41.93289126306049,-87.66826723143315,"{'latitude': '-87.66826723143315', 'longitude'...",41. WIPING CLOTHS: PROPERLY USED & STORED - Co...
6,2498113,HOT DOG EXPRESS,HOT DOG EXPRESS (T2 E5),1909525,Restaurant,Risk 2 (Medium),11601 W TOUHY AVE,CHICAGO,IL,60666,2021-04-12T00:00:00.000,Canvass,Pass,42.008536400868735,-87.91442843927047,"{'latitude': '-87.91442843927047', 'longitude'...","55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ..."
7,2498083,SUBWAY SANDWICH,SUBWAY SANDWICH,55059,Restaurant,Risk 1 (High),1958 W PETERSON AVE,CHICAGO,IL,60660,2021-04-12T00:00:00.000,Canvass,Pass,41.99079241594284,-87.67936519400483,"{'latitude': '-87.67936519400483', 'longitude'...",49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...
8,2498086,ELSIETECATERING,ELSIETECATERING,2663526,Catering,Risk 1 (High),3817 S KEDZIE AVE,CHICAGO,IL,60632,2021-04-12T00:00:00.000,Canvass,Pass,41.82381861968802,-87.70428542093514,"{'latitude': '-87.70428542093514', 'longitude'...",47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...
9,2498072,FOX'S BEVERLY PUB,FOX'S BEVERLY PUB,149,Restaurant,Risk 1 (High),9956 S WESTERN AVE,CHICAGO,IL,60643,2021-04-12T00:00:00.000,Canvass,Pass w/ Conditions,41.71201213322909,-87.68193540843004,"{'latitude': '-87.68193540843004', 'longitude'...",5. PROCEDURES FOR RESPONDING TO VOMITING AND D...


In [24]:
str(xx.shape)

'(10, 17)'

##### Using "dumps"

In [None]:
xx = pickle.dumps(x)

In [None]:
xx

In [None]:
pickle.loads(xx)

##### Using "dump"

In [None]:
# Build the pickle path with the standard library instead of the `!pwd` shell
# escape, which only works under IPython with a POSIX shell.
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")

In [None]:
pkl_store_loc

In [None]:
# The context manager guarantees the file is flushed and closed after writing.
with open(pkl_store_loc, 'wb') as fh:
    pickle.dump(x, fh)

In [None]:
# Close the handle as soon as the object is loaded.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open(pkl_store_loc, "rb") as fh:
    unpickled = pickle.load(fh)
unpickled

##### Storing data in S3

In [None]:
# Build the pickle path with the standard library instead of the `!pwd` shell
# escape, which only works under IPython with a POSIX shell.
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")
pkl_store_loc

In [None]:
## Storing file locally (the context manager closes the handle once the dump finishes)
with open(pkl_store_loc, "wb") as fh:
    pickle.dump(x, fh)

In [None]:
## Loading file from local and saving as variable
## (re-serialized to bytes so it can be passed as the Body of an S3 put_object call)
with open(pkl_store_loc, "rb") as fh:
    xx = pickle.dumps(pickle.load(fh))

In [None]:
## Saving pickle in s3
s3.put_object(
    Bucket=bucket_name,
    Key="test_pickle.pkl",
    Body=xx
)

##### Downloading data from s3 and unpickling

In [None]:
res_xx = s3.get_object(
    Bucket=bucket_name,
    Key="test_pickle.pkl"
)
res_xx

In [None]:
body = res_xx["Body"].read()

In [None]:
pickle.loads(body)

#### Uploading info

#### Checking aws s3 contents

In [5]:
# NOTE(review): the saved output of this cell shows the call raising FileNotFoundError,
# and the following cells call client methods (list_objects_v2, get_object) on `s3`.
# Confirm whether a client built with session.client('s3') is what is intended here.
s3 = get_s3_resource()

FileNotFoundError: Couldnt load the file

In [None]:
cont_ingest_path

In [None]:
# list_objects_v2 omits the "Contents" key when no object matches the prefix,
# so default to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=cont_ingest_path).get('Contents', [])
objects

In [None]:
# Append three hand-made listing entries that share the same metadata and differ
# only in the date embedded in the key.
for fake_day in ("2021-02-22", "2021-02-15", "2021-02-10"):
    objects.append(
        {'Key': 'ingestion/consecutive/consecutive-inspections-' + fake_day + '.pkl',
         'LastModified': "datetime.datetime(2021, 2, 22, 2, 53, 54, tzinfo=tzutc())",
         'ETag': '"79cff7864a646f1dfd4d51b4e732a226"',
         'Size': 1152660,
         'StorageClass': 'STANDARD'}
    )

In [None]:
# Escape the prefix and the dot so both match literally ("." alone matches any character).
rex = re.escape(str(cont_dat_prefix)) + r"(.*)\.pkl"

# Keep only keys that match the pattern; calling .group(1) on a failed search
# (None) would raise AttributeError.
date_matches = [re.search(rex, obj["Key"]) for obj in objects]
lx = [datetime.strptime(m.group(1), '%Y-%m-%d') for m in date_matches if m is not None]

# max() raises ValueError when no key matched the pattern.
most_rec_date = datetime.strftime(max(lx), '%Y-%m-%d')
most_rec_date

In [None]:
s3x = pickle.loads(s3.get_object(Bucket=bucket_name, Key=(cont_ingest_path + "consecutive-inspections-2021-02-21.pkl"))['Body'].read())

In [None]:
dfx = pd.DataFrame(s3x)
dfx

In [None]:
dfx["inspection_date"] = pd.to_datetime(dfx["inspection_date"])

In [None]:
# Series.max()/min() are vectorized and skip NaT; the Python builtins compare
# element by element and give order-dependent results when NaT is present.
print("max date: ", dfx["inspection_date"].max())
print("min date: ", dfx["inspection_date"].min())

In [None]:
dfx.groupby(pd.Grouper(key="inspection_date")).count()[["inspection_id"]]

---

## Creating local directories for temporary data

In [None]:
base_path = "../src/pipeline/luigi/ingestion_tmp/"

### Find most recent ingestion

#### Case 1: There are previous consecutive downloads

In [None]:
lyrs = [ydir[-4:] for ydir in os.listdir(base_path + "consecutive") if "YEAR=" in ydir]
lyrs

In [None]:
mr_yr = max(lyrs)
mr_yr

In [None]:
lmths = [mdir[-2:] for mdir in os.listdir(base_path + "consecutive" + "/" + "YEAR=" + mr_yr) if "MONTH=" in mdir]
lmths

In [None]:
mr_mth = max(lmths)
mr_mth

In [None]:
lings = [ing for ing in os.listdir(base_path + "consecutive" + "/" + "YEAR=" + mr_yr + "/" + "MONTH=" + mr_mth)]
lings

In [None]:
# Raw string with an escaped dot so that only a literal ".pkl" extension matches.
regex = "consecutive_inspections_" + r"(.*)\.pkl"

In [None]:
# Skip entries that do not match the pattern (e.g. '.DS_Store'); calling
# .group(1) on a failed search would raise AttributeError.
# NOTE(review): this yields the earliest date, while the section is about the
# most recent ingestion — confirm that min is intended.
min(m.group(1) for m in (re.search(regex, ing) for ing in lings) if m)

In [None]:
lx = ['.DS_Store', 'consecutive_inspections_2021-03-05.pkl', 'consecutive_inspections_2021-03-10.pkl']
lx

In [None]:
cont_dat_prefix = "consecutive_inspections_"

In [None]:
# Escape the prefix and the dot so both match literally ("." alone matches any character).
regex = re.escape(cont_dat_prefix) + r"(.*)\.pkl"

In [None]:
lx

In [None]:
lings

In [None]:
most_recent_ing = max([re.search(regex, ing).group(1) for ing in lx if ".pkl" in ing])
most_recent_ing

#### Additional notes

In [None]:
x = date.today().strftime('%Y-%m-%d')

In [None]:
x[5:7]

---

## Creating .csv files with metadata

### Working with previous metadata pickle

#### Transformation pickle

In [None]:
pkl_meta = "../results/metadata/transform_metadata.pkl"

In [None]:
# Close the handle once the metadata is loaded.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open(pkl_meta, "rb") as fh:
    dfx = pickle.load(fh)
dfx

In [None]:
mx = [dfx.index[0]]
mx

In [None]:
# extend() appends the first-row values in place without building a throwaway list of Nones.
# NOTE: re-running this cell appends the values again; re-run the cell that creates `mx` first.
mx.extend(dfx.iloc[0, :])

In [None]:
mx

In [None]:
## Overwriting csv file from a dataframe (no header)
def write_csv_from_df(df, filepath, filename):
    """
    Overwrite a CSV file with a single row built from the first row of `df`.

    The row written is the first index label followed by every value of the
    first DataFrame row. No header is written and any other rows are ignored.

    :param df: DataFrame with at least one row
    :param filepath: directory prefix; it is concatenated as-is with `filename`,
        so it must already end with a path separator
    :param filename: name of the CSV file to create or overwrite
    :raises IndexError: if `df` has no rows
    """

    ## Extracting df contents as list: index label first, then the row values
    mdata_list = [df.index[0], *df.iloc[0, :]]

    ## Creating and writing csv file with extracted list
    ## (newline="" lets the csv module control line endings and avoids blank rows on Windows)
    with open(filepath + filename, mode="w", newline="") as metadata_file:
        metadata_writer = csv.writer(metadata_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        metadata_writer.writerow(mdata_list)

In [None]:
filepath = "metadata_test_dir/"

In [None]:
filename = "transformation_metadata_2.csv"

In [None]:
write_csv_from_df(dfx, filepath, filename)

In [None]:
dfx

#### Model selection pickle

In [6]:
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

session = boto3.Session(
    aws_access_key_id=s3_creds['aws_access_key_id'],
    aws_secret_access_key=s3_creds['aws_secret_access_key']
)
s3 = session.client('s3')

In [12]:
bucket = "data-product-architecture-equipo-9"
key ="model_selection"

In [13]:
# list_objects_v2 omits the "Contents" key when no object matches the prefix,
# so default to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get('Contents', [])
objects

[{'Key': 'model_selection/trained_model_2021-04-25.pkl',
  'LastModified': datetime.datetime(2021, 4, 25, 19, 4, 52, tzinfo=tzutc()),
  'ETag': '"67661198a0290f2b58f8c536a6aa1dca"',
  'Size': 18758,
  'StorageClass': 'STANDARD'}]

In [14]:
obj_path = [file["Key"] for file in objects if "trained" in file["Key"]][0]
obj_path

'model_selection/trained_model_2021-04-25.pkl'

In [15]:
response = s3.get_object(
    Bucket=bucket,
    Key=obj_path
)
response

{'ResponseMetadata': {'RequestId': '3NV1TEHC56S24YVD',
  'HostId': 'mv3pOiGYUqrko8pc6slazP3OlotVaB6QUrbu59nHG02JuV1SGI1Qjh5oqKPETWwpahJcfLRnJgc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'mv3pOiGYUqrko8pc6slazP3OlotVaB6QUrbu59nHG02JuV1SGI1Qjh5oqKPETWwpahJcfLRnJgc=',
   'x-amz-request-id': '3NV1TEHC56S24YVD',
   'date': 'Sun, 25 Apr 2021 19:18:26 GMT',
   'last-modified': 'Sun, 25 Apr 2021 19:04:52 GMT',
   'etag': '"67661198a0290f2b58f8c536a6aa1dca"',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'content-length': '18758',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2021, 4, 25, 19, 4, 52, tzinfo=tzutc()),
 'ContentLength': 18758,
 'ETag': '"67661198a0290f2b58f8c536a6aa1dca"',
 'ContentType': 'binary/octet-stream',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody at 0x16f18de80>}

In [16]:
body = response["Body"].read()

In [18]:
model = pickle.loads(body)
model

{'best_trained_model': DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=2222),
 'model_test_predict_labels': array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
        1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
        1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1

In [21]:
str(model["best_trained_model"])

'DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=2222)'

In [24]:
model["best_trained_model"]

TypeError: score() missing 2 required positional arguments: 'X' and 'y'

### Creating dummy metadata

#### Line 1

In [None]:
meta_1 = str(datetime.now())

In [None]:
meta_2 = str(10)

In [None]:
meta_3 = "carnegie"

In [None]:
meta_comp = ",".join([meta_1, meta_2, meta_3])
meta_comp

In [None]:
import csv
# newline="" lets the csv module control line endings (avoids blank rows on Windows).
with open("transformation_metadata.csv", mode="w", newline="") as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    metadata_writer.writerow([meta_1, meta_2, meta_3])

#### Creating string based on dictionary keys for metadata

In [10]:
" | ".join([mdl for mdl in models_dict])

'random_forest | decision_tree'

## Saving dictionary of dataframes as pickle

### Creating dictionary of dataframes

In [9]:
# First toy frame: the integers 1..10 plus one letter per row.
# Built in one step so `dfx1` is never bound to a plain dict.
dfx1 = pd.DataFrame(
    {
        "col1_df1": list(range(1, 11)),
        "col2_df1": list("htkdlekdjc"),
    }
)
dfx1

Unnamed: 0,col1_df1,col2_df1
0,1,h
1,2,t
2,3,k
3,4,d
4,5,l
5,6,e
6,7,k
7,8,d
8,9,j
9,10,c


In [10]:
# Second toy frame: the integers 11..20 plus one letter per row.
# Built in one step so `dfx2` is never bound to a plain dict.
dfx2 = pd.DataFrame(
    {
        "col1_df2": list(range(11, 21)),
        "col2_df2": list("jfldurytgc"),
    }
)
dfx2

Unnamed: 0,col1_df2,col2_df2
0,11,j
1,12,f
2,13,l
3,14,d
4,15,u
5,16,r
6,17,y
7,18,t
8,19,g
9,20,c


In [11]:
df_dict = {
    "dfx1": dfx1,
    "dfx2": dfx2
}
df_dict

{'dfx1':    col1_df1 col2_df1
 0         1        h
 1         2        t
 2         3        k
 3         4        d
 4         5        l
 5         6        e
 6         7        k
 7         8        d
 8         9        j
 9        10        c,
 'dfx2':    col1_df2 col2_df2
 0        11        j
 1        12        f
 2        13        l
 3        14        d
 4        15        u
 5        16        r
 6        17        y
 7        18        t
 8        19        g
 9        20        c}

In [13]:
df_dict['dfx2']

Unnamed: 0,col1_df2,col2_df2
0,11,j
1,12,f
2,13,l
3,14,d
4,15,u
5,16,r
6,17,y
7,18,t
8,19,g
9,20,c


### Storing dictionary as pickle variable and unpickling

In [16]:
df_dict_pkl = pickle.dumps(df_dict)

In [19]:
df_dict_x = pickle.loads(df_dict_pkl)

In [21]:
df_dict_x['dfx2']

Unnamed: 0,col1_df2,col2_df2
0,11,j
1,12,f
2,13,l
3,14,d
4,15,u
5,16,r
6,17,y
7,18,t
8,19,g
9,20,c


### Storing dictionary as pickle on local disk and unpickling

In [24]:
pkl_file = "../data/pickles/test_df_dict_pkl.pkl"

In [28]:
# The context manager guarantees the file is flushed and closed after writing.
with open(pkl_file, "wb") as fh:
    pickle.dump(df_dict, fh)

In [33]:
# Close the handle as soon as the dictionary of DataFrames is loaded.
with open(pkl_file, "rb") as fh:
    df_dict_x = pickle.load(fh)

In [35]:
df_dict_x['dfx2']

Unnamed: 0,col1_df2,col2_df2
0,11,j
1,12,f
2,13,l
3,14,d
4,15,u
5,16,r
6,17,y
7,18,t
8,19,g
9,20,c


---

## *Notes*

---
---