# Notebook with test code

---

## Imports

### Standard library imports

In [1]:
import yaml

import sys

from datetime import (date, datetime)

import os

import pickle

import re

### Third party libraries

In [2]:
import boto3

import pandas as pd

### Local application imports

In [None]:
sys.path.append("../")

In [None]:
%load_ext autoreload
%autoreload 2

from src.etl.ingesta_almacenamiento import (
    
    ## Functions
    get_client,
    ingesta_inicial,
    ingesta_consecutiva,
    get_s3_resource,
    request_data_to_API,
    get_s3_credentials,
    
)


from src.utils.utils import (

    get_api_token

)


from src.utils.data_dict import (
    data_dict
)

from src.utils.params_ml import(
    models_dict
)

---

## AWS base examples

### S3

#### Initial configuration

##### Resource

##### Client

In [None]:
# Fetch the credentials mapping through the project helper, open a boto3
# session with its access key pair and derive the low-level S3 client.
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

session = boto3.Session(
    **{field: s3_creds[field] for field in ("aws_access_key_id", "aws_secret_access_key")}
)
s3 = session.client("s3")

#### Connecting to existing bucket and viewing contents

In [None]:
bucket = "data-product-architecture-equipo-9"
key ="ingestion/consecutive"

In [None]:
# The response has no "Contents" entry when nothing matches the prefix,
# so fall back to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get('Contents', [])
objects

#### Reading pickle from S3

In [None]:
# Keep the keys whose name contains the requested date and take the first one.
keys_for_date = [entry["Key"] for entry in objects if "2021-03-18" in entry["Key"]]
obj_path = keys_for_date[0]
obj_path

In [None]:
response = s3.get_object(
    Bucket=bucket,
    Key=obj_path
)
response

In [None]:
body = response["Body"].read()

---

In [None]:
# The body is unpickled twice: the first pass must yield bytes that are
# themselves a pickle — presumably the object was pickled twice when it was
# stored; TODO confirm against the ingestion code.
inner_pickle = pickle.loads(body)
pickle.loads(inner_pickle)

#### Creating bucket

In [None]:
bucket_name = "comdline-test-bucket-rob"

s3.create_bucket(
    Bucket=bucket_name,
    CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
    ACL="private"
)

#### Uploading file to bucket

In [None]:
# Local file to upload; the object key in the bucket is just its base name.
file_to_upload = "../../admin/test_file3_for_s3.txt"
file_name = os.path.basename(file_to_upload)

In [None]:
# `s3` was created with session.client('s3'); a client exposes upload_file
# directly, whereas the `.meta.client` hop only exists on a boto3 resource.
s3.upload_file(file_to_upload, bucket_name, file_name)

In [None]:
# `s3` is a client, which has no `.Bucket(...)` collection; list the bucket
# through the client API and tolerate an empty bucket (no "Contents" entry).
s3.list_objects_v2(Bucket=bucket_name).get("Contents", [])

#### Downloading files from bucket

In [None]:
# Download into the current working directory instead of a machine-specific
# absolute path, so the cell runs on any checkout.
path_to_download = os.path.join(os.getcwd(), file_name)

# `s3` is a client: download_file is called on it directly (no `.meta.client`).
s3.download_file(bucket_name, file_name, path_to_download)

#### Deleting bucket files

In [None]:
# `s3` is a client, which has no `.Object(...)` accessor; delete by bucket/key.
s3.delete_object(Bucket=bucket_name, Key=file_name)

---

#### Deleting bucket

## Downloading and storing Chicago data

#### Interacting with API

In [None]:
dataset_id = "4ijn-s7e5"

In [None]:
token = get_api_token("../conf/local/credentials.yaml")
token

In [None]:
client = get_client(token)
client

In [None]:
most_rec_date = '2021-04-10'

In [None]:
# SoQL filter: inspections dated on or after the most recent known date.
soql_query = f"inspection_date >= '{most_rec_date}'"
soql_query

In [None]:
x = client.get(dataset_id, 
               limit=10,
               where=soql_query
              )

In [None]:
x

In [None]:
len(x)

In [None]:
xx = pd.DataFrame(x)
xx

In [None]:
str(xx.shape)

##### Using "dumps"

In [None]:
xx = pickle.dumps(x)

In [None]:
xx

In [None]:
pickle.loads(xx)

##### Using "dump"

In [None]:
# Build the pickle path from the current working directory in pure Python
# (no `!pwd` shell call, so it also works where `pwd` is unavailable).
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")

In [None]:
pkl_store_loc

In [None]:
# The context manager closes (and flushes) the file once the dump finishes.
with open(pkl_store_loc, 'wb') as fh:
    pickle.dump(x, fh)

In [None]:
# Read the pickle back with a context manager so the handle is closed.
with open(pkl_store_loc, "rb") as fh:
    unpickled = pickle.load(fh)
unpickled

##### Storing data in S3

In [None]:
# Build the pickle path from the current working directory in pure Python
# (no `!pwd` shell call, so it also works where `pwd` is unavailable).
pkl_store_loc = os.path.join(os.getcwd(), "prueba_pickle.pkl")
pkl_store_loc

In [None]:
## Storing file locally (the context manager closes the file after the dump)
with open(pkl_store_loc, "wb") as fh:
    pickle.dump(x, fh)

In [None]:
## Loading file from local and saving as variable
## (re-serialised to bytes so it can be sent as an S3 object body)
with open(pkl_store_loc, "rb") as fh:
    xx = pickle.dumps(pickle.load(fh))

In [None]:
## Saving pickle in s3
s3.put_object(
    Bucket=bucket_name,
    Key="test_pickle.pkl",
    Body=xx
)

##### Downloading data from s3 and unpickling

In [None]:
res_xx = s3.get_object(
    Bucket=bucket_name,
    Key="test_pickle.pkl"
)
res_xx

In [None]:
body = res_xx["Body"].read()

In [None]:
pickle.loads(body)

#### Uploading info

#### Checking aws s3 contents

In [None]:
s3 = get_s3_resource()

In [None]:
cont_ingest_path

In [None]:
# The response has no "Contents" entry when nothing matches the prefix,
# so fall back to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=cont_ingest_path).get('Contents', [])
objects

In [None]:
# Append three synthetic listing entries that are identical except for the
# date embedded in the key, to exercise the "most recent date" logic.
for fake_day in ("2021-02-22", "2021-02-15", "2021-02-10"):
    objects.append(
        {
            'Key': f'ingestion/consecutive/consecutive-inspections-{fake_day}.pkl',
            'LastModified': "datetime.datetime(2021, 2, 22, 2, 53, 54, tzinfo=tzutc())",
            'ETag': '"79cff7864a646f1dfd4d51b4e732a226"',
            'Size': 1152660,
            'StorageClass': 'STANDARD',
        }
    )

In [None]:
# NOTE(review): `cont_dat_prefix` is only assigned in the later "Find most recent
# ingestion" section, and with underscores, while the keys handled here use
# hyphens ("consecutive-inspections-") — confirm the intended prefix and define
# it before this cell so the notebook survives Restart & Run All.

# Escape the prefix and the literal dot so neither is read as a regex operator.
rex = re.escape(str(cont_dat_prefix)) + r"(.*)\.pkl"

# Keep only keys that actually match, so a non-conforming key cannot trigger
# an AttributeError on `None.group`.
key_matches = [re.search(rex, obj["Key"]) for obj in objects]
lx = [datetime.strptime(m.group(1), '%Y-%m-%d') for m in key_matches if m]
most_rec_date = datetime.strftime(max(lx), '%Y-%m-%d')
most_rec_date

In [None]:
s3x = pickle.loads(s3.get_object(Bucket=bucket_name, Key=(cont_ingest_path + "consecutive-inspections-2021-02-21.pkl"))['Body'].read())

In [None]:
dfx = pd.DataFrame(s3x)
dfx

In [None]:
dfx["inspection_date"] = pd.to_datetime(dfx["inspection_date"])

In [None]:
# Series.max()/min() are vectorised and skip NaT, whereas the builtins compare
# element by element and give order-dependent results when NaT is present.
print("max date: ", dfx["inspection_date"].max())
print("min date: ", dfx["inspection_date"].min())

In [None]:
# Number of non-null inspection ids per inspection date.
dfx.groupby("inspection_date")[["inspection_id"]].count()

---

## Creating local directories for temporary data

In [None]:
base_path = "../src/pipeline/luigi/ingestion_tmp/"

### Find most recent ingestion

#### Case 1: There are previous consecutive downloads

In [None]:
lyrs = [ydir[-4:] for ydir in os.listdir(base_path + "consecutive") if "YEAR=" in ydir]
lyrs

In [None]:
mr_yr = max(lyrs)
mr_yr

In [None]:
lmths = [mdir[-2:] for mdir in os.listdir(base_path + "consecutive" + "/" + "YEAR=" + mr_yr) if "MONTH=" in mdir]
lmths

In [None]:
mr_mth = max(lmths)
mr_mth

In [None]:
lings = [ing for ing in os.listdir(base_path + "consecutive" + "/" + "YEAR=" + mr_yr + "/" + "MONTH=" + mr_mth)]
lings

In [None]:
# Raw string with the dot escaped, so only a literal ".pkl" suffix matches.
regex = r"consecutive_inspections_(.*)\.pkl"

In [None]:
# Entries that do not follow the naming pattern (e.g. '.DS_Store') make
# re.search return None, so only real matches are considered.
# NOTE(review): this section is about the most recent ingestion, yet min()
# yields the earliest date — confirm the intent.
ing_matches = [re.search(regex, ing) for ing in lings]
min([m.group(1) for m in ing_matches if m])

In [None]:
lx = ['.DS_Store', 'consecutive_inspections_2021-03-05.pkl', 'consecutive_inspections_2021-03-10.pkl']
lx

In [None]:
cont_dat_prefix = "consecutive_inspections_"

In [None]:
# Escape the prefix and the literal dot so neither is read as a regex operator.
regex = re.escape(cont_dat_prefix) + r"(.*)\.pkl"

In [None]:
lx

In [None]:
lings

In [None]:
# Ignore non-pickle entries, pull the date out of each remaining name and keep
# the latest one (ISO dates sort correctly as plain strings).
pkl_names = [name for name in lx if ".pkl" in name]
most_recent_ing = max(re.search(regex, name).group(1) for name in pkl_names)
most_recent_ing

#### Additional notes

In [None]:
# Today's date as a 'YYYY-MM-DD' string.
x = date.today().isoformat()

In [None]:
x[5:7]

---

## Creating .csv files with metadata

### Working with previous metadata pickle

#### Transformation pickle

In [None]:
pkl_meta = "../results/metadata/transform_metadata.pkl"

In [None]:
# Load the metadata pickle through a context manager so the handle is closed.
with open(pkl_meta, "rb") as fh:
    dfx = pickle.load(fh)
dfx

In [None]:
mx = [dfx.index[0]]
mx

In [None]:
# extend() adds the first row's values without building (and displaying)
# a throwaway list of None results.
mx.extend(dfx.iloc[0, :])

In [None]:
mx

In [None]:
## Overwriting csv file from a dataframe (no header)
def write_csv_from_df(df, filepath, filename):
    """
    Write the first row of `df` (its index label followed by its values) as a
    single headerless line to the csv file at `filepath + filename`.

    The file is opened in write mode, so any existing content is replaced.

    :param df: (pandas.DataFrame) frame whose first row is exported
    :param filepath: (str) directory prefix, concatenated as-is with `filename`
        (so it must already end with a path separator)
    :param filename: (str) name of the csv file to write
    :return: None
    :raises IndexError: if `df` has no rows
    """

    ## Imported locally so the function does not rely on a later cell having run
    import csv

    ## Extracting df contents as list: index label first, then the row values
    mdata_list = [df.index[0], *df.iloc[0, :]]

    ## Creating and writing csv file with extracted list
    ## (newline="" lets the csv module control line endings itself)
    with open(filepath + filename, mode="w", newline="") as metadata_file:
        metadata_writer = csv.writer(metadata_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        metadata_writer.writerow(mdata_list)

In [None]:
filepath = "metadata_test_dir/"

In [None]:
filename = "transformation_metadata_2.csv"

In [None]:
write_csv_from_df(dfx, filepath, filename)

In [None]:
dfx

#### Model selection pickle

In [None]:
s3_creds = get_s3_credentials("../conf/local/credentials.yaml")

session = boto3.Session(
    aws_access_key_id=s3_creds['aws_access_key_id'],
    aws_secret_access_key=s3_creds['aws_secret_access_key']
)
s3 = session.client('s3')

In [None]:
bucket = "data-product-architecture-equipo-9"
key ="model_selection"

In [None]:
# The response has no "Contents" entry when nothing matches the prefix,
# so fall back to an empty list instead of raising KeyError.
objects = s3.list_objects_v2(Bucket=bucket, Prefix=key).get('Contents', [])
objects

In [None]:
obj_path = [file["Key"] for file in objects if "trained" in file["Key"]][0]
obj_path

In [None]:
response = s3.get_object(
    Bucket=bucket,
    Key=obj_path
)
response

In [None]:
body = response["Body"].read()

In [None]:
model = pickle.loads(body)
model

In [None]:
str(model["best_trained_model"])

In [None]:
model["best_trained_model"]

### Creating dummy metadata

#### Line 1

In [None]:
meta_1 = str(datetime.now())

In [None]:
meta_2 = str(10)

In [None]:
meta_3 = "carnegie"

In [None]:
meta_comp = ",".join([meta_1, meta_2, meta_3])
meta_comp

In [None]:
import csv

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with an extra blank line per row.
with open("transformation_metadata.csv", mode="w", newline="") as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    metadata_writer.writerow([meta_1, meta_2, meta_3])

#### Creating string based on dictionary keys for metadata

In [None]:
# Iterating the models dictionary yields its keys, which are joined directly.
" | ".join(models_dict)

## Saving dictionary of dataframes as pickle

### Creating dictionary of dataframes

In [None]:
# First toy frame: the integers 1..10 next to ten single letters.
dfx1 = pd.DataFrame(
    {
        "col1_df1": list(range(1, 11)),
        "col2_df1": list("htkdlekdjc"),
    }
)
dfx1

In [None]:
# Second toy frame: the integers 11..20 next to ten single letters.
dfx2 = pd.DataFrame(
    {
        "col1_df2": list(range(11, 21)),
        "col2_df2": list("jfldurytgc"),
    }
)
dfx2

In [None]:
df_dict = {
    "dfx1": dfx1,
    "dfx2": dfx2
}
df_dict

In [None]:
df_dict['dfx2']

### Storing dictionary as pickle variable and unpickling

In [None]:
df_dict_pkl = pickle.dumps(df_dict)

In [None]:
df_dict_x = pickle.loads(df_dict_pkl)

In [None]:
df_dict_x['dfx2']

### Storing dictionary as pickle on local disk and unpickling

In [None]:
pkl_file = "../data/pickles/test_df_dict_pkl.pkl"

In [None]:
# The context manager closes (and flushes) the file once the dump finishes.
with open(pkl_file, "wb") as fh:
    pickle.dump(df_dict, fh)

In [None]:
# Read the dictionary back through a context manager so the handle is closed.
with open(pkl_file, "rb") as fh:
    df_dict_x = pickle.load(fh)

In [None]:
df_dict_x['dfx2']

---

## Inspecting modules results

### Data ingested

In [39]:
pth = "../data/raw/Food_Inspections.csv"

In [40]:
dfx = pd.read_csv(pth)

In [41]:
dfx

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License#,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2453552,CLAMP DOWN BURGERS,,2749943.0,,Risk 2 (Medium),1742 W DIVISION ST,CHICAGO,IL,60622.0,10/20/2020,License,No Entry,,41.903387,-87.671740,"(-87.67174026586648, 41.903386755553484)"
1,2386633,JIN JU,JIN JU,27137.0,Restaurant,Risk 1 (High),5203 N CLARK ST,CHICAGO,IL,60640.0,08/28/2020,Canvass,No Entry,,41.976301,-87.668276,"(-87.66827593789948, 41.97630115368914)"
2,2386595,LA BIZNAGA #2,LA BIZNAGA #2,2708992.0,,Risk 1 (High),2949 W BELMONT AVE,CHICAGO,IL,60618.0,08/27/2020,Complaint,No Entry,,41.939256,-87.702270,"(-87.70226967930802, 41.939255926667535)"
3,2386464,Uni Sushi Bristo,Uni Sushi Bistro,2262637.0,Restaurant,Risk 1 (High),1752 W NORTH AVE,CHICAGO,IL,60622.0,08/25/2020,Complaint,No Entry,,41.910676,-87.672205,"(-87.67220465807979, 41.91067561170382)"
4,2386398,KIKI'S BISTRO,KIKI'S BISTRO,22899.0,Restaurant,Risk 1 (High),900 N FRANKLIN ST,CHICAGO,IL,60610.0,08/24/2020,Canvass,No Entry,,41.898998,-87.635921,"(-87.63592067312285, 41.89899799424835)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215062,229233,"ZULLO'S MARKETS, LLC",ZULLO'S MARKET,2016915.0,Restaurant,Risk 2 (Medium),131 N Clinton ST,CHICAGO,IL,60661.0,02/18/2010,License,Fail,,41.884188,-87.641120,"(-87.64111966683218, 41.884187507127805)"
215063,68205,LA FONDA CHIQUITA TLC,LA FONDA CHIQUITA,2017215.0,Restaurant,Risk 1 (High),5940 W DIVERSEY AVE,CHICAGO,IL,60639.0,02/09/2010,License,Fail,12. HAND WASHING FACILITIES: WITH SOAP AND SAN...,41.931260,-87.775203,"(-87.77520287598688, 41.93125971874477)"
215064,88234,KABAB CUISINE 2,KABAB CUISINE 2,2014076.0,Restaurant,Risk 1 (High),6320 N LINCOLN AVE,CHICAGO,IL,60659.0,02/01/2010,License Re-Inspection,Pass,,41.996139,-87.716968,"(-87.71696834498202, 41.996139331170895)"
215065,74311,MARGARITA DISTRIBUTORS,,2017298.0,Wholesale,Risk 3 (Low),2332 S BLUE ISLAND AVE BLDG,CHICAGO,IL,60608.0,02/19/2010,License Re-Inspection,Pass,,41.849940,-87.672421,"(-87.67242100722251, 41.84994008002549)"


### Extraction dataframe

In [42]:
pth = "../data/pickles/ingest_df.pkl"

In [43]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    dfx = pickle.load(fh)

In [44]:
dfx

Unnamed: 0_level_0,facility-type,risk,city,zip,inspection-type,results,violations,label
inspection-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2453552,,risk_2_-medium-,chicago,60622.0,license,no_entry,,0
2386633,restaurant,risk_1_-high-,chicago,60640.0,canvass,no_entry,,0
2386595,,risk_1_-high-,chicago,60618.0,complaint,no_entry,,0
2386464,restaurant,risk_1_-high-,chicago,60622.0,complaint,no_entry,,0
2386398,restaurant,risk_1_-high-,chicago,60610.0,canvass,no_entry,,0
...,...,...,...,...,...,...,...,...
229233,restaurant,risk_2_-medium-,chicago,60661.0,license,fail,,0
68205,restaurant,risk_1_-high-,chicago,60639.0,license,fail,12._hand_washing_facilities-_with_soap_and_san...,0
88234,restaurant,risk_1_-high-,chicago,60659.0,license_re-inspection,pass,,1
74311,wholesale,risk_3_-low-,chicago,60608.0,license_re-inspection,pass,,1


### Transformation df

In [45]:
pth = "../data/pickles/transformation_df.pkl"

In [46]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    dfx = pickle.load(fh)

In [47]:
dfx

Unnamed: 0_level_0,facility-type,risk,city,zip,inspection-type,results,violations,label,serious_violations,zip-income-class
inspection-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2453552,facility-type_other,risk_2_-medium-,chicago,60622.0,license,no_entry,-_nan,0,no_result,Other
2386633,restaurant_bar,risk_1_-high-,chicago,60640.0,canvass,no_entry,-_nan,0,no_result,Other
2386595,facility-type_other,risk_1_-high-,chicago,60618.0,complaint,no_entry,-_nan,0,no_result,Other
2386464,restaurant_bar,risk_1_-high-,chicago,60622.0,complaint,no_entry,-_nan,0,no_result,Other
2386398,restaurant_bar,risk_1_-high-,chicago,60610.0,canvass,no_entry,-_nan,0,no_result,Other
...,...,...,...,...,...,...,...,...,...,...
229233,restaurant_bar,risk_2_-medium-,chicago,60661.0,license,fail,-_nan,0,no_result,Other
68205,restaurant_bar,risk_1_-high-,chicago,60639.0,license,fail,-_12._hand_washing_facilities-_with_soap_and_s...,0,serious_violations,Other
88234,restaurant_bar,risk_1_-high-,chicago,60659.0,license,pass,-_nan,1,no_result,Other
74311,facility-type_other,risk_3_-low-,chicago,60608.0,license,pass,-_nan,1,no_result,Other


### Feature engineering pickle

In [48]:
pth = "../data/pickles/fe_results.pkl"

In [49]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    fe_results = pickle.load(fh)

In [50]:
fe_results["df_imp_engineered_features"]

<215067x15 sparse matrix of type '<class 'numpy.float64'>'
	with 860268 stored elements in Compressed Sparse Row format>

In [51]:
fe_results["data_labels"]

inspection-id
2453552    0
2386633    0
2386595    0
2386464    0
2386398    0
          ..
229233     0
68205      0
88234      1
74311      1
67838      0
Name: label, Length: 215067, dtype: int64

In [52]:
fe_results["ohe_reference"]

{'facility-type': ['daycare', 'facility-type_other', 'restaurant_bar'],
 'risk': ['all', 'nan', 'risk_1_-high-', 'risk_2_-medium-', 'risk_3_-low-'],
 'city': ['chicago', 'city_other'],
 'inspection-type': ['canvass',
  'complaint',
  'inspection',
  'inspection-type_other',
  'license'],
 'serious_violations': ['no_result',
  'no_serious_violations',
  'serious_violations'],
 'zip-income-class': ['Other']}

### Models training pickle

In [53]:
pth = "../data/pickles/mt_results.pkl"

In [54]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    mt_results = pickle.load(fh)

In [55]:
mt_results["training_labels"]

inspection-id
471453     1
1300944    1
1386417    1
1096383    0
1954085    0
          ..
1279267    0
664331     1
285128     0
1588991    0
1538063    1
Name: label, Length: 150546, dtype: int64

In [56]:
mt_results["test_labels"]

inspection-id
1441528    0
2315524    0
1343273    1
2315994    1
1360669    0
          ..
2366295    1
2232251    0
920208     1
1946275    1
2233134    0
Name: label, Length: 64521, dtype: int64

### Model selection pickle

In [57]:
pth = "../data/pickles/ms_results.pkl"

In [58]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    ms_results = pickle.load(fh)

In [59]:
ms_results["model_test_predict_labels"]

array([1, 1, 1, ..., 1, 1, 0])

In [60]:
ms_results["model_test_predict_scores"]

array([[0.20655924, 0.79344076],
       [0.20655924, 0.79344076],
       [0.23003207, 0.76996793],
       ...,
       [0.28165049, 0.71834951],
       [0.17910383, 0.82089617],
       [0.71460451, 0.28539549]])

---

## Building aequitas dataframe

### Creating initial dataframe with unique ID's and real test labels

In [67]:
# Turn the test labels into a one-column frame and give the column an
# explicit "real labels" name, in a single chained expression.
dfx = (
    mt_results["test_labels"]
    .to_frame()
    .rename(columns={"label": "test_real_labels"})
)

### Adding the labels predicted by best model

In [79]:
dfx["model_test_predict_labels"] = ms_results["model_test_predict_labels"]

### Adding zip and reference group

In [86]:
rc = ["zip", "zip-income-class"]

In [81]:
pth = "../data/pickles/transformation_df.pkl"

In [84]:
# Load the pickle through a context manager so the file handle is closed.
with open(pth, "rb") as fh:
    dfxx = pickle.load(fh)

In [88]:
dfxx = dfxx.loc[:, rc]

#### Merging with labels dataframe

In [None]:
dfx.join(dfxx, how="inner")

---

## *Notes*

---
---