# Notebook to conduct ML-related tasks

# Imports

#### Standard library imports

In [1]:
import sys
sys.path.append("../")
import os
import pickle

#### Third party imports

In [2]:
import boto3
import kaggle
import pandas as pd
import numpy as np

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Segmented pipeline

## Extract

In [None]:
# Run the extract stage. `extract_pipeline_func` is not defined in this
# notebook; it arrives through the `pkg_dir` star-imports.
# NOTE(review): presumably this produces the extract pickles read further
# down - confirm against the package source.
extract_pipeline_func()

In [None]:
# Collect the entry names found in the local extract directory (the path
# constant comes from the `pkg_dir` star-imports).
extract_objects = os.listdir(pipeline_pkl_extract_local_dir)

In [None]:
# Load the object 'extract_train.pkl' into `dfx` through the project helper
# `read_s3_obj_to_variable`. Bucket name and key prefix are constants from
# the `pkg_dir` star-imports.
bucket_name = base_bucket_name
extract_bucket_key = pipeline_pkl_extract_aws_key
extract_obj = 'extract_train.pkl'

dfx = read_s3_obj_to_variable(bucket_name, extract_bucket_key, extract_obj)

In [None]:
# Make 'PassengerId' the index. `inplace=True` mutates `dfx` and removes the
# column, so running this cell a second time raises KeyError.
dfx.set_index('PassengerId', inplace=True)

In [None]:
# Display an independent copy of the 'Transported' column; the result is not
# stored anywhere.
dfx.loc[:, 'Transported'].copy()

## Transform

In [None]:
# Load 'trans_train_x.pkl' into `dfx`, replacing the frame loaded in the
# Extract section.
# NOTE(review): the `extract_bucket_key` / `extract_obj` names are reused
# here even though they now hold the transform key and object name.
bucket_name = base_bucket_name
extract_bucket_key = pipeline_pkl_transform_aws_key
extract_obj = 'trans_train_x.pkl'

dfx = read_s3_obj_to_variable(bucket_name, extract_bucket_key, extract_obj)

In [None]:
# Print the pandas summary of `dfx` (assumes `dfx` is a pandas object).
dfx.info()

## Feature engineering

In [19]:
# Path to the locally stored 'trans_train_x.pkl'.
# os.path.join adds the separator only when it is missing, so the path is
# correct whether or not `pipeline_pkl_transform_local_dir` ends with one.
pkl_path = os.path.join(pipeline_pkl_transform_local_dir, 'trans_train_x.pkl')

In [20]:
# Unpickle the file at `pkl_path` into `dfx`; the context manager closes the
# file handle afterwards.
# NOTE: pickle can execute arbitrary code on load - only open files produced
# by this project.
with open(pkl_path, 'rb') as obj:
    dfx = pickle.load(obj)

In [None]:
# Columns to add up for every row.
exp_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Row-wise total over those columns (displayed only, not stored).
dfx[exp_cols].sum(axis='columns')

In [None]:
# Zero-based position of the first column labelled 'VRDeck' in `dfx`.
dfx.columns.tolist().index('VRDeck')

In [17]:
# Choose which transform pickle the next cell loads: leave exactly one of
# these assignments uncommented.
# transform_obj = 'trans_test_x.pkl'
# transform_obj = 'trans_train_x.pkl'
transform_obj = 'trans_train_y.pkl'

In [21]:
# Unpickle the selected transform object into `dfx`.
# os.path.join keeps the path valid whether or not the directory constant
# ends with a separator (plain `+` silently relied on a trailing slash).
# NOTE: pickle can execute arbitrary code on load - only open files produced
# by this project.
with open(os.path.join(pipeline_pkl_transform_local_dir, transform_obj), 'rb') as obj:
    dfx = pickle.load(obj)

In [None]:
# Number of entries in `titanicsp_data_schema` (name provided by the
# `pkg_dir` star-imports).
len(titanicsp_data_schema)

In [None]:
# Display the base schema object for inspection.
titanicsp_base_data_schema

In [None]:
# Names of the base-schema entries that do not declare a 'feature_type' key.
[
    name
    for name, spec in titanicsp_base_data_schema.items()
    if 'feature_type' not in spec
]

In [None]:
# Scratch check: add a throwaway 'prueba' entry to the full schema mapping.
# This mutates the imported object for the rest of the session.
titanicsp_full_data_schema['prueba'] = dict(hola=2, zapato=1)

# Header 1

# *Notes*

## Testing interaction with AWS S3

### Setting up the S3 client

In [None]:
# Load the credentials file through the project helper `read_yaml`; the cell
# below reads the AWS key pair from creds['aws'].
creds = read_yaml(creds_file_path)

In [None]:
# Open a boto3 session using the key pair stored under creds['aws'].
dev = boto3.Session(
    **{
        field: creds['aws'][field]
        for field in ('aws_access_key_id', 'aws_secret_access_key')
    }
)

In [None]:
# S3 client bound to the `dev` session created above.
s3 = dev.client('s3')

### Listing buckets

In [None]:
# Call ListBuckets with the current client; the response is displayed.
s3.list_buckets()

### Uploading file to bucket

#### Pre-created file

In [None]:
## Uploading - test 1
# No object name is given (object_name=None), so the project helper
# `upload_file` decides the key.
bucket = 'titanic-spaceship-aws-bucket'
file_path = f'{dataset_dir}/test_file.txt'
upload_file(file_path, bucket, object_name=None)

In [None]:
## Uploading - test 2
# Same local file, this time with an explicit object name under 'test_folder'.
bucket = 'titanic-spaceship-aws-bucket'
file_path = f'{dataset_dir}/test_file.txt'
object_name = 'test_folder/test_file.txt'
upload_file(file_path, bucket, object_name)

### Listing objects in bucket

In [None]:
# Rebind `s3` to the client returned by the project helper
# `create_s3_client`, replacing the one built by hand above.
s3 = create_s3_client()

In [None]:
# Use the project's base bucket (constant from the `pkg_dir` star-imports).
bucket_name = base_bucket_name

In [None]:
bucket_name = base_bucket_name
bucket_key = 'pipeline_pkls/extract'

# File names (last path segment of each key) of the objects under the prefix.
# `list_objects_v2` omits the 'Contents' entry when nothing matches, so fall
# back to an empty list instead of raising KeyError.
# NOTE: one call returns at most 1000 keys; larger prefixes need pagination.
[
    obj['Key'].split(sep='/')[-1]
    for obj
    in s3.list_objects_v2(Bucket=bucket_name, Prefix=bucket_key).get('Contents', [])
]

In [None]:
# Same listing through the project helper `list_objects_in_bucket_key`;
# the result is kept in `lx` and displayed.
lx = list_objects_in_bucket_key(bucket_name, bucket_key)
lx

### Reading file from bucket

In [None]:
# Fresh S3 client from the project helper, so this section can run without
# the listing section above.
s3 = create_s3_client()

In [None]:
# Set the bucket explicitly from the project constant so this cell does not
# depend on an earlier cell having defined `bucket_name`.
bucket_name = base_bucket_name
# Full key of the extracted training pickle.
# TODO: build this key from the config constants instead of hard-coding it.
bucket_key = 'pipeline_pkls/extract/extract_train.pkl'

In [None]:
# Fetch the object; the payload is read from obj['Body'] in the next cell.
obj = s3.get_object(Bucket=bucket_name, Key=bucket_key)

In [None]:
# Read the whole body into memory and unpickle it into `dfx`.
# NOTE(security): pickle.loads can execute arbitrary code - only use it on
# objects this project wrote to the bucket itself.
dfx = pickle.loads(obj['Body'].read())

## Using kaggle library

In [None]:
# Authenticate the Kaggle API client before calling it.
# NOTE(review): credentials presumably come from ~/.kaggle/kaggle.json or
# environment variables - confirm for this setup.
kaggle.api.authenticate()

In [None]:
# Bare attribute reference (no call): shows the method object for inspection
# and downloads nothing.
kaggle.api.competition_download_files

In [None]:
# Download the 'spaceship-titanic' competition files into `dataset_dir`.
kaggle.api.competition_download_files('spaceship-titanic', path=dataset_dir)

## Header 2

---

---