# Notebook to conduct ML-related tasks

# Imports

#### Standard library imports

In [1]:
import sys
sys.path.append("../")
import os
import pickle
import random
from functools import partial

#### Third party imports

In [2]:
import boto3
import kaggle
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Segmented pipeline

## Extract

### Locally saved results

In [None]:
## Name of the extract-stage pickle to load in the next cell.
## Keep exactly one assignment uncommented to choose the object.
pkl_obj = 'extract_Xy_train.pkl'
# pkl_obj = 'extract_X_test.pkl'

In [None]:
## Deserialize the selected extract-stage pickle into `dfx`.
with open(pipeline_pkl_extract_local_dir + pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

## Transform

### Locally saved results

In [None]:
## Name of the transform-stage pickle to load in the next cell.
## Keep exactly one assignment uncommented to choose the object.
# pkl_obj = 'trans_X_train.pkl'
# pkl_obj = 'trans_y_train.pkl'
# pkl_obj = 'trans_X_val.pkl'
# pkl_obj = 'trans_y_val.pkl'
# pkl_obj = 'trans_X_test.pkl'
pkl_obj = 'trans_y_test.pkl'

In [None]:
## Deserialize the selected transform-stage pickle into `dfx`.
with open(pipeline_pkl_transform_local_dir + pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

## Feature engineering

### Locally saved results

In [None]:
## Name of the feature-engineering pickle to load in the next cell.
## Keep exactly one assignment uncommented to choose the object.
# pkl_obj = 'feateng_X_train.pkl'
# pkl_obj = 'feateng_y_train.pkl'
# pkl_obj = 'feateng_X_val.pkl'
# pkl_obj = 'feateng_y_val.pkl'
# pkl_obj = 'feateng_X_test.pkl'
pkl_obj = 'feateng_y_test.pkl'

In [None]:
## Deserialize the selected feature-engineering pickle into `dfx`.
with open(pipeline_pkl_feateng_local_dir + pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

## Model training

### Locally saved results

In [None]:
## Name of the model-training pickle to load in the next cell.
## Keep exactly one assignment uncommented to choose the object.
# pkl_obj = 'modtrain_train_x.pkl'
# pkl_obj = 'modtrain_train_y.pkl'
# pkl_obj = 'modtrain_X_val.pkl'
# pkl_obj = 'modtrain_y_val.pkl'
# pkl_obj = 'modtrain_X_test.pkl'
# pkl_obj = 'modtrain_y_test.pkl'
# pkl_obj = 'modtrain_test_x.pkl'
pkl_obj = 'modtrain_model_ml.pkl'

In [None]:
## Deserialize the selected model-training pickle into `dfx`.
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

## Model evaluation and selection

### Evaluating the model's performance with the validation dataset

#### Loading objects

In [None]:
## Load `modtrain_X_val.pkl` from the model-training pickle directory into `X_val`.
pkl_obj = 'modtrain_X_val.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    X_val = pickle.load(pkl_file)

In [None]:
## Load `modtrain_y_val.pkl` from the model-training pickle directory into `y_val`.
pkl_obj = 'modtrain_y_val.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    y_val = pickle.load(pkl_file)

In [None]:
## Load `modtrain_model_ml.pkl` (the training results) into `modtrain_res`.
pkl_obj = 'modtrain_model_ml.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    modtrain_res = pickle.load(pkl_file)

#### Model evaluation
- RandomForestClassifier(max_features=10, max_leaf_nodes=50, min_samples_leaf=10)

In [None]:
## Estimator stored under ['random_forest']['best_estimator'] in the loaded training results
model = modtrain_res['random_forest']['best_estimator']

In [None]:
## Copy of the validation labels, so the prediction columns added in the
## following cells do not modify `y_val`
y_valx = y_val.copy()

In [None]:
## Adding the model's prediction
y_valx['predict'] = model.predict(X_val)

In [None]:
## Adding the predicted probability of class 1: the second column of
## `predict_proba`, which corresponds to `model.classes_[1]`
y_valx['positive_prob'] = model.predict_proba(X_val)[:, 1]

In [None]:
## Accuracy of the model's hard predictions against the validation labels
metrics.accuracy_score(y_true=y_valx['label'], y_pred=y_valx['predict'])

In [None]:
## Balanced accuracy of the hard predictions against the validation labels
metrics.balanced_accuracy_score(y_true=y_valx['label'], y_pred=y_valx['predict'])

In [None]:
## Average precision computed from the class-1 probabilities (positive label: True)
metrics.average_precision_score(
    y_true=y_valx['label'], y_score=y_valx['positive_prob'], pos_label=True,
)

In [None]:
## Brier score of the class-1 probabilities (positive label: True).
## Labels and probabilities are passed positionally: the probability keyword
## was renamed from `y_prob` to `y_proba` in scikit-learn 1.5 (old name removed
## in 1.7), so a positional call works before and after the rename.
metrics.brier_score_loss(
    y_valx['label'],
    y_valx['positive_prob'],
    pos_label=True
)

In [None]:
## F1 score of the hard predictions, treating True as the positive label
metrics.f1_score(
    y_true=y_valx['label'], y_pred=y_valx['predict'], pos_label=True,
)

In [None]:
## Precision of the hard predictions, treating True as the positive label
metrics.precision_score(
    y_true=y_valx['label'], y_pred=y_valx['predict'], pos_label=True,
)

In [None]:
## Recall of the hard predictions, treating True as the positive label
metrics.recall_score(
    y_true=y_valx['label'], y_pred=y_valx['predict'], pos_label=True,
)

In [None]:
## Confusion matrix of validation labels vs. hard predictions, then its plot
cm = metrics.confusion_matrix(y_true=y_valx['label'], y_pred=y_valx['predict'])
metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
## ROC curve and its AUC, computed from the class-1 probabilities.
## The thresholds returned by `metrics.roc_curve` are not used by the plot,
## so they are discarded instead of being kept in a (misspelled) variable.
fpr, tpr, _ = metrics.roc_curve(
    y_true=y_valx['label'],
    y_score=y_valx['positive_prob'],
)

## Area under the (fpr, tpr) curve
roc_auc = metrics.auc(
    x=fpr,
    y=tpr,
)

roc_curve = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name='random_forest'
)

roc_curve.plot()

In [None]:
## Precision-recall curve computed from the class-1 probabilities.
## Labels and scores are passed positionally: the score keyword was renamed
## from `probas_pred` to `y_score` in scikit-learn 1.5 (old name removed in
## 1.7), so a positional call works before and after the rename.
## The returned thresholds are not needed for the plot and are discarded.
precision, recall, _ = metrics.precision_recall_curve(
    y_valx['label'],
    y_valx['positive_prob'],
    pos_label=True,
)

pr_curve = metrics.PrecisionRecallDisplay(
    precision=precision,
    recall=recall,
    estimator_name='random_forest',
)

pr_curve.plot()

In [None]:
## Same accuracy computation as earlier in this section (model's own predictions)
metrics.accuracy_score(y_true=y_valx['label'], y_pred=y_valx['predict'])

In [None]:
## Boolean labels from a custom 0.4 cut-off on the class-1 probability
y_valx['test'] = (y_valx['positive_prob'] > 0.4).to_numpy()

In [None]:
## Number of rows where the 0.4 cut-off labels equal the model's own predictions
(y_valx['test'] == y_valx['predict']).sum()

In [None]:
## Accuracy of the custom-threshold labels (`test` column) against the validation labels
metrics.accuracy_score(y_true=y_valx['label'], y_pred=y_valx['test'])

### Locally saved results

In [None]:
## Name of the model-evaluation/selection pickle to load in the next cell.
## Keep exactly one assignment uncommented to choose the object.
# pkl_obj = 'modevalsel_X_train.pkl'
# pkl_obj = 'modevalsel_y_train.pkl'
# pkl_obj = 'modevalsel_X_val.pkl'
# pkl_obj = 'modevalsel_y_val.pkl'
# pkl_obj = 'modevalsel_X_test.pkl'
# pkl_obj = 'modevalsel_y_test.pkl'
pkl_obj = 'modevalsel_metrics.pkl'

In [None]:
## Deserialize the selected model-evaluation/selection pickle into `dfx`.
with open(pipeline_pkl_modevalsel_local_dir + pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

# Submit prediction to Kaggle

In [4]:
## Load `modtrain_X_test.pkl` from the model-training pickle directory into `X_test`.
pkl_obj = 'modtrain_X_test.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    X_test = pickle.load(pkl_file)

In [5]:
## Load `modtrain_y_test.pkl` from the model-training pickle directory into `y_test`.
pkl_obj = 'modtrain_y_test.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    y_test = pickle.load(pkl_file)

In [6]:
## Load `modtrain_model_ml.pkl` (the training results) into `modtrain_res`.
pkl_obj = 'modtrain_model_ml.pkl'
with open(pipeline_pkl_modtrain_local_dir + pkl_obj, mode='rb') as pkl_file:
    modtrain_res = pickle.load(pkl_file)

In [9]:
## Model used for the submission; keep exactly one `model_name` uncommented.
# model_name = 'random_forest'
model_name = 'gradient_boosting'

## Estimator from the training results, plus the `class_thresh` entry that
## `predict_models_dict` holds for the same model (used below as the
## probability cut-off).
model = modtrain_res[model_name]['best_estimator']
class_thresh = predict_models_dict[model_name]['class_thresh']

In [None]:
## Second column of `predict_proba` (i.e. `model.classes_[1]`) for the test features
y_test['pos_prob'] = model.predict_proba(X_test)[:, 1]

In [None]:
## Boolean `Transported` label: probability strictly above the model's threshold
y_test['Transported'] = (y_test['pos_prob'] > class_thresh).to_numpy()

In [None]:
## Keep only the `Transported` column and turn the index into a regular column
y_test = y_test[['Transported']].reset_index()

In [None]:
## Write the predictions to `preds.csv` in the current working directory
## (the index is omitted from the file)
y_test.to_csv('preds.csv', index=False)

# Header 1

# *Notes*

## Testing interaction with AWS s3

### Setting the s3 client

In [None]:
## Read the credentials file; the result is indexed as creds['aws'][...] below
creds = read_yaml(creds_file_path)

In [None]:
## boto3 session authenticated with the AWS key pair taken from `creds`
dev = boto3.Session(
    aws_access_key_id=creds['aws']['aws_access_key_id'],
    aws_secret_access_key=creds['aws']['aws_secret_access_key'],
)

In [None]:
## S3 client created from the session above
s3 = dev.client('s3')

### Listing buckets

In [None]:
## Display the raw `list_buckets` response for the authenticated account
s3.list_buckets()

### Uploading file to bucket

#### Pre-created file

In [None]:
## Uploading - test 1
## Calls the project helper `upload_file` with `object_name=None`.
## NOTE(review): presumably the helper then derives the object name from
## `file_path` — confirm against its definition.
file_path = dataset_dir + '/test_file.txt'
bucket = 'titanic-spaceship-aws-bucket'
upload_file(file_path, bucket, object_name=None)

In [None]:
## Uploading - test 2
## Same local file as test 1, this time passing an explicit object name.
bucket = 'titanic-spaceship-aws-bucket'
file_path = dataset_dir + '/test_file.txt'
object_name = 'test_folder/test_file.txt'
upload_file(file_path, bucket, object_name)

### Listing objects in bucket

In [None]:
## Rebind `s3` to the client returned by the project helper `create_s3_client`
s3 = create_s3_client()

In [None]:
## Bucket used by the listing/reading cells below
bucket_name = base_bucket_name

In [None]:
## File names (last path component of each key) found under the prefix.
bucket_name = base_bucket_name
bucket_key = 'pipeline_pkls/extract'

## The response carries no 'Contents' entry when nothing matches the prefix,
## so fall back to an empty list instead of raising KeyError.
## NOTE: a single `list_objects_v2` call returns at most 1000 keys.
[
    obj['Key'].split(sep='/')[-1]
    for obj
    in s3.list_objects_v2(Bucket=bucket_name, Prefix=bucket_key).get('Contents', [])
]

In [None]:
## Same listing through the project helper; display its result
lx = list_objects_in_bucket_key(bucket_name, bucket_key)
lx

### Reading file from bucket

In [None]:
## Rebind `s3` to the client returned by the project helper `create_s3_client`
s3 = create_s3_client()

In [None]:
## Bucket and key of the pickled object to read.
## The bucket comes from `base_bucket_name`, as in the listing cells; the
## previous self-assignment was a no-op that raised NameError on a fresh kernel.
bucket_name = base_bucket_name
# bucket_key = os.path.join(aws_pipeline_pkl_extract, pipeline_pkl_extract_name)
## NOTE(review): the local extract pickles are named `extract_Xy_train.pkl` /
## `extract_X_test.pkl` — confirm this key still exists in the bucket.
bucket_key = 'pipeline_pkls/extract/extract_train.pkl'

In [None]:
## Fetch the object; the payload is read from obj['Body'] in the next cell
obj = s3.get_object(Bucket=bucket_name, Key=bucket_key)

In [None]:
## Deserialize the downloaded bytes into `dfx`.
## SECURITY: unpickling can execute arbitrary code, so only read objects from
## a bucket whose contents are fully trusted.
dfx = pickle.loads(obj['Body'].read())

## Using kaggle library

In [None]:
## Authenticate the Kaggle API client before calling any endpoint
kaggle.api.authenticate()

In [None]:
## Bare attribute reference (no call): only displays the method object
kaggle.api.competition_download_files

In [None]:
## Download the 'spaceship-titanic' competition files, passing `dataset_dir` as the target path
kaggle.api.competition_download_files('spaceship-titanic', path=dataset_dir)

## Testing sklearn's imputer

In [None]:
## Display whatever object was last loaded into `dfx`
dfx

In [None]:
## Column dtypes and non-null counts before imputing
dfx.info()

In [None]:
## Rows whose `CryoSleep` value is missing
dfx.loc[dfx['CryoSleep'].isna()]

In [None]:
## Features to impute. A schema entry is kept when, in this order, it
##   1. declares a 'feature_type',
##   2. has 'categorical' in that 'feature_type',
##   3. has a truthy 'model_relevant' value,
##   4. is a column of `dfx`, and
##   5. has at least one missing value (non-null count below the row count).
f_cat = [
    feat
    for feat in titanicsp_full_data_schema
    if 'feature_type' in titanicsp_full_data_schema[feat]
    and 'categorical' in titanicsp_full_data_schema[feat]['feature_type']
    and titanicsp_full_data_schema[feat]['model_relevant']
    and feat in dfx.columns
    and dfx[feat].count() != dfx.shape[0]
]

In [None]:
## Imputer that replaces NaN with each column's most frequent value.
## NOTE: despite the `_med` suffix, this is not a median imputer.
imp_med = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
## Fit the imputer on the `f_cat` columns and write the filled values back in place
dfx.loc[:, f_cat] = imp_med.fit_transform(dfx.loc[:, f_cat])

In [None]:
## Column dtypes and non-null counts after imputing
dfx.info()

In [None]:
## Rows with index values '0099_02' and '0105_01'
dfx.loc[dfx.index.isin(['0099_02', '0105_01'])]

## Header 2

---

---