# Data Cleaning

## Dependency management

In [None]:
import pandas as pd
import os

## MinIO bucket setup

In [None]:
# Credentials are read from the environment; a missing variable raises KeyError.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# Endpoint URL handed to the S3 client for both the download and the upload.
# NOTE(review): plain-http endpoint — confirm it is only reachable on a trusted network.
MINIO_ENDPOINT = 'http://minio.idoml.precision.uni.lux'
# Object key stems (without the ".csv" extension) of the raw input and the cleaned output.
RAW_FILENAME = 'lcld/raw_data'
CLEANED_FILENAME = 'lcld/cleaned_data'

In [None]:
# Bucket that holds the datasets, and the bucket-qualified path of the raw CSV.
bucket_name = 'datasets'
filepath = '/'.join([bucket_name, f'{RAW_FILENAME}.csv'])

## Load pandas dataframe from S3

In [None]:
# Credentials and the custom endpoint travel in storage_options so that the
# s3:// URL is resolved against the MinIO server configured above.
raw_storage_options = {
    'key': AWS_ACCESS_KEY_ID,
    'secret': AWS_SECRET_ACCESS_KEY,
    'token': None,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}
df = pd.read_csv(f's3://{filepath}', storage_options=raw_storage_options)

In [None]:
# Quick look at the first rows of the freshly loaded frame.
df.head()

## Target

We remove samples for which we do not have a target: only loans whose `loan_status` is `Fully Paid` or `Charged Off` are kept.

In [None]:
# Keep only the rows whose loan_status is one of the two labelled outcomes.
has_target = df["loan_status"].isin(["Fully Paid", "Charged Off"])
df = df.loc[has_target]

### Unavailable data

Remove columns that are not available at prediction time.

In [None]:
# Columns retained for modelling; every other column of the frame is dropped.
keep_list = [
    "addr_state",
    "annual_inc",
    "application_type",
    "dti",
    "earliest_cr_line",
    "emp_length",
    "emp_title",
    "fico_range_high",
    "fico_range_low",
    "grade",
    "home_ownership",
    "id",
    "initial_list_status",
    "installment",
    "int_rate",
    "issue_d",
    "loan_amnt",
    "loan_status",
    "mort_acc",
    "open_acc",
    "pub_rec",
    "pub_rec_bankruptcies",
    "purpose",
    "revol_bal",
    "revol_util",
    "sub_grade",
    "term",
    "title",
    "total_acc",
    "verification_status",
    "zip_code",
]

# Set membership keeps the scan over df.columns cheap; drop_list stays a list
# in the frame's original column order.
kept_names = set(keep_list)
drop_list = [name for name in df.columns if name not in kept_names]
df = df.drop(columns=drop_list)

## Upload pandas dataframe to S3

In [None]:
# Destination of the cleaned dataset, in the same bucket as the raw file.
cleand_filepath = '/'.join([bucket_name, f'{CLEANED_FILENAME}.csv'])

# NOTE(review): to_csv is called without index=..., so the row index is written
# as an extra leading column — confirm downstream readers expect that.
upload_storage_options = {
    'key': AWS_ACCESS_KEY_ID,
    'secret': AWS_SECRET_ACCESS_KEY,
    'token': None,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}
df.to_csv(f's3://{cleand_filepath}', storage_options=upload_storage_options)