# Feature engineering

## Dependency management

In [None]:
import pandas as pd
import numpy as np
import os

## Minio Bucket setup

In [None]:
# S3 credentials are read from the environment; a missing variable raises
# KeyError right here.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
# Endpoint URL passed to the S3 client for both the read and the write.
# NOTE(review): the URL uses plain http — confirm the credentials are only
# ever sent over a trusted network.
MINIO_ENDPOINT = 'http://minio.idoml.precision.uni.lux'
# Object paths inside the bucket, without the ".csv" extension (it is appended
# when the full paths are built): input file and output file respectively.
CLEANED_FILENAME = 'lcld/cleaned_data'
FEATURE_FILENAME = 'lcld/feature_data'

In [None]:
# Bucket used for both the input and the output file.
bucket_name = 'datasets'
# Bucket-relative path of the input CSV (the "s3://" scheme is added at read time).
filepath = f'{bucket_name}/{CLEANED_FILENAME}.csv'

## Load pandas dataframe from S3

In [None]:
# Connection settings handed to the S3 filesystem layer: credentials plus the
# custom endpoint URL.
s3_read_options = {
    'key': AWS_ACCESS_KEY_ID,
    'secret': AWS_SECRET_ACCESS_KEY,
    'token': None,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}
cleaned_url = f's3://{filepath}'

# Load the input CSV into the working dataframe.
df = pd.read_csv(cleaned_url, storage_options=s3_read_options)

## Feature engineering

### Remove features with unique values

In [None]:
# Drop all unwanted columns in a single call.
df = df.drop(
    columns=[
        "id",
        # emp_title has too many different values
        "emp_title",
        "title",
        "zip_code",
        "addr_state",
    ]
)

# Remove grade, redundant (dropped in the next cell)


In [None]:
# grade is removed as a redundant feature
df = df.drop(columns="grade")

### Convert feature

In [None]:
# Convert term to integer: keep the first whitespace-separated token of the
# string and parse it as an 8-bit integer.
df["term"] = df["term"].apply(lambda s: np.int8(s.split()[0]))

# Convert emp_length: rewrite the two special labels so that every non-null
# value starts with its number of years.
# The result is assigned back instead of calling replace(inplace=True) on the
# df["emp_length"] selection: the in-place form operates on an intermediate
# object, triggers chained-assignment warnings in recent pandas, and leaves df
# unchanged when Copy-on-Write is enabled.
df["emp_length"] = df["emp_length"].replace(
    {"10+ years": "10 years", "< 1 year": "0 years"}
)


def emp_length_to_int(s):
    """Parse the leading number of an employment-length label.

    Null values (as detected by ``pd.isnull``) are returned unchanged;
    otherwise the first whitespace-separated token of ``s`` is converted to
    ``np.int8``.
    """
    if not pd.isnull(s):
        return np.int8(s.split()[0])
    return s


# Turn each employment-length label into its number of years (nulls pass through).
df["emp_length"] = df["emp_length"].apply(emp_length_to_int)

# Convert date features
for date_column in ("earliest_cr_line", "issue_d"):
    df[date_column] = pd.to_datetime(df[date_column])

# Convert subgrade to numeric: swap every grade letter for a digit so the
# label parses as a number, then replace each distinct number by its position
# in the sorted list of distinct values (0, 1, 2, ...).
replacements = [
    ("A", "1"),
    ("B", "2"),
    ("C", "3"),
    ("D", "4"),
    ("E", "5"),
    ("F", "6"),
    ("G", "7"),
]
sub_grade = df["sub_grade"]
for letter, digit in replacements:
    sub_grade = sub_grade.str.replace(letter, digit)
sub_grade = sub_grade.astype(float)

distinct_sorted = np.sort(sub_grade.unique())
df["sub_grade"] = sub_grade.replace(
    distinct_sorted,
    np.arange(distinct_sorted.shape[0]).astype(int),
)

In [None]:
# THE TARGET: 1 when loan_status is exactly "Charged Off", 0 for anything else.
is_charged_off = df["loan_status"] == "Charged Off"
df["charged_off"] = is_charged_off.apply(np.uint8)
# The raw status column is no longer needed once the target exists.
df = df.drop(columns="loan_status")


## Feature creation

In [None]:
# fico_range_low and fico_range_high are correlated: keep only their average.
df["fico_score"] = (df["fico_range_low"] + df["fico_range_high"]) / 2
df = df.drop(columns=["fico_range_high", "fico_range_low"])

# Feature creation


def diff_date_month(a, b):
    """Return the number of calendar months from ``b`` to ``a``.

    Both arguments are datetime Series (they are accessed through ``.dt``).
    Only the year and month components are used; days are ignored.
    """
    years_apart = a.dt.year - b.dt.year
    months_apart = a.dt.month - b.dt.month
    return 12 * years_apart + months_apart


def ratio_pub_rec_pub_rec_bankruptcies(pub_rec_bankruptcies, pub_rec):
    """Return ``pub_rec_bankruptcies / pub_rec``, or -1 when no ratio exists.

    The sentinel -1 is returned whenever ``pub_rec > 0`` is false, which also
    covers a NaN ``pub_rec``.
    """
    return pub_rec_bankruptcies / pub_rec if pub_rec > 0 else -1


# Zero-based month of the issue date (January -> 0, ..., December -> 11).
df["month_of_year"] = df["issue_d"].dt.month - 1
df["ratio_loan_amnt_annual_inc"] = df["loan_amnt"] / df["annual_inc"]
df["ratio_open_acc_total_acc"] = df["open_acc"] / df["total_acc"]

# Number of calendar months between the earliest credit line and the issue date.
df["month_since_earliest_cr_line"] = diff_date_month(
    df["issue_d"], df["earliest_cr_line"]
)
# The raw date is dropped once the month count has been derived from it.
df = df.drop("earliest_cr_line", axis=1)

df["ratio_pub_rec_month_since_earliest_cr_line"] = (
    df["pub_rec"] / df["month_since_earliest_cr_line"]
)
df["ratio_pub_rec_bankruptcies_month_since_earliest_cr_line"] = (
    df["pub_rec_bankruptcies"] / df["month_since_earliest_cr_line"]
)

# Same rule as ratio_pub_rec_pub_rec_bankruptcies, applied column-wise instead
# of through a per-row df.apply (which calls a Python function once per row):
# the ratio where pub_rec > 0, and the sentinel -1 everywhere else (including
# rows where pub_rec is missing, since NaN > 0 is False).
has_pub_rec = df["pub_rec"] > 0
df["ratio_pub_rec_bankruptcies_pub_rec"] = (
    df["pub_rec_bankruptcies"] / df["pub_rec"]
).where(has_pub_rec, -1)


### Missing values

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

### Feature type

In [None]:
# Columns stored as integers in the output.
integer_columns = [
    "open_acc",
    "total_acc",
    "emp_length",
    "mort_acc",
    "pub_rec_bankruptcies",
    "month_since_earliest_cr_line",
    "sub_grade",
]
# Cast them all in one call using a column -> dtype mapping.
df = df.astype(dict.fromkeys(integer_columns, int))

### Sort output by date

In [None]:
# Order the rows chronologically by issue date.
df = df.sort_values("issue_d")

In [None]:
# Show the final dataframe as the cell output for a visual check.
df

## Upload pandas dataframe to S3

In [None]:
# Bucket-relative path of the output CSV.
feature_filepath = f'{bucket_name}/{FEATURE_FILENAME}.csv'

# Connection settings handed to the S3 filesystem layer: credentials plus the
# custom endpoint URL.
s3_write_options = {
    'key': AWS_ACCESS_KEY_ID,
    'secret': AWS_SECRET_ACCESS_KEY,
    'token': None,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}
feature_url = f's3://{feature_filepath}'

# Write the engineered dataframe (index included, as by default) to the bucket.
df.to_csv(feature_url, storage_options=s3_write_options)