In [1]:
import numpy as np
import pandas as pd
from toolbox import Data

In [2]:
# Point the project's `Data` wrapper at the training CSV; column 0 is passed
# as the index column.
train_data: Data = Data(file_path="data/train_auto.csv", index_col=0)

# Invoke the wrapper. As the last expression of the cell, whatever the call
# returns is what the notebook displays.
# NOTE(review): presumably this call performs the actual load — confirm in toolbox.Data.
train_data()

In [62]:
def _strip_patterns(column: pd.Series, *patterns: str) -> pd.Series:
    """Delete every match of the given regex patterns from a column.

    Parameters
    ----------
    column : pd.Series
        Column to clean.
    *patterns : str
        Regular expressions; each match (anywhere in a value, the patterns
        are not anchored) is replaced by the empty string, in the order given.

    Returns
    -------
    pd.Series
        A new Series with the matches removed.
    """
    return column.replace({pattern: "" for pattern in patterns}, regex=True)


def _money_to_number(column: pd.Series) -> pd.Series:
    """Turn a monetary column into a numeric one.

    Dollar signs and thousands separators (",") are removed before the
    conversion. Values that still cannot be parsed become NaN
    (``errors="coerce"``) instead of raising.
    """
    # Raw string: "\$" in a normal string literal is an invalid escape
    # sequence and triggers a Deprecation/SyntaxWarning on modern Pythons.
    return pd.to_numeric(
        column.replace({r"\$": "", ",": ""}, regex=True),
        errors="coerce",
    )


def _flag_to_number(column: pd.Series, one: str, zero: str) -> pd.Series:
    """Turn a two-valued text column into a numeric 1/0 column.

    Parameters
    ----------
    column : pd.Series
        Column to convert.
    one : str
        Regex pattern whose matches are rewritten to "1".
    zero : str
        Regex pattern whose matches are rewritten to "0".

    Returns
    -------
    pd.Series
        Numeric Series. The patterns are unanchored regexes applied in the
        order (one, zero); any value that is not numeric after the
        substitution becomes NaN (``errors="coerce"``).
    """
    return pd.to_numeric(
        column.replace({one: "1", zero: "0"}, regex=True),
        errors="coerce",
    )


train_data.data = train_data.data.assign(
    # Clean up string columns.
    education=lambda df: _strip_patterns(df.education, "z_", "<"),
    job=lambda df: _strip_patterns(df.job, "z_"),
    car_type=lambda df: _strip_patterns(df.car_type, "z_"),

    # Convert monetary columns to numeric ones.
    income=lambda df: _money_to_number(df.income),
    home_value=lambda df: _money_to_number(df.home_value),
    bluebook_value=lambda df: _money_to_number(df.bluebook_value),
    last_claim_value=lambda df: _money_to_number(df.last_claim_value),

    # Convert columns meant to be dummy columns into true dummy columns.
    is_female=lambda df: _flag_to_number(df.is_female, one="z_F", zero="M"),
    is_red_car=lambda df: _flag_to_number(df.is_red_car, one="yes", zero="no"),
    was_revoked=lambda df: _flag_to_number(df.was_revoked, one="Yes", zero="No"),
    is_married=lambda df: _flag_to_number(df.is_married, one="Yes", zero="z_No"),
    is_single_parent=lambda df: _flag_to_number(
        df.is_single_parent, one="Yes", zero="No"
    ),
)

In [63]:
# Planned step (this cell contains no code yet): remove nulls from
# - age
# - income
# - home_value
# - job

In [64]:
# Show the dtype of every column of the training frame.
train_data.data.dtypes

target_flag             int64
num_kids_driving        int64
age                   float64
num_kids_home           int64
income                float64
is_single_parent        int64
home_value            float64
is_married              int64
is_female               int64
education              object
job                    object
travel_time             int64
for_commercial_use     object
bluebook_value          int64
car_type               object
is_red_car              int64
last_claim_value        int64
claim_frequency         int64
was_revoked            object
car_age               float64
is_urban               object
dtype: object

In [69]:
# Planned refactor (not implemented in this cell) — ONE BIG LOOP OVER ALL COLUMNS:
# - Drop a column if we don't know what it means or can't use it.
# - Remove the "z_" prefix and the "$" sign.
# - Convert to the correct data type.

# Count how often each distinct value of was_revoked occurs.
# NOTE(review): the saved output lists the raw "Yes"/"No" strings although the
# cleaning cell converts this column to numeric — the output looks stale;
# confirm with Restart Kernel -> Run All.
train_data.data.was_revoked.value_counts()

No     7161
Yes    1000
Name: was_revoked, dtype: int64

In [70]:
# Number of missing entries in the was_revoked column.
train_data.data["was_revoked"].isna().sum()

0