In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; reused below to build the output directory and file name.
dataset_name = "club_loan"

In [3]:
# Input location: raw CSV file, relative to the notebook's working directory.
input_dir = './raw/'
inp_fname = 'loan_data.csv'

# Output location: one sub-directory per dataset, holding a CSV named after the dataset.
output_dir = './../../processed/' + dataset_name + '/'
outp_fname = os.path.join(output_dir, dataset_name + '.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
inp_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(inp_path)
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [5]:
# Column names used below: row identifier column and the prediction target column.
id_col, target_col = "id", "not.fully.paid"

## Insert Id Column

In [6]:
# Add a sequential integer id (0..N-1) as the first column when the data has none.
# The guard makes the cell a no-op once the column exists.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())


   id  credit.policy             purpose  int.rate  installment  \
0   0              1  debt_consolidation    0.1189       829.10   
1   1              1         credit_card    0.1071       228.22   
2   2              1  debt_consolidation    0.1357       366.86   
3   3              1  debt_consolidation    0.1008       162.34   
4   4              1         credit_card    0.1426       102.92   

   log.annual.inc    dti  fico  days.with.cr.line  revol.bal  revol.util  \
0       11.350407  19.48   737        5639.958333      28854        52.1   
1       11.082143  14.29   707        2760.000000      33623        76.7   
2       10.373491  11.63   682        4710.000000       3511        25.6   
3       11.350407   8.10   712        2699.958333      33667        73.2   
4       11.299732  14.97   667        4066.000000       4740        39.5   

   inq.last.6mths  delinq.2yrs  pub.rec  not.fully.paid  
0               0            0        0               0  
1               0       

## Convert byte strings to strings

In [7]:
# Object-dtype columns are the candidates for the byte-string cleanup.
byte_string_columns = data.select_dtypes(include='O').columns
list(byte_string_columns)

['purpose']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Turn the textual repr of a bytes literal into the decoded string.

    A value such as the string ``"b'abc'"`` (or ``'b"it\\'s"'``, the form
    Python uses when the bytes contain a single quote) is parsed with
    ``ast.literal_eval`` and decoded as UTF-8.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values that start with ``b`` followed
        by a quote character and end with the same quote are considered.

    Returns
    -------
    object
        The decoded ``str`` when ``entry`` is a valid bytes-literal repr whose
        content is valid UTF-8; otherwise ``entry`` is returned unchanged
        (non-strings, ordinary strings, malformed literals, literals that do
        not evaluate to ``bytes``, or bytes that are not valid UTF-8).
    """
    # Shortest possible bytes repr is b'' (3 characters).
    if not isinstance(entry, str) or len(entry) < 3 or entry[0] != 'b':
        return entry

    quote = entry[1]
    if quote not in ("'", '"') or entry[-1] != quote:
        return entry

    try:
        value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Looks like a bytes literal but is not a parsable one.
        return entry

    # Strings such as "b'a', 'b'" pass the prefix/suffix check but evaluate
    # to a tuple; calling .decode on that would raise AttributeError.
    if not isinstance(value, bytes):
        return entry

    try:
        return value.decode('utf-8')
    except UnicodeDecodeError:
        return entry

# Decode byte-string reprs element by element in every object-dtype column.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated in recent pandas and emits a FutureWarning; Series.map behaves
# the same on older and newer pandas versions.
# NOTE(review): .astype(str) also turns missing values into the string 'nan'
# -- confirm this is intended for columns that can contain NaN.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda col: col.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

  data[byte_string_columns] = data[byte_string_columns].applymap(convert_byte_string_repr).astype(str)


Unnamed: 0,id,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


## Drop uninformative (single-valued) columns

In [9]:
# Columns with exactly one distinct value carry no information; list them and
# remove them from the frame in place. (nunique ignores NaN by default.)
unique_columns = data.columns[data.nunique() == 1].tolist()
print(unique_columns)

data.drop(columns=unique_columns, inplace=True)

[]


## Convert `?` placeholders to NaN

In [10]:
# Cells whose value is exactly '?' become NaN; the frame is modified in place.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it already does).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)