# **1. Data Preparation**

## **1.1 Read Data**

In [1]:
# Import Library
import pandas as pd

# Project helper module (provides config_load, pickle_load and pickle_dump)
import src.utils as utils

Load the configuration file, which holds the paths used to load and dump data as well as the modelling parameters.

In [2]:
# Load the project configuration through the helper module and display it,
# so the keys used by the functions below (paths, response variable, ...) are visible.
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/Training Data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'risk_flag',
 'test_size': 0.3,
 'num_columns': ['income',
  'age',
  'experience',
  'current_job_years',
  'current_house_years'],
 'cat_columns': ['married',
  'house_ownership',
  'car_ownership',
  'profession',
  'city',
  'state'],
 'num_of_bins': 4,
 'num_of_cv': 1}

In [3]:
def read_data():
    """Read the raw CSV, report its shape, and dump it for later steps.

    The CSV location is taken from ``config_data['raw_dataset_path']`` and the
    frame is handed to ``utils.pickle_dump`` together with
    ``config_data['dataset_path']``.

    Returns
    -------
    The DataFrame produced by ``pd.read_csv``, unmodified.
    """
    # Read the raw file named in the configuration
    raw_frame = pd.read_csv(config_data['raw_dataset_path'])

    # Print the (rows, columns) tuple as a quick sanity check
    print("Data shape :", raw_frame.shape)

    # Save the frame at the configured dataset path
    utils.pickle_dump(raw_frame, config_data['dataset_path'])

    return raw_frame

In [4]:
# Run the loader, discard the 'Id' column, and preview the first rows
data = read_data().drop(columns = ['Id'])
data.head()

Data shape : (252000, 13)


Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1303835,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for `data`
data.describe()

Unnamed: 0,income,age,experience,current_job_years,current_house_years,risk_flag
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123
std,2878311.0,17.063863,6.00259,3.647053,1.399037,0.328438
min,10310.0,21.0,0.0,0.0,10.0,0.0
25%,2503015.0,35.0,5.0,3.0,11.0,0.0
50%,5000694.0,50.0,10.0,6.0,12.0,0.0
75%,7477502.0,65.0,15.0,9.0,13.0,0.0
max,9999938.0,79.0,20.0,14.0,14.0,1.0


## **1.2 Sample Splitting**

In [8]:
# Reload the configuration and display it
# (presumably to pick up edits made to the config file -- TODO confirm the reload is needed)
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/Training Data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'response_variable': 'risk_flag'}

In [9]:
def split_input_output():
    """Separate the dumped dataset into predictors (X) and response (y).

    The dataset is loaded through ``utils.pickle_load`` from
    ``config_data['dataset_path']``. The column named by
    ``config_data['response_variable']`` becomes ``y``; every other column
    stays in ``X``. Both pieces are passed to ``utils.pickle_dump`` with the
    configured ``predictors_set_path`` / ``response_set_path``.

    Returns
    -------
    X, y
        The predictors and the response column, in that order.
    """
    # Load the dataset written by the previous step
    full_frame = utils.pickle_load(config_data['dataset_path'])

    # The response column becomes y, the remaining columns become X
    target_name = config_data['response_variable']
    y = full_frame[target_name]
    X = full_frame.drop(columns = [target_name])

    # Print both shapes to validate the split
    print('y shape :', y.shape)
    print('X shape:', X.shape)

    # Dump predictors first, then the response
    for piece, path_key in ((X, 'predictors_set_path'),
                            (y, 'response_set_path')):
        utils.pickle_dump(piece, config_data[path_key])

    return X, y

In [10]:
# Check the function: split the dumped dataset into predictors X and response y
# (the function prints both shapes)
X, y = split_input_output()

y shape : (252000,)
X shape: (252000, 12)


In [11]:
# Drop the identifier column before previewing the predictors.
# errors='ignore' keeps this cell re-runnable: X is rebound to its own result,
# so on a second run 'Id' is already gone and a strict drop would raise KeyError.
X = X.drop(columns = ['Id'], errors = 'ignore')
X.head()

Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years
0,1303835,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13
1,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13
2,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10
3,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12
4,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14


In [17]:
# Inspect column dtypes, non-null counts and memory usage of `data`
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   income               252000 non-null  int64 
 1   age                  252000 non-null  int64 
 2   experience           252000 non-null  int64 
 3   married              252000 non-null  object
 4   house_ownership      252000 non-null  object
 5   car_ownership        252000 non-null  object
 6   profession           252000 non-null  object
 7   city                 252000 non-null  object
 8   state                252000 non-null  object
 9   current_job_years    252000 non-null  int64 
 10  current_house_years  252000 non-null  int64 
 11  risk_flag            252000 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 23.1+ MB


In [12]:
# Preview the first entries of the response series
y.head()

0    0
1    0
2    0
3    1
4    1
Name: risk_flag, dtype: int64

In [13]:
# Import library
from sklearn.model_selection import train_test_split

In [14]:
# Reload the configuration and display it; split_train_test below reads
# 'predictors_set_path', 'response_set_path', 'test_size', 'train_path' and 'test_path' from it
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/Training Data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'response_variable': 'risk_flag',
 'test_size': 0.3}

In [15]:
def split_train_test(random_state = 42):
    """Split predictors and response into train and test sets, then dump them.

    The predictors and response are loaded through ``utils.pickle_load`` from
    ``config_data['predictors_set_path']`` and
    ``config_data['response_set_path']``. The ``Id`` column is removed from
    the predictors before splitting. The split is stratified on the response
    and uses ``config_data['test_size']`` as the test fraction.

    Parameters
    ----------
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. The default is the value that
        used to be hard-coded, so calling ``split_train_test()`` with no
        arguments gives the same split as before.

    Returns
    -------
    X_train, y_train, X_test, y_test
        The four pieces of the split, in that order. Note that this is NOT
        the order in which ``train_test_split`` itself returns them.
    """

    # Load the X & y
    X = utils.pickle_load(config_data['predictors_set_path'])
    y = utils.pickle_load(config_data['response_set_path'])

    # `columns=` already selects the axis, so no separate `axis` argument is needed
    X = X.drop(columns = ['Id'])

    # Split the data (stratify keeps the response distribution equal in both parts)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify = y,
                                                        test_size = config_data['test_size'],
                                                        random_state = random_state)
    # Validate splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_test shape :', X_test.shape)
    print('y_test shape :', y_test.shape)

    # Dump data: index 0 of each configured path pair is for X, index 1 for y
    utils.pickle_dump(X_train, config_data['train_path'][0])
    utils.pickle_dump(y_train, config_data['train_path'][1])
    utils.pickle_dump(X_test, config_data['test_path'][0])
    utils.pickle_dump(y_test, config_data['test_path'][1])

    return X_train, y_train, X_test, y_test

In [16]:
# Check the function: run the stratified train/test split
# (the function prints the four resulting shapes)
X_train, y_train, X_test, y_test = split_train_test()

X_train shape : (176400, 11)
y_train shape : (176400,)
X_test shape : (75600, 11)
y_test shape : (75600,)
