In [1]:
import os

import pandas as pd

In [2]:
def load_data(fname):
    """
    Read a csv file into a pandas dataframe and report its shape.

    Parameters
    ----------
    fname : string
        location of the csv file to read

    Returns
    -------
    dataframe : DataFrame
        content of the csv file; its shape is also printed to stdout
    """
    dataframe = pd.read_csv(filepath_or_buffer=fname)
    print(f"Data Shape: {dataframe.shape}")
    return dataframe

In [3]:
# Load the raw credit risk dataset from its csv file.
# NOTE(review): despite its name, `fname` is bound to the DataFrame returned
# by load_data, not to a file name; later cells rely on this name.
fname = load_data("data/raw/credit_risk_dataset.csv")

Data Shape: (32581, 12)


In [4]:
# Preview the first rows of the loaded dataset.
fname.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [5]:
def split_input_output(data, target_col):
    """
    Separate a dataset into its feature columns and its target column.

    The shapes of the original data and of both returned pieces are
    printed to stdout. The input dataframe itself is left unchanged.

    Parameters
    ----------
    data : DataFrame
        dataset holding both the features and the target

    target_col : str
        name of the column to use as the target

    Returns
    -------
    X : DataFrame
        every column of ``data`` except ``target_col``

    y : Series
        the ``target_col`` column of ``data``
    """
    y = data[target_col]
    X = data.drop(columns=target_col)

    # Report the three shapes in the same order as before.
    for shape_report in (
        f"Original data shape: {data.shape}",
        f"X data shape: {X.shape}",
        f"y data shape: {y.shape}",
    ):
        print(shape_report)

    return X, y

In [6]:
# Use the "loan_status" column as the target and separate it from the
# remaining feature columns.
target_col = "loan_status"
X, y = split_input_output(fname, target_col)

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


In [7]:
# Preview the feature columns (the target column has been removed).
X.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4


In [8]:
# Preview the target values.
y.head()

0    1
1    0
2    1
3    1
4    1
Name: loan_status, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, random_state=None, stratify=None):
    """
    Split features and target into two disjoint subsets.

    Thin wrapper around scikit-learn's ``train_test_split`` that also
    prints the shape of each returned piece.

    Parameters
    ----------
    X : DataFrame
        features of the dataset to split

    y : Series or DataFrame
        target of the dataset to split

    test_size : float (0 <= test_size <= 1)
        proportion of the data placed in the second ("test") subset

    random_state : int, optional
        controls the shuffling applied to the data before the split;
        pass a fixed value to get a reproducible split

    stratify : array-like, optional
        forwarded to ``train_test_split``. Pass the class labels
        (typically ``y``) to keep the class proportions the same in both
        subsets. The default ``None`` performs a plain shuffled split,
        exactly as before this parameter existed.

    Returns
    -------
    X_train, X_test, y_train, y_test
        the split versions of X and y, in that order
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )

    # The labels always say "train"/"test", even when the caller uses the
    # two subsets for something else (e.g. validation/test).
    print(f"X train shape: {X_train.shape}")
    print(f"X test shape: {X_test.shape}")
    print(f"y train shape: {y_train.shape}")
    print(f"y test shape: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [10]:
# First split: keep 80% of the rows for training and hold out the
# remaining 20% as a pool for validation and test.
X_train, X_non_train, y_train, y_non_train = split_train_test(X, y, test_size=0.2, random_state=42)

X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064,)
y test shape: (6517,)


In [11]:
# Second split: divide the hold-out pool into validation (80% of the pool)
# and test (20% of the pool). In the printed output below, the helper's
# "train" label refers to the validation set.
# NOTE(review): this leaves roughly 16% of all rows for validation and 4%
# for test -- confirm that this ratio is intended.
X_valid, X_test, y_valid, y_test = split_train_test(X_non_train, y_non_train, test_size=0.2, random_state=42)

X train shape: (5213, 11)
X test shape: (1304, 11)
y train shape: (5213,)
y test shape: (1304,)


In [12]:
import joblib

def serialize_data(data, path):
    """
    Serialize the data into a pickle file, creating the destination
    directory first when it does not exist yet.

    Parameters
    ----------
    data: DataFrame
        data to be serialized

    path: string
        location of the pickle file

    Returns
    -------
        None
    """
    # joblib.dump fails if the parent directory is missing (e.g. on a fresh
    # checkout without data/interim/). Only filesystem paths have a parent
    # directory to create; anything else is handed to joblib untouched.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(data, path)

In [13]:
# Persist the train, test and validation splits (features and target) as
# pickle files under data/interim/.
serialize_data(X_train, path="data/interim/X_train.pkl")
serialize_data(y_train, path="data/interim/y_train.pkl")

serialize_data(X_test, path="data/interim/X_test.pkl")
serialize_data(y_test, path="data/interim/y_test.pkl")

serialize_data(X_valid, path="data/interim/X_valid.pkl")
serialize_data(y_valid, path="data/interim/y_valid.pkl")

In [14]:
import joblib

def deserialize_data(path):
    """
    Load previously serialized data back from a pickle file.

    Parameters
    ----------
    path: string
        location of the pickle file to read

    Returns
    -------
    data : DataFrame
        the deserialized data
    """
    # NOTE(review): loading a pickle file can execute code stored in it,
    # so only pass paths to files from a trusted source.
    return joblib.load(path)

In [15]:
# Reload the training features from disk; this rebinds X_train to the
# deserialized copy.
X_train = deserialize_data("data/interim/X_train.pkl")

In [16]:
# Preview the reloaded training features.
X_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32377,64,46000,RENT,2.0,PERSONAL,C,4800,11.09,0.1,Y,24
1338,26,26000,OWN,0.0,DEBTCONSOLIDATION,E,8500,16.45,0.33,N,3
7047,23,51000,MORTGAGE,3.0,PERSONAL,C,16000,13.11,0.31,Y,3
8225,22,56004,MORTGAGE,6.0,MEDICAL,A,6000,7.88,0.11,N,4
7178,24,79000,RENT,3.0,PERSONAL,C,7000,12.54,0.09,N,3
