In [3]:
# Feature Engineering
# This notebook performs feature engineering on the raw dataset to prepare it for model development.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Load the processed data
processed_path = 'data/processed/processed_credit_data.csv'
data = pd.read_csv(processed_path)

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Check for missing values in the processed data
print("\nMissing Values:")
missing_values = data.isnull().sum()
print(missing_values)

# Assuming that 'Existing Loan' is categorical and needs to be encoded
# Apply One Hot Encoding to categorical features
data_encoded = pd.get_dummies(data, columns=['Existing Loan'], drop_first=True)

# Feature engineering: Scaling numerical features
numerical_features = ['Age', 'Annual Income', 'Credit Score', 'Loan Amount', 'Debt to Income']

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to the numerical features
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])

# Display the first few rows of the dataset after feature engineering
print("\nFeature Engineered Data Preview:")
print(data_encoded.head())

# Split the data into features (X) and target (y)
X = data_encoded.drop(columns=['Has Default'])
y = data_encoded['Has Default']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the feature-engineered data (if necessary)
feature_engineered_path = 'data/processed/feature_engineered_credit_data.csv'
data_encoded.to_csv(feature_engineered_path, index=False)
print(f"\nFeature engineered data has been saved to: {feature_engineered_path}")

# Display the target variable distribution in the training set
print("\nTarget Variable Distribution in Training Set:")
print(y_train.value_counts())




Dataset Preview:
   Age  Annual Income  Credit Score  Loan Amount  Existing Loan  \
0   56          44000           560     13781.56              1   
1   69          35500           359      5492.83              0   
2   46          64000           563     12665.01              0   
3   32          29000           847     47290.40              0   
4   60          21000           309     31223.96              1   

   Debt to Income  Has Default  
0            0.35            1  
1            0.33            1  
2            0.94            1  
3            0.60            0  
4            0.15            1  

Missing Values:
Age               0
Annual Income     0
Credit Score      0
Loan Amount       0
Existing Loan     0
Debt to Income    0
Has Default       0
dtype: int64

Feature Engineered Data Preview:
        Age  Annual Income  Credit Score  Loan Amount  Debt to Income  \
0  0.835222      -1.086700     -0.092011    -1.050412       -0.769671   
1  1.703240      -1.312096     -

In [4]:
# Feature Engineering
# This notebook performs feature engineering on the raw dataset to prepare it for model development.

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import time

# Start timer
start_time = time.time()

# Your existing code to load data, split, and save

# End timer
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Time taken to generate and save the splits: {elapsed_time:.4f} seconds")


# Load the processed data
processed_path = 'data/processed/processed_credit_data.csv'
data = pd.read_csv(processed_path)

# Display the first few rows of the dataset
print("Dataset Preview:")
print(data.head())

# Check for missing values in the processed data
print("\nMissing Values:")
missing_values = data.isnull().sum()
print(missing_values)

# Assuming that 'Existing Loan' is categorical and needs to be encoded
# Apply One Hot Encoding to categorical features
data_encoded = pd.get_dummies(data, columns=['Existing Loan'], drop_first=True)

# Feature engineering: Scaling numerical features
numerical_features = ['Age', 'Annual Income', 'Credit Score', 'Loan Amount', 'Debt to Income']

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to the numerical features
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])

# Display the first few rows of the dataset after feature engineering
print("\nFeature Engineered Data Preview:")
print(data_encoded.head())

# Split the data into features (X) and target (y)
X = data_encoded.drop(columns=['Has Default'])
y = data_encoded['Has Default']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the splits into the 'data/splits/' directory
splits_dir = 'Credit Risk Prediction/notebook/data/splits/'

# Create the directory if it doesn't exist
os.makedirs(splits_dir, exist_ok=True)

# Save X_train, X_test, y_train, y_test as CSV files
X_train_path = os.path.join(splits_dir, 'X_train.csv')
X_test_path = os.path.join(splits_dir, 'X_test.csv')
y_train_path = os.path.join(splits_dir, 'y_train.csv')
y_test_path = os.path.join(splits_dir, 'y_test.csv')

X_train.to_csv(X_train_path, index=False)
X_test.to_csv(X_test_path, index=False)
y_train.to_csv(y_train_path, index=False)
y_test.to_csv(y_test_path, index=False)

print(f"\nData splits saved successfully:\n{X_train_path}\n{X_test_path}\n{y_train_path}\n{y_test_path}")


Time taken to generate and save the splits: 0.0001 seconds
Dataset Preview:
   Age  Annual Income  Credit Score  Loan Amount  Existing Loan  \
0   56          44000           560     13781.56              1   
1   69          35500           359      5492.83              0   
2   46          64000           563     12665.01              0   
3   32          29000           847     47290.40              0   
4   60          21000           309     31223.96              1   

   Debt to Income  Has Default  
0            0.35            1  
1            0.33            1  
2            0.94            1  
3            0.60            0  
4            0.15            1  

Missing Values:
Age               0
Annual Income     0
Credit Score      0
Loan Amount       0
Existing Loan     0
Debt to Income    0
Has Default       0
dtype: int64

Feature Engineered Data Preview:
        Age  Annual Income  Credit Score  Loan Amount  Debt to Income  \
0  0.835222      -1.086700     -0.092011    -1