In [1]:
# Clone the repo (safe to re-run if it already exists)
import os

if not os.path.exists("Loan-Approval-Prediction---New-York---2025"):
    !git clone https://github.com/remussamoila/Loan-Approval-Prediction---New-York---2025.git

%cd Loan-Approval-Prediction---New-York---2025

Cloning into 'Loan-Approval-Prediction---New-York---2025'...
remote: Enumerating objects: 204, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 204 (delta 19), reused 3 (delta 3), pack-reused 160 (from 1)[K
Receiving objects: 100% (204/204), 2.82 MiB | 8.75 MiB/s, done.
Resolving deltas: 100% (83/83), done.
/content/Loan-Approval-Prediction---New-York---2025


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](
https://colab.research.google.com/github/remussamoila/Loan-Approval-Prediction---New-York---2025/blob/main/notebooks/02_preprocessing.ipynb)


# 🧼 Preprocessing - Loan Approval

In [5]:
# 👤 Author: Anne Cojocaru
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the labelled training data and the unlabelled test data.
# Both paths are relative to the current working directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test_nolabel.csv')

  train = pd.read_csv('data/train.csv')


## 🔍 Separate Feature Types

In [4]:
# Split train's column names by dtype: int64/float64 columns are treated as
# numeric features, object-dtype columns as categorical features.
num_cols = list(train.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(train.select_dtypes(include='object').columns)

## 🧼 Handle Missing Values

In [6]:
# Numeric columns: impute with the *training* medians, for both frames, so
# that no statistic is ever computed from the test data.
train_medians = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(train_medians)

# num_cols was derived from train; only touch the columns test actually has.
test_num_cols_present = [col for col in num_cols if col in test.columns]
test[test_num_cols_present] = test[test_num_cols_present].fillna(train_medians)

# Categorical columns: impute with the training mode (most frequent value).
# The `col in test.columns` guard avoids a KeyError for any categorical
# column that exists in train but not in test.
for col in cat_cols:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    if col in test.columns:
        test[col] = test[col].fillna(mode)



## 🔁 Label Encoding

In [7]:
# Label Encoding
# Maps every categorical column to integer codes and keeps the fitted encoder
# per column in `encoders` so the mapping can be reused or inverted later.
# NOTE(review): train/test are overwritten in place, so re-running this cell
# encodes the already-encoded integer columns again.
encoders = {}
# Exclude 'id' column from label encoding if it exists in cat_cols
cat_cols_for_encoding = [col for col in cat_cols if col != 'id']

for col in cat_cols_for_encoding:
    le = LabelEncoder()
    # A column may exist only in train; guard every access to test[col].
    in_test = col in test.columns
    train_values = train[col].astype(str)

    if in_test:
        test_values = test[col].astype(str)
        # Fit on the union of train and test values so that labels which
        # appear only in test do not make transform() fail.
        known_values = pd.concat([train_values, test_values], axis=0)
    else:
        known_values = train_values
    le.fit(known_values.unique())

    # Transform both frames with the same fitted encoder
    train[col] = le.transform(train_values)
    if in_test:
        test[col] = le.transform(test_values)
    encoders[col] = le



_Note: Ensure the same preprocessing is applied to test data._

Attempt 2: I tried the encoding again,
this time with a label encoder.

In [None]:
# Apply one-hot encoding to both train and test.
# 'id' is left out: presumably it is a per-row identifier, and one-hot
# encoding it would add roughly one column per row — TODO confirm.
onehot_cols = [col for col in cat_cols if col != 'id']
train = pd.get_dummies(train, columns=onehot_cols)
# get_dummies raises a KeyError for a listed column the frame does not have,
# so only pass the columns that are actually present in test.
test = pd.get_dummies(test, columns=[col for col in onehot_cols if col in test.columns])

# Align the columns to make sure both DataFrames match.
# join='left' keeps exactly train's columns: columns test lacks are created
# and filled with 0, and columns that exist only in test are dropped.
train, test = train.align(test, join='left', axis=1, fill_value=0)


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Toy example: a single categorical column used to demonstrate LabelEncoder
data = dict(Color=['Red', 'Green', 'Blue', 'Red', 'Green', 'Red'])
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# fit_transform() learns the distinct categories and, in the same call,
# replaces every value with its integer label.
le = LabelEncoder()
color_codes = le.fit_transform(df['Color'])
df['Color_Encoded'] = color_codes

print("\nDataFrame after Label Encoding:")
print(df)

# Show which integer label each learned category was assigned
print("\nMapping of categories to numerical labels:")
class_codes = le.transform(le.classes_)
print({category: code for category, code in zip(le.classes_, class_codes)})

# New rows are encoded with transform() only, reusing the learned mapping.
# Both values used here ('Blue', 'Red') occur in the data the encoder was fitted on.
new_data = pd.DataFrame({'Color': ['Blue', 'Red']})
new_data['Color_Encoded'] = le.transform(new_data['Color'])

print("\nNew data transformed using the fitted encoder:")
print(new_data)



Original DataFrame:
   Color
0    Red
1  Green
2   Blue
3    Red
4  Green
5    Red

DataFrame after Label Encoding:
   Color  Color_Encoded
0    Red              2
1  Green              1
2   Blue              0
3    Red              2
4  Green              1
5    Red              2

Mapping of categories to numerical labels:
{'Blue': np.int64(0), 'Green': np.int64(1), 'Red': np.int64(2)}

New data transformed using the fitted encoder:
  Color  Color_Encoded
0  Blue              0
1   Red              2


Attempt 3: another try, this time with one-hot encoding.


In [None]:
# Full preprocessing pipeline: drop id -> split feature types -> impute ->
# one-hot encode -> build X, y, X_test.
# NOTE(review): this expects `train` / `test` as loaded from the CSV files;
# other cells in this notebook modify them in place — confirm before running.
TARGET = 'Accept'

# Drop the identifier column first, so it can never end up in the feature
# lists below (listing it and dropping it afterwards leads to a KeyError).
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

# Separate Feature Types (the target is not a feature)
num_cols = [
    col for col in train.select_dtypes(include=['int64', 'float64']).columns
    if col != TARGET
]
cat_cols = [
    col for col in train.select_dtypes(include='object').columns
    if col != TARGET
]

#  Handle Missing Values (numerical: median, always computed on train)
train_medians = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(train_medians)
test_num_cols = [col for col in num_cols if col in test.columns]
test[test_num_cols] = test[test_num_cols].fillna(train_medians)

#  Handle Missing Values (categorical: mode, always computed on train)
for col in cat_cols:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    if col in test.columns:
        test[col] = test[col].fillna(mode)

#  Encode Categorical Features (One-Hot) - this time with get_dummies, not LabelEncoder
train = pd.get_dummies(train, columns=cat_cols)
test = pd.get_dummies(test, columns=[col for col in cat_cols if col in test.columns])

# Ensure same columns: join='left' keeps train's columns, creating any that
# test lacks (filled with 0) and dropping columns that exist only in test.
train, test = train.align(test, join='left', axis=1, fill_value=0)

# 🎯 Split X, y, X_test
X = train.drop(TARGET, axis=1)
y = train[TARGET]
# The alignment above also gives test an all-zero target column when it had
# none; remove it so X_test has exactly the same columns as X.
X_test = test.drop(columns=TARGET, errors='ignore')

# Confirm
print("✅ Shape of X:", X.shape)
print("✅ Shape of y:", y.shape)
print("✅ Shape of X_test:", X_test.shape)

# Author's note: it's obvious what happened.
# I suggest running this in smaller parts, but I did that and the sessions crashed.