In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder

In [2]:
# Load the Prosper loan listings from a CSV in the working directory.
df = pd.read_csv("prosperLoanData.csv")

# Preview the first five rows as a sanity check on the load.
df.head()

Unnamed: 0,ListingKey,ListingNumber,ListingCreationDate,CreditGrade,Term,LoanStatus,ClosedDate,BorrowerAPR,BorrowerRate,LenderYield,...,LP_ServiceFees,LP_CollectionFees,LP_GrossPrincipalLoss,LP_NetPrincipalLoss,LP_NonPrincipalRecoverypayments,PercentFunded,Recommendations,InvestmentFromFriendsCount,InvestmentFromFriendsAmount,Investors
0,1021339766868145413AB3B,193129,2007-08-26 19:09:29.263000000,C,36,Completed,2009-08-14 00:00:00,0.16516,0.158,0.138,...,-133.18,0.0,0.0,0.0,0.0,1.0,0,0,0.0,258
1,10273602499503308B223C1,1209647,2014-02-27 08:28:07.900000000,,36,Current,,0.12016,0.092,0.082,...,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0,1
2,0EE9337825851032864889A,81716,2007-01-05 15:00:47.090000000,HR,36,Completed,2009-12-17 00:00:00,0.28269,0.275,0.24,...,-24.2,0.0,0.0,0.0,0.0,1.0,0,0,0.0,41
3,0EF5356002482715299901A,658116,2012-10-22 11:02:35.010000000,,36,Current,,0.12528,0.0974,0.0874,...,-108.01,0.0,0.0,0.0,0.0,1.0,0,0,0.0,158
4,0F023589499656230C5E3E2,909464,2013-09-14 18:38:39.097000000,,36,Current,,0.24614,0.2085,0.1985,...,-60.27,0.0,0.0,0.0,0.0,1.0,0,0,0.0,20


### Data splitting

In [3]:
# Pull out the two prediction targets first, then keep every remaining
# column as a candidate feature.
y_classification = df['LoanStatus']   # target for the classification task
y_regression = df['BorrowerRate']     # target for the regression task
X = df.drop(columns=['LoanStatus', 'BorrowerRate'])

# A single call splits the features and both targets with the same row
# assignment, so the train pieces (and the test pieces) stay row-aligned.
# 20% of the rows are held out; random_state pins the shuffle so the split
# is reproducible across runs.
(
    X_train, X_test,
    y_class_train, y_class_test,
    y_reg_train, y_reg_test,
) = train_test_split(
    X,
    y_classification,
    y_regression,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fill missing values. Column groups are chosen by dtype on the training
# split, and every imputer learns its fill values from the training split
# only; the test split is filled with those same learned values.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Numeric columns: replace gaps with the per-column training median.
num_imputer = SimpleImputer(strategy='median')
X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = num_imputer.transform(X_test[numerical_cols])

# Object (string) columns: replace gaps with the per-column training mode.
# NOTE(review): some object columns are blank for whole groups of rows in the
# preview (e.g. CreditGrade / ClosedDate on "Current" loans); confirm that
# filling them with the most frequent value is intended.
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = cat_imputer.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = cat_imputer.transform(X_test[categorical_cols])

In [5]:
# Apply a Yeo-Johnson power transformation to the numeric columns selected in
# the imputation cell (numerical_cols). The transformer's parameters are
# fitted on the training split only and then reused unchanged on the test
# split, so no test-set statistics influence the fit.
# With PowerTransformer's default standardize=True, the transformed columns
# are also scaled to zero mean and unit variance.
pt = PowerTransformer(method='yeo-johnson')
X_train[numerical_cols] = pt.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = pt.transform(X_test[numerical_cols])

In [6]:
# Define columns for ordinal and one-hot encoding
ordinal_cols = ['CreditGrade', 'ProsperRating (Alpha)', 'IncomeRange', 'LoanOriginationQuarter', 'EmploymentStatus']
one_hot_cols = ['BorrowerState', 'Occupation']

# Ordinal encoding: each category string is replaced by an integer code
# learned from the training split only.
# A category that shows up only in the test split is encoded as -1 instead of
# raising, which mirrors handle_unknown='ignore' on the one-hot encoder below.
# NOTE(review): with the default categories='auto', the codes follow the
# sorted order of the unique values (lexicographic for strings), not a
# semantic ranking. If the rank matters downstream, pass explicit
# `categories=` lists for these columns.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[ordinal_cols] = ordinal_encoder.fit_transform(X_train[ordinal_cols])
X_test[ordinal_cols] = ordinal_encoder.transform(X_test[ordinal_cols])

# One-hot encoding: dense 0/1 indicator columns, one per category seen in the
# training split; categories unseen during fit become all-zero rows.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
one_hot_encoded_train = one_hot_encoder.fit_transform(X_train[one_hot_cols])
one_hot_encoded_test = one_hot_encoder.transform(X_test[one_hot_cols])

# Wrap the encoded arrays in DataFrames that reuse the original row index so
# the concat below aligns row-for-row with the remaining features.
one_hot_feature_names = one_hot_encoder.get_feature_names_out(one_hot_cols)
one_hot_encoded_train_df = pd.DataFrame(one_hot_encoded_train, columns=one_hot_feature_names, index=X_train.index)
one_hot_encoded_test_df = pd.DataFrame(one_hot_encoded_test, columns=one_hot_feature_names, index=X_test.index)

# Swap the raw one-hot source columns for their indicator columns.
# NOTE: this cell is not idempotent; after it runs, one_hot_cols no longer
# exist in X_train / X_test, so re-running it requires re-running the earlier
# cells first.
X_train = pd.concat([X_train.drop(one_hot_cols, axis=1), one_hot_encoded_train_df], axis=1)
X_test = pd.concat([X_test.drop(one_hot_cols, axis=1), one_hot_encoded_test_df], axis=1)

