In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('LoanExport.csv')

# Initial inspection
print("First few rows of the dataset:\n", df.head())
print("\nDataset info:\n", df.info())
print("\nSummary statistics:\n", df.describe())


  df = pd.read_csv('LoanExport.csv')


First few rows of the dataset:
    CreditScore  FirstPaymentDate FirstTimeHomebuyer  MaturityDate    MSA  MIP  \
0            0            199902                  N        202901  16974   25   
1            0            199902                  N        202901  19740    0   
2            0            199902                  N        202901  29940    0   
3            0            199902                  N        202901  31084    0   
4            0            199902                  N        202901  35644    0   

   Units Occupancy  OCLTV  DTI  ...  PostalCode    LoanSeqNum  LoanPurpose  \
0      1         O     89   27  ...       60400  F199Q1268030            P   
1      1         O     73   17  ...       80200  F199Q1015092            N   
2      1         O     75   16  ...       66000  F199Q1266886            N   
3      1         O     76   14  ...       90700  F199Q1178167            N   
4      1         O     78   18  ...        7600  F199Q1178517            N   

  OrigLoanTe

In [2]:
# Identify missing values
print("Missing values per column:\n", df.isnull().sum())

# Fill missing values
df.fillna({
    'OrigInterestRate': df['OrigInterestRate'].mean(),  # Fill numerical with mean
    'FirstTimeHomebuyer': 'Unknown'  # Fill categorical with placeholder
}, inplace=True)

# Verify missing values after imputation
print("\nMissing values after handling:\n", df.isnull().sum())


Missing values per column:
 CreditScore               0
FirstPaymentDate          0
FirstTimeHomebuyer        0
MaturityDate              0
MSA                       0
MIP                       0
Units                     0
Occupancy                 0
OCLTV                     0
DTI                       0
OrigUPB                   0
LTV                       0
OrigInterestRate          0
Channel                   0
PPM                       0
ProductType               0
PropertyState             0
PropertyType              0
PostalCode                0
LoanSeqNum                0
LoanPurpose               0
OrigLoanTerm              0
NumBorrowers              0
SellerName            24994
ServicerName              0
EverDelinquent            0
MonthsDelinquent          0
MonthsInRepayment         0
dtype: int64

Missing values after handling:
 CreditScore               0
FirstPaymentDate          0
FirstTimeHomebuyer        0
MaturityDate              0
MSA                       0
MI

In [3]:
# Check for duplicates
print("Number of duplicates before removal:", df.duplicated().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

# Verify duplicates removal
print("Number of duplicates after removal:", df.duplicated().sum())


Number of duplicates before removal: 0
Number of duplicates after removal: 0


In [4]:
# Convert columns to appropriate data types
df['FirstPaymentDate'] = pd.to_datetime(df['FirstPaymentDate'], format='%Y%m')
df['OrigInterestRate'] = df['OrigInterestRate'].astype(float)
df['EverDelinquent'] = df['EverDelinquent'].astype(int)

# Verify data types
print("\nData types:\n", df.dtypes)



Data types:
 CreditScore                    int64
FirstPaymentDate      datetime64[ns]
FirstTimeHomebuyer            object
MaturityDate                   int64
MSA                           object
MIP                            int64
Units                          int64
Occupancy                     object
OCLTV                          int64
DTI                            int64
OrigUPB                        int64
LTV                            int64
OrigInterestRate             float64
Channel                       object
PPM                           object
ProductType                   object
PropertyState                 object
PropertyType                  object
PostalCode                    object
LoanSeqNum                    object
LoanPurpose                   object
OrigLoanTerm                   int64
NumBorrowers                  object
SellerName                    object
ServicerName                  object
EverDelinquent                 int32
MonthsDelinquent        

In [5]:
# Identify categorical variables
categorical_cols = [
    'FirstTimeHomebuyer', 'Occupancy', 'Channel', 'ProductType',
    'PropertyState', 'PropertyType', 'LoanPurpose', 'SellerName', 'ServicerName'
]

# Apply one-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Verify the result
print("\nFirst few rows of the encoded dataset:\n", df_encoded.head())



First few rows of the encoded dataset:
    CreditScore FirstPaymentDate  MaturityDate    MSA  MIP  Units  OCLTV  DTI  \
0            0       1999-02-01        202901  16974   25      1     89   27   
1            0       1999-02-01        202901  19740    0      1     73   17   
2            0       1999-02-01        202901  29940    0      1     75   16   
3            0       1999-02-01        202901  31084    0      1     76   14   
4            0       1999-02-01        202901  35644    0      1     78   18   

   OrigUPB  LTV  ...  ServicerName_GMACMTGECORP          \
0   117000   89  ...                                  0   
1   109000   73  ...                                  0   
2    88000   75  ...                                  0   
3   160000   76  ...                                  1   
4   109000   78  ...                                  1   

  ServicerName_HOMESIDELENDINGINC  ServicerName_JPMORGANCHASEBANKNA  \
0                                0                  

In [6]:
# Separate features and target variable
X = df_encoded.drop(columns=['EverDelinquent'])
y = df_encoded['EverDelinquent']

# Verify the separation
print("\nFeatures and target variable separation:\n", X.head(), y.head())



Features and target variable separation:
    CreditScore FirstPaymentDate  MaturityDate    MSA  MIP  Units  OCLTV  DTI  \
0            0       1999-02-01        202901  16974   25      1     89   27   
1            0       1999-02-01        202901  19740    0      1     73   17   
2            0       1999-02-01        202901  29940    0      1     75   16   
3            0       1999-02-01        202901  31084    0      1     76   14   
4            0       1999-02-01        202901  35644    0      1     78   18   

   OrigUPB  LTV  ...  ServicerName_GMACMTGECORP          \
0   117000   89  ...                                  0   
1   109000   73  ...                                  0   
2    88000   75  ...                                  0   
3   160000   76  ...                                  1   
4   109000   78  ...                                  1   

  ServicerName_HOMESIDELENDINGINC  ServicerName_JPMORGANCHASEBANKNA  \
0                                0                

# Building Machine Learning Models

In [7]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\nTraining and test set sizes:\n", X_train.shape, X_test.shape)



Training and test set sizes:
 (204015, 123) (87436, 123)


## Regression Model

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

if 'OrigUPB' in df_encoded.columns:
    X = df_encoded.drop(columns=['EverDelinquent', 'OrigUPB'])
    y = df_encoded['OrigUPB']
    
    # Convert datetime columns to numeric if any
    if 'FirstPaymentDate' in X.columns:
        X['FirstPaymentDate'] = X['FirstPaymentDate'].map(pd.Timestamp.toordinal)
    
    # Drop non-numeric columns
    X = X.select_dtypes(include=['number'])
    
    # Split the data for regression
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Build and train the regression model
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = reg_model.predict(X_test)
    print("\nRegression Model Performance:")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))
else:
    print("Column 'OrigUPB' not found for regression.")




Regression Model Performance:
Mean Squared Error: 2180018131.8654275
R-squared: 0.24376669845815357


## Classification Models Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build and train the Random Forest Classifier model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf_clf = rf_clf.predict(X_test)
print("\nRandom Forest Classifier Model Performance:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf_clf))
print("Accuracy Score:", accuracy_score(y_test, y_pred_rf_clf))
