## Prepare Workspace

In [1]:
# IMPORT LIBRARIES

import numpy as np													# Numerical computations
import pandas as pd 												# Data structures
import matplotlib.pyplot as plt										# Statistic visualizations
from pandas.plotting import scatter_matrix							# Scatter matrix
import seaborn as sns												# Statistic visualizations
import statsmodels.api as sm										# Model construction
from scipy import stats			                                    # Statistics
import statsmodels.api as sm                                        # Model construction
from sklearn.linear_model import LinearRegression					# Regression model
from sklearn.preprocessing import MinMaxScaler                      # Assigns numeric ranges 0-1
from sklearn.ensemble import RandomForestRegressor					# Decision Trees
from sklearn.model_selection import train_test_split				# Test models
from sklearn.metrics import root_mean_squared_error, r2_score		# Model accuracy

In [2]:
# Load the loan dataset from the project-relative CSV and preview it
DATA_PATH = "data/data.csv"
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,interest_rate,fico_range,amount_requested,amount_funded_by_investors,loan_length
0,8.90,735-739,20000.0,20000.00,36.0
1,12.12,715-719,19200.0,19200.00,36.0
2,21.98,690-694,35000.0,35000.00,60.0
3,9.99,695-699,10000.0,9975.00,36.0
4,11.71,695-699,12000.0,12000.00,36.0
...,...,...,...,...,...
2493,16.77,705-709,30000.0,29950.00,60.0
2494,14.09,740-744,16000.0,16000.00,60.0
2495,13.99,680-684,10000.0,10000.00,36.0
2496,12.42,675-679,6000.0,6000.00,36.0


In [None]:
# Structural overview of the frame (columns, dtypes, non-null counts).
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would render a stray "None" after the report.
df.info()

# Summary statistics of the frame, rendered as a rich table
display(df.describe())

In [None]:
# How many distinct FICO ranges exist, followed by the ranges themselves
fico_ranges = df["fico_range"]
print(fico_ranges.nunique(), fico_ranges.unique(), sep="\n")

In [3]:
def _fico_midpoint(fico_range):
    """Return the truncated midpoint of a 'low-high' FICO range string (e.g. '735-739' -> 737)."""
    bounds = fico_range.split('-')
    low, high = int(bounds[0]), int(bounds[1])
    return int((low + high) / 2)


# Represent each FICO range by a single numeric score: the midpoint of its two bounds
df['fico_score'] = df['fico_range'].apply(_fico_midpoint)
df

Unnamed: 0,interest_rate,fico_range,amount_requested,amount_funded_by_investors,loan_length,fico_score
0,8.90,735-739,20000.0,20000.00,36.0,737
1,12.12,715-719,19200.0,19200.00,36.0,717
2,21.98,690-694,35000.0,35000.00,60.0,692
3,9.99,695-699,10000.0,9975.00,36.0,697
4,11.71,695-699,12000.0,12000.00,36.0,697
...,...,...,...,...,...,...
2493,16.77,705-709,30000.0,29950.00,60.0,707
2494,14.09,740-744,16000.0,16000.00,60.0,742
2495,13.99,680-684,10000.0,10000.00,36.0,682
2496,12.42,675-679,6000.0,6000.00,36.0,677


In [None]:
# How many distinct loan lengths exist, followed by the values themselves
loan_lengths = df["loan_length"]
print(loan_lengths.nunique(), loan_lengths.unique(), sep="\n")

In [None]:
# Pairwise scatter plots of the frame's columns on a large canvas
scatter_matrix(frame=df, figsize=(20, 20))
plt.show()

In [None]:
# Convert the FICO range column into ordinal (?) data (need a dictionary or some mapping of each range to its value?) -----DONE
	# ANSWER - Instead of assigning an ordinal number to represent a FICO range, I decided each range can be represented by the average of its two bounds. This can be rescaled for the model with MinMaxScaler

# Turn loan length data into encoded data (0 and 1 for 3 years and 5 years) -----DONE
	# ANSWER - One-hot encoding wasn't needed. We verified there are only two values, which MinMaxScaler maps to 0/1

# Create new column of percentage funded

# Rerun scatter matrix

# Drop the amount requested in favor of the amount funded -----DONE
	# The column wasn't deleted, but it was excluded when defining 'X'

# Scale data to fit 0 - 1 -----DONE
	# Via MinMaxScaler

# Run Linear Regression model

	# Training test

	# Get R2 value

	# OLS with Statsmodels to get the helpful table

# Draw scatter plot with red horizontal line

# Strongest relationships or predictive potential. Add visualizations

# Practical guidance to business owners

	# Human-readable plots

## Apply Model

In [4]:
# Target: the interest rate the model should predict
y = df["interest_rate"]

# Features: every remaining column except the target itself, the raw FICO range
# text (already represented numerically by fico_score) and the requested amount
excluded_cols = ['interest_rate', 'fico_range', 'amount_requested']
X = df.drop(columns=excluded_cols)

In [48]:
# Hold out part of the rows for evaluation before any fitting happens;
# the fixed random_state keeps the split reproducible across runs
split_frames = train_test_split(X, y, test_size=0.255, random_state=42)
X_train, X_test, y_train, y_test = split_frames

In [None]:
# Scale the numeric features to the 0-1 range.
#
# The scaler is fit on the training split only, so the held-out rows do not leak
# into the learned min/max. The results go into new *_scaled frames instead of
# overwriting `df`: X_train / X_test were already copied out of `df` by the cells
# above, so rescaling `df` at this point would never reach the model and would
# only leave `df` in a different state than the one displayed earlier.
#
# X_train / X_test are left in raw units on purpose: the cells that fit and query
# the regression use them directly, and the single-loan prediction passes raw
# values (dollars, months, FICO points).
num_cols = ['amount_funded_by_investors', 'loan_length', 'fico_score']
scaler = MinMaxScaler().fit(X_train[num_cols])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[num_cols] = scaler.transform(X_train[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

In [49]:
# Fit a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict interest rates for the held-out rows and show them to two decimals
y_pred = model.predict(X_test)
y_pred.round(2)

array([16.07,  4.43,  5.91, 11.1 ,  7.23, 14.18, 10.5 , 15.62, 19.08,
       13.64, 11.97, 13.59, 14.12,  7.12, 17.98, 12.62,  9.33, 18.17,
       21.06, 15.2 , 12.42, 13.93,  5.58, 19.45, 12.82, 10.42, 11.66,
       14.12, 15.37,  9.94, 15.61, 12.44, 10.12, 14.22, 14.05, 16.3 ,
       13.89, 13.28, 11.41, 14.03,  2.7 , 15.11, 13.98, 20.26, 19.65,
       14.62, 15.53, 20.46, 16.04, 22.6 , 12.34, 15.67,  4.1 , 14.96,
       13.72, 12.34, 13.66, 11.34, 14.13, 13.01,  9.96,  9.71, 14.5 ,
       15.01,  9.26,  6.42,  9.98, 13.82,  6.18, 13.51, 15.65,  9.26,
       16.84, 15.32, 15.17, 10.47, 15.31, 14.48, 14.33, 10.28, 13.08,
       12.74,  9.19, 15.35, 12.9 , 13.56, 17.03, 16.35, 13.01, 15.1 ,
       16.76, 16.2 , 13.45, 10.19, 10.12, 13.78, 18.24, 12.13, 10.87,
       11.26,  6.51, 16.27, 10.83,  7.92, 14.41,  2.38, 14.38,  9.85,
        7.  , 10.03, 15.36,  5.83, 13.89, 12.15,  8.11, 15.8 , 13.56,
       13.47, 14.33, 13.04, 14.21,  6.32, 12.41, 13.06,  8.05, 13.11,
       14.63, 13.18,

In [7]:
# OPTIONAL - Refit the regression with statsmodels to obtain the OLS summary table

# Add an explicit constant column to the training features for the intercept term
X_2 = sm.add_constant(X_train)
model_2 = sm.OLS(endog=y_train, exog=X_2).fit()

print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:          interest_rate   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     1983.
Date:                Mon, 24 Nov 2025   Prob (F-statistic):               0.00
Time:                        08:17:59   Log-Likelihood:                -4314.5
No. Observations:                1998   AIC:                             8637.
Df Residuals:                    1994   BIC:                             8659.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

In [51]:
# Coefficient of determination of the fitted model on the held-out split
test_r2 = model.score(X_test, y_test)
print(f"Test R^2 Score: {test_r2:.4f}")

Test R^2 Score: 0.7502


## Test Model

In [56]:
# Hypothetical loan to score with the fitted model
test_funded = 25000
test_length = 36
test_fico = 800

# One-row frame whose column names match the features the model was trained on
feature_names = ['amount_funded_by_investors', 'loan_length', 'fico_score']
test_vars = np.array([test_funded, test_length, test_fico]).reshape(1, -1)
test_terms = pd.DataFrame(test_vars, columns=feature_names)

predicted_rate = model.predict(test_terms)

print(f"Predicted interest rate: {predicted_rate[0]:,.2f}%")

Predicted interest rate: 6.04%
