In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, trim

# Start (or reuse) the Spark session that the rest of the notebook works through
spark = (
    SparkSession.builder
    .appName("CreditRiskModel")
    .getOrCreate()
)

# Read the loans CSV: first row is the header, column types are inferred from the data
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('lending_club_loans.csv')
)


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

In [None]:

# Display schema and first few rows of the freshly loaded data
data.printSchema()
data.show(5)

# Drop irrelevant columns (as an example)
data = data.drop("id", "member_id", "url", "desc")

# Handle missing values: replace nulls in the listed columns with 0.
# Note: na.fill only touches columns whose type is compatible with the fill
# value, so a column inferred as string is left unchanged.
data = data.na.fill({
    "annual_inc": 0,
    "dti": 0,
    "loan_amnt": 0,
    "funded_amnt": 0,
    "funded_amnt_inv": 0,
    "total_pymnt": 0,
    "total_rec_int": 0
})

# Convert home_ownership to an integer code: RENT=1, OWN=2, MORTGAGE=3.
# Every other value (including null) falls through to 0.
data = data.withColumn("home_ownership", when(col("home_ownership") == "RENT", 1)
                       .when(col("home_ownership") == "OWN", 2)
                       .when(col("home_ownership") == "MORTGAGE", 3)
                       .otherwise(0))

# Clean up the purpose label. Trim FIRST, then replace the remaining (interior)
# spaces with underscores. Doing the replacement first would turn leading and
# trailing spaces into underscores and leave nothing for trim to remove.
data = data.withColumn("purpose", regexp_replace(trim(col("purpose")), " ", "_"))

# Show cleaned data
data.show(5)


In [None]:
from pyspark.sql.functions import expr

# Example: derive dti_ratio by scaling the dti column by 1/100
data = data.withColumn("dti_ratio", col("dti") / 100)

# Create Loan-to-Value (LTV) ratio: loan_amnt divided by 80% of home_value.
# Assumes a home_value column is present in the loaded data -- TODO confirm.
# The zero check is explicit because an unguarded division by zero raises
# DIVIDE_BY_ZERO when spark.sql.ansi.enabled is true; with the guard, a zero or
# null home_value yields a NULL ratio in every mode (when() without otherwise()
# returns NULL for unmatched rows).
data = data.withColumn(
    "ltv_ratio",
    when(col("home_value") != 0, col("loan_amnt") / (col("home_value") * 0.8)),
)

# Drop rows whose LTV ratio could not be computed (NULL)
data = data.filter(col("ltv_ratio").isNotNull())

# Select relevant features for modeling, plus the "default" label column
features = data.select("loan_amnt", "annual_inc", "dti", "home_ownership", "purpose", "dti_ratio", "ltv_ratio", "default")


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Assembler stage: packs the numeric input columns into a single "features" vector
assembler = VectorAssembler(inputCols=["loan_amnt", "annual_inc", "dti_ratio", "home_ownership"], outputCol="features")

# Split the UN-assembled rows into training and testing sets.
# The assembler runs as the first pipeline stage below; if the DataFrame were
# transformed here as well, it would already contain a "features" column and
# pipeline.fit would fail because the assembler's output column already exists.
train_data, test_data = features.randomSplit([0.8, 0.2], seed=42)

# Logistic regression reading the assembled vector and the "default" label
lr = LogisticRegression(featuresCol="features", labelCol="default")

# Pipeline: assemble features, then fit the classifier
pipeline = Pipeline(stages=[assembler, lr])

# Fit the whole pipeline on the training split
model = pipeline.fit(train_data)


In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted pipeline model
predictions = model.transform(test_data)

# Area under the ROC curve, computed from the raw prediction column against
# the "default" label (both keyword values below are the evaluator's defaults)
evaluator = BinaryClassificationEvaluator(
    labelCol="default",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(predictions)

# Report the metric rounded to two decimals
print(f"ROC-AUC: {roc_auc:.2f}")


import joblib
# Persist the fitted pipeline with Spark ML's native writer. A PipelineModel
# wraps JVM-side objects, so it cannot be pickled with joblib.dump.
# The target is a directory (overwritten if it exists); reload it with
# pyspark.ml.PipelineModel.load('credit_risk_model').
model.write().overwrite().save('credit_risk_model')
