In [0]:
# Databricks notebook: Linear Regression with PySpark

# Import necessary libraries
from pyspark.sql import SparkSession  # Create and manage a Spark session.
from pyspark.ml.feature import VectorAssembler  # Combine features into a single vector for ML.
from pyspark.ml.regression import LinearRegression  # Train and apply linear regression models.
from pyspark.ml.evaluation import RegressionEvaluator  # Evaluate regression model performance.

# Step 1: Load the bank data into a PySpark DataFrame

# Obtain the Spark session for this notebook under the app name
# "LinearRegression-Bank" (getOrCreate() hands back an already-running
# session when there is one instead of building a second one).
session_builder = SparkSession.builder.appName("LinearRegression-Bank")
spark = session_builder.getOrCreate()

# The data is read from the `bank_data` table/view with Spark SQL,
# selecting every column.
load_query = "SELECT * FROM bank_data"
data_df = spark.sql(load_query)

# Print the leading rows of the DataFrame to verify the load.
data_df.show()


+-----------+---+---------------+-----------+----------------------+------------------+--------------+--------------------+
|Customer_ID|Age|Account_Balance|Loan_Amount|Loan_Approval_Duration|     Interest_Rate|Monthly_Income| Default_Probability|
+-----------+---+---------------+-----------+----------------------+------------------+--------------+--------------------+
|          1| 58|          90135|      47217|                    20| 9.089529444142698|          6491|0.026366974497252005|
|          2| 48|          36222|      22056|                     8|6.7329432007084575|         13130|  0.3764633668780496|
|          3| 34|          78373|      39059|                     7| 6.564370426710861|          9651|  0.8105533307818329|
|          4| 27|          80575|      26809|                     3| 7.502428981645953|          3167|  0.9872761293149445|
|          5| 40|          97354|      22419|                    17|10.492266647061204|          3062| 0.15041689110352818|
|       

In [0]:
# VectorAssembler needs numeric input columns, so the columns used by the
# model (the five features and the label) are cast to double beforehand.
# Customer_ID and Loan_Approval_Duration are not cast here and keep their
# original types.
from pyspark.sql.functions import col

# Columns to convert, in the order the casts are applied.
double_columns = (
    "Age",
    "Account_Balance",
    "Loan_Amount",
    "Interest_Rate",
    "Monthly_Income",
    "Default_Probability",
)

# Replace each listed column, in place, with its double-typed version.
for column_name in double_columns:
    data_df = data_df.withColumn(column_name, col(column_name).cast("double"))

# Print the schema to confirm the resulting column types.
data_df.printSchema()


root
 |-- Customer_ID: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Account_Balance: double (nullable = true)
 |-- Loan_Amount: double (nullable = true)
 |-- Loan_Approval_Duration: string (nullable = true)
 |-- Interest_Rate: double (nullable = true)
 |-- Monthly_Income: double (nullable = true)
 |-- Default_Probability: double (nullable = true)



In [0]:


# Step 2: Prepare the data

# Input columns the regression uses as predictors.
feature_columns = [
    "Age",
    "Account_Balance",
    "Loan_Amount",
    "Interest_Rate",
    "Monthly_Income",
]

# VectorAssembler packs the predictor columns into one vector column named
# "features", which is the column the regression model is later pointed at.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_df = assembler.transform(data_df)

# Keep only the feature vector and the label column for modelling.
data_prepared = assembled_df.select("features", "Default_Probability")

# Show the prepared rows to confirm the transformation.
data_prepared.show()


+--------------------+--------------------+
|            features| Default_Probability|
+--------------------+--------------------+
|[58.0,90135.0,472...|0.026366974497252005|
|[48.0,36222.0,220...|  0.3764633668780496|
|[34.0,78373.0,390...|  0.8105533307818329|
|[27.0,80575.0,268...|  0.9872761293149445|
|[40.0,97354.0,224...| 0.15041689110352818|
|[58.0,85651.0,543...|  0.5941307153521351|
|[38.0,64335.0,334...|  0.3808908566310215|
|[42.0,11965.0,447...|  0.9699143978146032|
|[30.0,25538.0,131...|  0.8421189231357087|
|[30.0,71592.0,258...|  0.8383287047111379|
|[43.0,99018.0,376...|  0.4686931597949703|
|[55.0,9110.0,2827...|  0.4148195023376652|
|[59.0,80309.0,412...| 0.27340707193070624|
|[43.0,28266.0,410...|0.056375496650927115|
|[22.0,53992.0,697...|  0.8647223762550532|
|[41.0,83948.0,240...|  0.8129010091300776|
|[21.0,99806.0,351...|  0.9997176732861306|
|[43.0,7910.0,4504...|  0.9966368370739054|
|[49.0,91982.0,155...|  0.5554317056026274|
|[57.0,1206.0,2317...|  0.768987

In [0]:

# Step 3: Split the data into training and test sets

# Weights for the random split: 80% training, 20% testing.
split_weights = [0.8, 0.2]

# A fixed seed (42) is passed so the random sampling uses the same seed on every run.
train_data, test_data = data_prepared.randomSplit(split_weights, seed=42)


In [0]:

# Step 4: Train the Linear Regression model

# Configure the estimator:
#   - "features" is the assembled vector column, i.e.
#     [Age, Account_Balance, Loan_Amount, Interest_Rate, Monthly_Income].
#   - "Default_Probability" is the numeric label: the probability that a
#     person will default on a loan.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Default_Probability",
)

# fit() trains the model on train_data: it searches for the coefficients and
# intercept of the linear equation that minimize the cost function
# (e.g., Mean Squared Error) on the training rows.
lr_model = lr.fit(train_data)



In [0]:

# Step 5: Output model coefficients and intercept

# Raw coefficient vector (weights) and intercept (bias term) of the fitted model.
print("Coefficients: ", lr_model.coefficients)  # Output feature weights.
print("Intercept: ", lr_model.intercept)  # Output model bias.

# The coefficient vector follows the order of `feature_columns` (the inputCols
# given to the VectorAssembler), so pair each weight with its column name
# instead of leaving readers to count positions ("Feature 1", "Feature 2", ...).
# Each weight is the change in the prediction per ONE unit of that feature; the
# features are unscaled, so the magnitudes are not comparable across features.
for feature_name, weight in zip(feature_columns, lr_model.coefficients.toArray()):
    print(f"  {feature_name}: {float(weight)}")
 

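# The intercept is the model's predicted value when all features are 0.
# Each coefficient is the change in the predicted target per one-unit change in its feature. Absolute coefficient values only indicate relative strength of influence when the features share a common scale. These features are unscaled (Age is in the tens, while Account_Balance and Loan_Amount are in the tens of thousands), so a larger absolute coefficient here does not by itself imply a stronger effect; standardize the features (or multiply each coefficient by its feature's spread) before ranking influence.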

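# Absolute values of the raw coefficients (features numbered in feature_columns order)

# Ranking by absolute raw coefficient, i.e. by change in the prediction per one unit of the feature:
# Feature 1 (Age): 0.006000 (largest per-unit coefficient)
# Feature 4 (Interest_Rate): 0.003158 (second largest)
# Feature 5 (Monthly_Income): 0.00000206
# Feature 3 (Loan_Amount): 0.00000150
# Feature 2 (Account_Balance): 0.000000878 (smallest per-unit coefficient)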

# Conclusion:
# Feature 1 (Age) has the largest absolute coefficient (−0.006).
# A one-unit increase in Age leads to a 0.006 decrease in the predicted Default_Probability.
# Feature 4 (Interest_Rate) has the second-largest coefficient (−0.0032), about half that of Age per unit.
# Features 2, 3, and 5 (Account_Balance, Loan_Amount, Monthly_Income) have very small per-unit coefficients, but their values run into the thousands, so their total contribution to a prediction is not necessarily minimal.


Coefficients:  [-0.006000376620951353,8.783689937575941e-07,1.4981374503054674e-06,-0.0031586697031177054,2.0604575901113477e-06]
Intercept:  0.6645449830442633


In [0]:

# Step 6: Make predictions

# Apply the trained model to the held-out test rows; the result carries a
# "prediction" column next to the original columns.
predictions = lr_model.transform(test_data)

# Show the feature vector, the actual label and the predicted value side by side.
comparison_columns = ["features", "Default_Probability", "prediction"]
predictions.select(*comparison_columns).show()


+--------------------+--------------------+-------------------+
|            features| Default_Probability|         prediction|
+--------------------+--------------------+-------------------+
|[21.0,50080.0,356...|0.028782676313338973| 0.6142986284995059|
|[22.0,53992.0,697...|  0.8647223762550532|  0.589754475654688|
|[23.0,22976.0,473...|  0.5139894891598108| 0.6097333106783765|
|[26.0,34827.0,416...|  0.6715731955927996| 0.5844243433954437|
|[27.0,66318.0,474...| 0.17701048427674682| 0.6091467008215131|
|[28.0,1854.0,2523...|0.039312139841098936| 0.5285721314603798|
|[30.0,71592.0,258...|  0.8383287047111379| 0.5695401092584892|
|[33.0,49925.0,398...|0.014544665667881929| 0.5312280391328901|
|[37.0,24776.0,256...| 0.39654278232127016|  0.496276047095216|
|[37.0,76450.0,187...|  0.8521815003185401| 0.5371657739428969|
|[37.0,99098.0,431...| 0.07586332810866392| 0.5947788256929901|
|[39.0,3693.0,1161...|   0.755137255673619| 0.4422328681550375|
|[40.0,56820.0,119...|  0.52030770090379

In [0]:

# Step 7: Evaluate the model

# One evaluator compares the "prediction" column with the
# "Default_Probability" label; RMSE is its configured metric.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Default_Probability",
)

# RMSE on the test-set predictions.
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE):", rmse)

# R² from the same evaluator: the metric name is overridden for this call
# only, via a param map passed to evaluate().
r2_override = {evaluator.metricName: "r2"}
r2 = evaluator.evaluate(predictions, r2_override)
print("R² (Coefficient of Determination):", r2)

# Conclusion: print a summary of the fitted model and its test metrics.
summary_lines = [
    "Linear Regression Model Summary:",
    f"  Coefficients: {lr_model.coefficients}",
    f"  Intercept: {lr_model.intercept}",
    f"  RMSE: {rmse}",
    f"  R²: {r2}",
]
for summary_line in summary_lines:
    print(summary_line)

#Root Mean Squared Error (RMSE): 0.35910683199481575
#What it measures: The average magnitude of the error (difference between predicted and actual values), with larger errors weighted more heavily because they are squared.

#Interpretation: A lower RMSE indicates better model performance. In this case, the RMSE is approximately 0.359, meaning a typical prediction is off by about 0.36. The Default_Probability values shown lie between 0 and 1, so an error of this size is large relative to the target's range.
#R² (Coefficient of Determination): -0.18548115618368066
#What it measures: The proportion of variance in the dependent variable (target) explained by the independent variables (features).

#Interpretation:
#Values close to 1 indicate a good fit.
#A negative R² value indicates that the model performs worse than a horizontal line (mean of the target values).
#Here, an R² of -0.185 suggests the model is performing poorly and does not explain the variance in the target variable.
#-------------------------------------------------------------------------------------------------------------------------
#Coefficients:
#[-0.006000376620951353, 8.783689937575941e-07, 1.4981374503054674e-06, -0.0031586697031177054, 2.0604575901113477e-06]

#What they represent: The weights applied to each feature in the linear regression equation.
#Interpretation:
#A positive coefficient indicates that as the feature increases, the target value increases.
#A negative coefficient indicates that as the feature increases, the target value decreases.
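#In this case (features numbered in feature_columns order):
#Feature 1 (Age): A small negative effect per unit.
#Feature 2 and 3 (Account_Balance, Loan_Amount): Very small positive per-unit effects (close to zero).
#Feature 4 (Interest_Rate): Small negative effect per unit.
#Feature 5 (Monthly_Income): Very small positive per-unit effect.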

#Intercept: 0.6645449830442633
#What it represents: The predicted value of the target variable when all features are zero.
#Interpretation: If all feature values are zero, the predicted target value is approximately 0.6645.
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++






Root Mean Squared Error (RMSE): 0.35910683199481575
R² (Coefficient of Determination): -0.18548115618368066
Linear Regression Model Summary:
  Coefficients: [-0.006000376620951353,8.783689937575941e-07,1.4981374503054674e-06,-0.0031586697031177054,2.0604575901113477e-06]
  Intercept: 0.6645449830442633
  RMSE: 0.35910683199481575
  R²: -0.18548115618368066


In [0]:
# Summary
# This model is not performing well:

# RMSE suggests the predictions are moderately far from actual values.
# R² is negative, indicating the model explains less of the variance than a trivial model (predicting the mean).
# Possible reasons for poor performance include:

# The model may not be appropriate for the data (e.g., linear regression for non-linear relationships).
# Features may not be sufficiently predictive of the target.
# Data preprocessing issues (e.g., missing values, poor scaling).

# Improving the model might involve:
# Examining feature importance and correlations.
# Trying different feature transformations or additional features.
# Experimenting with more complex models.