In [20]:
## Use this section to suppress warnings generated by the code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [21]:
## Step 1. Import the necessary libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml.evaluation import RegressionEvaluator

from functools import reduce
from operator import and_

# Below are needed for Sequential Feature Selector
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression as SKlearnLR
from sklearn.model_selection import cross_val_score
import numpy as np

In [22]:
## Step 2. Create a SparkSession
spark = SparkSession.builder \
        .appName("Linear Regression using Spark ML") \
        .getOrCreate()

In [23]:
## Step 3. Read the data from a CSV file
sdf = spark.read.csv("sources/housing.csv", header=True, inferSchema=True)

# print the schema
sdf.printSchema()
# show top 5 rows from the dataframe
sdf.show(5)

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [24]:
## Step 4. Data Cleaning

# Clean up empty values
sdf = sdf.dropna()

# Drop the 'Id' column since it is not relevant, it's just an index
sdf = sdf.drop('Id')

# Get a list of non-string column names
non_string_columns = [name for name, dtype in sdf.dtypes if dtype != 'string']

# Define a list to hold the outlier filtering conditions
conditions = []

# Loop over each non-string column
for c in non_string_columns:
    # Calculate the lower and upper bounds for the column
    quantiles = sdf.approxQuantile(c, [0.25, 0.75], 0.05)
    IQR = quantiles[1] - quantiles[0]
    lower_bound = quantiles[0] - 1.5 * IQR
    upper_bound = quantiles[1] + 1.5 * IQR

    # Add a condition to filter out outliers in the column
    conditions.append((col(c) >= lower_bound) & (col(c) <= upper_bound))

# Apply the conditions to the DataFrame and then select only the non-string columns
sdf = sdf.filter(reduce(and_, conditions))
sdf = sdf.select(*[col(c) for c in non_string_columns])

Below, we will perform cross validation using Sequential Feature Selector (SFS) from mlxtend in order to select the features. For that reason, the sample data will be converted to Pandas. And since the data size is small here, we can easily use 80%-100% of the Spark dataframe for the sample size. However, if you plan to use the same SFS for a much larger dataset, you should consider using a smaller percentage for the sample size.

In [25]:
## Step 5. Sample the data for cross validation
# Note 1: No need to reduce the data size for a sample if it's a small dataset, but below we do so for demonstration purposes
# Note 2: You should get a smaller sample size if the Spark DataFrame is too large. But for this example, we will use 80% sample size
sampled_data = sdf.sample(False, 0.8)

# Print total number of rows in the original and sampled data
print("Total rows in original DataFrame: ", sdf.count())
print("Total rows in sampled DataFrame: ", sampled_data.count())

# Convert the sampled data to Pandas DataFrame so that we can use it for cross validation in the next step
pandas_df = sampled_data.toPandas()

Total rows in original DataFrame:  701
Total rows in sampled DataFrame:  554


In [26]:
## Step 6. Perform cross validation using Sequential Feature Selector
# X is your matrix of features and y is the target variable
X = pandas_df.drop('SalePrice', axis=1)
y = pandas_df['SalePrice']

# Create a linear regression model with scikit-learn
lr = SKlearnLR()

# Define the range of features to consider
k_features_range = range(1, X.shape[1] + 1)  # From 1 to the total number of features in X

# Dictionary to hold the results and selected features
results = {}
selected_features = {}

for k in k_features_range:
    sfs = SFS(lr, 
              k_features=k, 
              forward=True, 
              floating=False, 
              scoring='r2',
              cv=5)

    # Fit the model
    sfs.fit(X, y)
    
    # Store the mean cross-validated score
    mean_score = np.mean(cross_val_score(lr, X.iloc[:, list(sfs.k_feature_idx_)], y, cv=5, scoring='r2'))
    results[k] = mean_score
    selected_features[k] = X.columns[list(sfs.k_feature_idx_)].tolist()
    print(f'Number of features: {k}, Cross-Validated R²: {mean_score}')

# Find the number of features with the best mean score
best_feature_count = max(results, key=results.get)
best_score = results[best_feature_count]
best_features = selected_features[best_feature_count]

print(f'Best number of features: {best_feature_count}, with R² score: {best_score}')
print(f'Selected features for the best result: {best_features}')

Number of features: 1, Cross-Validated R²: 0.7009031133150757
Number of features: 2, Cross-Validated R²: 0.7846376761569269
Number of features: 3, Cross-Validated R²: 0.8148884210614884
Number of features: 4, Cross-Validated R²: 0.8408845137501378
Number of features: 5, Cross-Validated R²: 0.8468133599848173
Number of features: 6, Cross-Validated R²: 0.848923039513628
Number of features: 7, Cross-Validated R²: 0.8509169200674951
Number of features: 8, Cross-Validated R²: 0.8537653511604335
Number of features: 9, Cross-Validated R²: 0.8541893184346117
Number of features: 10, Cross-Validated R²: 0.8546632155507012
Number of features: 11, Cross-Validated R²: 0.8548624941468201
Number of features: 12, Cross-Validated R²: 0.8548755765059696
Number of features: 13, Cross-Validated R²: 0.8548755765059699
Number of features: 14, Cross-Validated R²: 0.8548755765059696
Number of features: 15, Cross-Validated R²: 0.8548755765059693
Number of features: 16, Cross-Validated R²: 0.8548755765059701
Nu

In [27]:
## Step 7. Select the columns printed from the previous step as the features, and create a vector assembler
inputCols = best_features

assembler = VectorAssembler(inputCols=inputCols, outputCol="features")

In [28]:
## Step 8. Transform the selected columns from the Spark DataFrame
sdf_transformed = assembler.transform(sdf)

In [29]:
## Step 9. Show the assembled features and the target column "SalePrice"
sdf_transformed.select("features","SalePrice").show(10)

+--------------------+---------+
|            features|SalePrice|
+--------------------+---------+
|[60.0,8450.0,7.0,...|   208500|
|[60.0,11250.0,7.0...|   223500|
|[60.0,14260.0,8.0...|   250000|
|[20.0,10084.0,8.0...|   307000|
|[20.0,11200.0,5.0...|   129500|
|[20.0,10652.0,7.0...|   279500|
|[20.0,13695.0,5.0...|   159000|
|[20.0,7560.0,5.0,...|   139000|
|[120.0,4224.0,5.0...|   129900|
|[20.0,14230.0,8.0...|   256300|
+--------------------+---------+
only showing top 10 rows



In [30]:
## Step 10. Split the transformed data into training and test sets
(training_data, testing_data) = sdf_transformed.randomSplit([0.7, 0.3], seed=42)

In [31]:
## Step 11. Create the Spark Linear Regression model
lr = SparkLR(featuresCol="features", labelCol="SalePrice", regParam=0.1)

In [32]:
## Step 12. Fit the model using the training data
model = lr.fit(training_data)

# Ignore the warning messages (if any)

In [33]:
## Step 13. Evaluate the model using the testing data
predictions = model.transform(testing_data)

In [34]:
## Step 14. Select and display the predictions alongside the actual values for the target column
predictions.select("features", "SalePrice", "prediction").show(10)

+--------------------+---------+------------------+
|            features|SalePrice|        prediction|
+--------------------+---------+------------------+
|[20.0,5400.0,5.0,...|   117000|108725.79427925334|
|[20.0,6173.0,5.0,...|   125500|112979.24340482312|
|[20.0,6600.0,5.0,...|   109500|110079.41919955844|
|[20.0,6627.0,3.0,...|    72500|61598.087196754524|
|[20.0,6853.0,8.0,...|   220000| 213776.5989658574|
|[20.0,6951.0,5.0,...|   119500|129782.49834754248|
|[20.0,6993.0,5.0,...|   135000|  143675.353248809|
|[20.0,7064.0,5.0,...|   135000|131630.12589829927|
|[20.0,7100.0,5.0,...|   129900|111787.88418934005|
|[20.0,7153.0,6.0,...|   179200|170067.49157458544|
+--------------------+---------+------------------+
only showing top 10 rows



In [35]:
## Step 15. Evaluate the model further using R squared, Root Mean Squared Error, and Mean Absolute Error
evaluator = RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("R Squared =", r2)

evaluator = RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)

evaluator = RegressionEvaluator(labelCol="SalePrice", predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("MAE =", mae)

R Squared = 0.864780142946739
RMSE = 17383.280837895938
MAE = 13543.001991194576


In [36]:
## Step 16. Print the coefficients and intercept of the model
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

Coefficients: [-48.22251971975891,1.6723591986408233,15429.20024806852,5419.313901168996,684.8140207541009,25.34292195267414,58.53236404869137,-7497.609492184228,-5490.981782024686,-409.4829278418036,19.63748922833068,27.902194042593695,0.0,0.0,0.0,0.0]
Intercept: -1402298.8985725874


In [37]:
## Step 17. Stop the SparkSession
spark.stop()