In [22]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression as sk_LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression as ps_LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler


RANDOM_STATE = 87

# create a regression dataset (pandas)
X, y = make_regression(
    n_samples=100,
    n_features=2,
    noise=10,
    random_state=RANDOM_STATE)

pandas_df = pd.DataFrame(
    {"Feature_1": X[:, 0],
    "Feature_2": X[:, 1],
    "Target": y})

# create a regression dataset (pyspark)
spark = (
    SparkSession.builder
    .appName("PySparkRegressionData")
    .getOrCreate())

spark_df = [
    (float(X[i, 0]),
    float(X[i, 1]),
    float(y[i]))
    for i in range(len(X))]

spark_df = spark.createDataFrame(
    spark_df,
    ["Feature_1",
    "Feature_2",
    "Target"])

# view datasets and info
print(pandas_df.head())
print(pandas_df.shape)
print(pandas_df.dtypes)
print(pandas_df.columns)
print(pandas_df.info())
print(pandas_df.describe())

spark_df.show(5)
print((spark_df.count(), len(spark_df.columns)))
print(spark_df.dtypes)
print(spark_df.columns)
print(spark_df.printSchema())
spark_df.summary().show()


""" 
Multiple Linear Regression with Scikit-Learn
"""

print("-=-=-=-=-=-=-=-=-=-=-=-=-\nScikit-Learn\n-=-=-=-=-=-=-=-=-=-=-=-=-")
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# train model
model = sk_LinearRegression()
model.fit(X_train, y_train)

# make predictions on test set
y_pred = model.predict(X_test)

# evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# print results
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")

print("Actual/Predicted")
for i in range(5):
    print(f"{y_test[i]:.2f} \t {y_pred[i]:.2f}")

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# make predictions on unseen data
new_data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
new_predictions = model.predict(new_data)
print(f"Features: {new_data}")
print(f"Predicted: {new_predictions}")

""" 
Multiple Linear Regression with PySpark
"""

print("-=-=-=-=-=-=-=-=-=-=-=-=-\nPySpark\n-=-=-=-=-=-=-=-=-=-=-=-=-")
# split into train and test
train_data, test_data = spark_df.randomSplit(
    [0.8, 0.2],
    seed=RANDOM_STATE)

# Combine features into a single column
assembler = VectorAssembler(
    inputCols=["Feature_1", "Feature_2"],
    outputCol="features")
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

# train model
lin_reg = ps_LinearRegression(
    featuresCol="features", labelCol="Target")
model = lin_reg.fit(train_data)

# make predictions on test set
predictions = model.transform(test_data)

# evaluate model
evaluator = RegressionEvaluator(
    labelCol="Target", predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
r2 = model.summary.r2

# print results
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")

predictions.select(
    "Target", "prediction").show(5)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# make predictions on unseen data
new_data = [(1.0, 2.0), (3.0, 4.0), (5.0, 6.0)]
new_data = spark.createDataFrame(
    new_data, ["Feature_1", "Feature_2"])
new_data = assembler.transform(new_data)
new_predictions = model.transform(new_data)
new_predictions.select("features", "prediction").show()









   Feature_1  Feature_2     Target
0  -0.520651   0.997917  43.757858
1   1.913468  -0.369210 -20.723767
2   0.380636  -0.691253 -50.779536
3  -0.809664   0.797625  51.607599
4   0.321007   0.347236  31.365276
(100, 3)
Feature_1    float64
Feature_2    float64
Target       float64
dtype: object
Index(['Feature_1', 'Feature_2', 'Target'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Feature_1  100 non-null    float64
 1   Feature_2  100 non-null    float64
 2   Target     100 non-null    float64
dtypes: float64(3)
memory usage: 2.5 KB
None
        Feature_1   Feature_2      Target
count  100.000000  100.000000  100.000000
mean     0.032582   -0.009182   -0.878280
std      0.971680    1.033750   62.157595
min     -2.803027   -2.634019 -141.229019
25%     -0.552571   -0.709232  -48.508251
50%     -0.002834   -0.084394   -5.565788
75

24/10/24 22:42:16 WARN Instrumentation: [d61a3843] regParam is zero, which might cause numerical instability and overfitting.


Coefficients: [4.31875071172504,60.65569029214616]
Intercept: -0.3658724040125461
+------------------+-------------------+
|            Target|         prediction|
+------------------+-------------------+
|-37.68229576923021|-38.849884561086874|
|  -50.779536150581|  -40.6504295377783|
|-7.019046955711484| -2.150770260409581|
| 46.54174250927497|  35.85757644918602|
|-59.72697323427095|  -57.6604470677383|
+------------------+-------------------+
only showing top 5 rows

Mean Squared Error: 110.96350191315771
R-squared: 0.9772698211382972
+---------+------------------+
| features|        prediction|
+---------+------------------+
|[1.0,2.0]|125.26425889200482|
|[3.0,4.0]|255.21314089974723|
|[5.0,6.0]|385.16202290748964|
+---------+------------------+

