In [1]:
import os, sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as fun
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [2]:
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [3]:
spark = SparkSession.builder.master("local[1]").appName("ML_LinearRegression").getOrCreate()

In [6]:
df = spark.read.csv("headbrain.csv", header=True)

In [7]:
df.show(5)

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
+------+---------+---------------+-------------------+
only showing top 5 rows



In [8]:
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age Range: string (nullable = true)
 |-- Head Size(cm^3): string (nullable = true)
 |-- Brain Weight(grams): string (nullable = true)



In [9]:
df.count(), len(df.columns)

(237, 4)

In [10]:
df.select("Head Size(cm^3)", "Brain Weight(grams)").describe().show()

+-------+------------------+-------------------+
|summary|   Head Size(cm^3)|Brain Weight(grams)|
+-------+------------------+-------------------+
|  count|               237|                237|
|   mean|3633.9915611814345|  1282.873417721519|
| stddev| 365.2614224198132| 120.34044578645734|
|    min|              2720|               1012|
|    max|              4747|                955|
+-------+------------------+-------------------+



In [12]:
# Type casting columns to int
# df.withColumn("Brain Weight(grams)", col("Brain Weight(grams)").cast("Integer"))

# Convert all columns into integer
df = df.select(*(col(c).cast("Integer").alias(c) for c in df.columns))

In [13]:
df.printSchema()

root
 |-- Gender: integer (nullable = true)
 |-- Age Range: integer (nullable = true)
 |-- Head Size(cm^3): integer (nullable = true)
 |-- Brain Weight(grams): integer (nullable = true)



In [15]:
# Check for null/missing values in data
df.toPandas().isna().sum()

Gender                 0
Age Range              0
Head Size(cm^3)        0
Brain Weight(grams)    0
dtype: int64

In [18]:
df.agg(*[fun.count(fun.when(fun.isnull(column), column)).alias(column) for column in df.columns]).show()

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     0|        0|              0|                  0|
+------+---------+---------------+-------------------+



In [43]:
# Outlier detection and treatment
# Using IQR - Interquartile Range
# We consider those values to be outliers 
# that are below - (Q1 - 1.5 x IQR) 
# or above - (Q3 + 1.5 x IQR)

columns = df.columns

def outlierTreatment(column):
    quantiles = df.approxQuantile(column, [0.25, 0.75], 0.05)
    Q1, Q3 = quantiles[0], quantiles[1]
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_df = df.filter((col(column) < lower_bound) | (col(column) > upper_bound))
    
    # Capping - Replace outliers above or below a threshold boundary values

    df_capped = df.withColumn(column,
                             fun.when(col(column) > upper_bound, upper_bound)
                             .when(col(column) < lower_bound, lower_bound)
                             .otherwise(col(column)))

    
    # return outliers_df
    return df_capped

In [40]:
columns

['Head Size(cm^3)', 'Brain Weight(grams)']

In [44]:
# output_df = outlierTreatment(columns[0])
# output_df.show()

In [45]:
# output_df = outlierTreatment(columns[1])
# output_df.show()

In [47]:
outlierTreatment(columns[0]).show(5)

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|         4512.0|               1530|
|         3738.0|               1297|
|         4261.0|               1335|
|         3777.0|               1282|
|         4177.0|               1590|
+---------------+-------------------+
only showing top 5 rows



In [19]:
# df.fillna()
# df.dropna()

In [20]:
df = df.drop(*["Gender", "Age Range"])

In [21]:
df.show(5)

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|           4512|               1530|
|           3738|               1297|
|           4261|               1335|
|           3777|               1282|
|           4177|               1590|
+---------------+-------------------+
only showing top 5 rows



In [22]:
# Drop target column to get features only
X = df.drop("Brain Weight(grams)")

In [23]:
assembler = VectorAssembler(inputCols=X.columns, outputCol="features")

In [24]:
output = assembler.transform(df).select("features", "Brain Weight(grams)")

In [26]:
output.show(5)

+--------+-------------------+
|features|Brain Weight(grams)|
+--------+-------------------+
|[4512.0]|               1530|
|[3738.0]|               1297|
|[4261.0]|               1335|
|[3777.0]|               1282|
|[4177.0]|               1590|
+--------+-------------------+
only showing top 5 rows



In [27]:
regression = LinearRegression(featuresCol="features", labelCol="Brain Weight(grams)")
model = regression.fit(output)

In [28]:
model.coefficients

DenseVector([0.2634])

In [29]:
model.intercept

325.5734210494322

In [31]:
model.summary.meanSquaredError

5201.384028002333