# Linear Regression

In [None]:
import numpy as np
import pyspark.sql.functions as F
import os

In [None]:
# Show what is inside the 'work' directory so we can confirm the dataset is there.
work_entries = os.listdir('work')
display(work_entries)

### Step 1: Create Spark object

In [None]:
# No environment path hacking required!
from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("HousingModelDocker")
    .getOrCreate()
)

### Step 2: Create Dataframe

In [None]:
import os

# Re-check the 'work' directory contents right before loading the CSV from it.
work_entries = os.listdir('work')
display(work_entries)

In [None]:
# Load the housing CSV: the first row holds the column names, and Spark is asked
# to infer each column's type instead of reading everything as strings.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('work/housing_prices.csv')
)

# Print the resulting column names and types.
data.printSchema()

### Step 3: Data Cleansing

#### Counting NULL values in SINGLE column

In [None]:
# Count the NULLs in 'ocean_proximity'.
# F.when(...) yields 1 for NULL rows and NULL otherwise (no .otherwise() branch),
# and F.count only counts non-NULL values, so the result is the number of NULL rows.
data.select(
    F.count(F.when(F.col('ocean_proximity').isNull(), 1)).alias('ocean_proximity')
).show()

#### Counting NULL values in all columns

In [None]:
# Same NULL-counting expression as for a single column, built once per column of
# `data`; each result is aliased to its column name so the output is one row of counts.
data.select(
    *(
        F.count(F.when(F.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in data.columns
    )
).show()

#### Handling NULL value in ``total_bedrooms`` column

In [None]:
# Drop every row whose 'total_bedrooms' is NULL; other columns are not inspected.
filtered_data = data.na.drop(subset=['total_bedrooms'])

# Number of rows remaining after the drop (shown as the cell output).
filtered_data.count()

### Step 4: Split dataframe using ``randomSplit([0.8,0.2], seed=42)``

In [None]:
# Randomly split the cleaned rows with 0.8 / 0.2 weights; the seed is fixed at 42
# so the split is intended to be repeatable across runs.
train_data, test_data = filtered_data.randomSplit(weights=[0.8, 0.2], seed=42)

# Report how many rows landed in each part.
print('Train size ', train_data.count())
print('Test size ', test_data.count())

### Step 5: Create Features using VectorAssembler

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns that get packed into the single 'features' vector column.
feature_columns = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# The assembler combines the columns above into one vector column named 'features'.
assemblers = VectorAssembler(
    outputCol='features',
    inputCols=feature_columns,
)

### Create training data

- use train_data from the split

In [None]:
# Add the assembled 'features' vector column to the training split.
transformed_trained_data = assemblers.transform(dataset=train_data)

# Preview the first rows, including the new 'features' column.
transformed_trained_data.show()

### Create testing data

- use test_data from the split

In [None]:
# Add the assembled 'features' vector column to the test split, using the same assembler.
transformed_test_data = assemblers.transform(dataset=test_data)

# Preview the first rows, including the new 'features' column.
transformed_test_data.show()

### Create Model

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression that predicts 'median_house_value' from the 'features' vector;
# regParam sets the regularization strength.
lr = LinearRegression(
    featuresCol='features',
    labelCol='median_house_value',
    regParam=0.001,
)

# Fit on the training split only.
model = lr.fit(transformed_trained_data)

# Score the held-out test split and preview the predictions the model adds.
predictions = model.transform(transformed_test_data)
predictions.show()

### Evaluate using 

- Mean Absolute Error

Imagine you're guessing how many candies are in a big jar.

You guess 10 candies. There are actually 12 candies.

Your mistake is 2 candies too few.

Your friend guesses 15 candies. They are 3 candies too many.

Now, Mean Absolute Error (MAE) is like asking: "On average, how far off were our guesses?"

Step 1: Take the size of each mistake (ignore if too high or too low):

Your mistake: 2 candies

Friend's mistake: 3 candies

Step 2: Average them: (2 + 3) ÷ 2 = 2.5 candies

MAE = 2.5

That means: "Our guesses were wrong by 2.5 candies on average."

Lower MAE = Better guessing! 🎯

Real example: If your model predicts house prices, MAE = $20,000 means "On average, my predictions are off by $20,000."


In [154]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the 'prediction' column against the true
# 'median_house_value' label using the mean absolute error ('mae') metric.
evaluator_mae = RegressionEvaluator(
    metricName='mae',
    labelCol='median_house_value',
    predictionCol='prediction',
)

# Compute the MAE over the test-set predictions and report it.
mae = evaluator_mae.evaluate(predictions)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 56292.33482622761


# Model Evaluation

- Mean Absolute Error

Imagine you're guessing how many candies are in a big jar.

You guess 10 candies. There are actually 12 candies.

Your mistake is 2 candies too few.

Your friend guesses 15 candies. They are 3 candies too many.

Now, Mean Absolute Error (MAE) is like asking: "On average, how far off were our guesses?"

Step 1: Take the size of each mistake (ignore if too high or too low):

Your mistake: 2 candies

Friend's mistake: 3 candies

Step 2: Average them: (2 + 3) ÷ 2 = 2.5 candies

MAE = 2.5

That means: "Our guesses were wrong by 2.5 candies on average."

Lower MAE = Better guessing! 🎯

Real example: If your model predicts house prices, MAE = $20,000 means "On average, my predictions are off by $20,000."

- Root Mean Squared Error

You guess: 10 candies (actual: 12) → Mistake = 2
Friend guesses: 15 candies (actual: 12) → Mistake = 3

MAE (like before):
Just average: (2 + 3) ÷ 2 = 2.5 candies ❌

RMSE (punishes big mistakes):
Step 1: Square the mistakes (big ones hurt more!)

Your mistake: 2² = 4

Friend's mistake: 3² = 9
Step 2: Average: (4 + 9) ÷ 2 = 6.5
Step 3: Square root: √6.5 ≈ 2.55 candies