# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 98f7c99e-b69e-40e7-b767-fc777db7e0fc
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 98f7c99e-b69e-40e7-b767-fc777db7e0fc to get into ready status...
Session 98f7c99e-b69e-40e7-b767-fc777db7e0fc ha

In [2]:
data_path ="s3://sai-capstone/housing/prepared/houusing_prepared.csv" 
housing_df = spark.read.csv(data_path, header=True, inferSchema=True)




In [3]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder




In [4]:
feature_columns = [col for col in housing_df.columns if col not in ["median_house_value", "ocean_proximity"]]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
housing_prepared = assembler.transform(housing_df).select("features", "median_house_value")




In [5]:
train_data, test_data = housing_prepared.randomSplit([0.8, 0.2], seed=42)

# Train Linear Regression model
lr = LinearRegression(labelCol="median_house_value", featuresCol="features")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)
lr_rmse = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction", metricName="rmse").evaluate(lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

Linear Regression RMSE: 67052.72238626775


In [6]:
dt = DecisionTreeRegressor(labelCol="median_house_value", featuresCol="features")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)
dt_rmse = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction", metricName="rmse").evaluate(dt_predictions)
print(f"Decision Tree RMSE: {dt_rmse}")

Decision Tree RMSE: 70010.93504259548


In [7]:
rf = RandomForestRegressor(labelCol="median_house_value", featuresCol="features")
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)
rf_rmse = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction", metricName="rmse").evaluate(rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")

Random Forest RMSE: 66409.40246426823


In [8]:
evaluator = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction", metricName="rmse")

lr_rmse = evaluator.evaluate(lr_model.transform(test_data))
dt_rmse = evaluator.evaluate(dt_model.transform(test_data))
rf_rmse = evaluator.evaluate(rf_model.transform(test_data))

# Create a DataFrame to store the RMSE scores
rmse_scores_df = spark.createDataFrame([
    ("Linear Regression", lr_rmse),
    ("Decision Tree", dt_rmse),
    ("Random Forest", rf_rmse)
], ["Model", "RMSE"])

# Specify the output path for storing the RMSE scores
output_path = "s3://sai-capstone/housing/model/rmse_scores.csv"

# Write the RMSE scores DataFrame to a CSV file
rmse_scores_df.write.csv(output_path, mode="overwrite", header=True)

print("RMSE scores saved to CSV file.")

RMSE scores saved to CSV file.
