In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("BigDataAnalytics")
    .getOrCreate()
)


In [None]:
from pyspark.sql.types import *

def _place_struct():
    """Return the struct used for a leg endpoint: nullable lat, lon and name."""
    return StructType([
        StructField("lat", DoubleType(), True),
        StructField("lon", DoubleType(), True),
        StructField("name", StringType(), True),
    ])


def _leg_struct(time_type):
    """Return the struct for one leg.

    ``time_type`` is the Spark type class used for both ``endTime`` and
    ``startTime``.
    """
    return StructType([
        StructField("endTime", time_type(), True),
        StructField("startTime", time_type(), True),
        StructField("mode", StringType(), True),
        StructField("from", _place_struct(), True),
        StructField("to", _place_struct(), True),
    ])


def _trip_struct(time_type, walk_type):
    """Return the duration / walkDistance / legs struct.

    ``time_type`` is forwarded to the leg timestamps and ``walk_type`` is the
    Spark type class of ``walkDistance``.
    """
    return StructType([
        StructField("duration", LongType(), True),
        StructField("walkDistance", walk_type(), True),
        StructField("legs", ArrayType(_leg_struct(time_type)), True),
    ])


# Explicit schema for the exported documents. ``data.plan.itineraries`` and
# ``itinerary`` use long leg timestamps and a double walkDistance, whereas
# ``transit_data`` uses string leg timestamps and a long walkDistance.
schema = StructType([
    StructField("_id", StringType(), True),
    StructField(
        "data",
        StructType([
            StructField(
                "plan",
                StructType([
                    StructField(
                        "itineraries",
                        ArrayType(_trip_struct(LongType, DoubleType)),
                        True,
                    ),
                ]),
                True,
            ),
        ]),
        True,
    ),
    StructField("source", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("transit_data", _trip_struct(StringType, LongType), True),
    StructField("itinerary", _trip_struct(LongType, DoubleType), True),
])

# Path of the exported documents to analyse.
json_file = "exported_data.json"

# Read with the explicit schema (no inference) and the multiline option enabled.
reader = spark.read.schema(schema).option("multiline", "true")
df = reader.json(json_file)

# Quick look at the parsed rows and at the resulting column tree.
df.show(truncate=False)
df.printSchema()


from pyspark.sql.functions import col, explode
# Identifier columns carried unchanged through every flattening step.
id_columns = [col("_id"), col("source"), col("timestamp")]

# Lift the nested itinerary array next to the identifier columns.
important_features_df = df.select(
    col("_id"),
    col("data.plan.itineraries").alias("itineraries"),
    col("source"),
    col("timestamp"),
)

# Flatten: one row per itinerary.
flattened_df = important_features_df.select(
    *id_columns,
    explode(col("itineraries")).alias("itinerary"),
)

# One row per leg, then promote the leg details to top-level columns.
final_df = (
    flattened_df
    .select(
        *id_columns,
        col("itinerary.duration").alias("duration"),
        col("itinerary.walkDistance").alias("walk_distance"),
        explode(col("itinerary.legs")).alias("leg"),
    )
    .select(
        *id_columns,
        col("duration"),
        col("walk_distance"),
        col("leg.startTime").alias("start_time"),
        col("leg.endTime").alias("end_time"),
        col("leg.mode").alias("mode"),
        col("leg.from.name").alias("from_name"),
        col("leg.to.name").alias("to_name"),
    )
)

final_df.show(truncate=False)


+------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Print the column tree of the parsed DataFrame.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- plan: struct (nullable = true)
 |    |    |-- itineraries: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- duration: long (nullable = true)
 |    |    |    |    |-- walkDistance: double (nullable = true)
 |    |    |    |    |-- legs: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- endTime: long (nullable = true)
 |    |    |    |    |    |    |-- startTime: long (nullable = true)
 |    |    |    |    |    |    |-- mode: string (nullable = true)
 |    |    |    |    |    |    |-- from: struct (nullable = true)
 |    |    |    |    |    |    |    |-- lat: double (nullable = true)
 |    |    |    |    |    |    |    |-- lon: double (nullable = true)
 |    |    |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |    |    |-- to: struct (nu

## Modelling

In [None]:
from pyspark.sql.functions import col, explode, from_unixtime

# Identifier columns repeated at every stage of the flattening.
id_columns = [col("_id"), col("source"), col("timestamp")]

# One row per itinerary.
flattened_df = df.select(
    *id_columns,
    explode(col("data.plan.itineraries")).alias("itinerary"),
)

# One row per leg, then promote the leg fields used downstream to columns.
final_df = (
    flattened_df
    .select(
        *id_columns,
        col("itinerary.duration").alias("duration"),
        col("itinerary.walkDistance").alias("walk_distance"),
        explode(col("itinerary.legs")).alias("leg"),
    )
    .select(
        *id_columns,
        col("duration"),
        col("walk_distance"),
        col("leg.startTime").alias("start_time"),
        col("leg.endTime").alias("end_time"),
        col("leg.mode").alias("mode"),
        col("leg.from.name").alias("from_name"),
        col("leg.to.name").alias("to_name"),
    )
)

# The leg times are divided by 1000 before being formatted by from_unixtime
# (presumably they are epoch milliseconds; confirm against the data source).
for time_column in ("start_time", "end_time"):
    final_df = final_df.withColumn(
        time_column, from_unixtime(col(time_column) / 1000)
    )

final_df.show(truncate=False)


+------------------------+------+---------+--------+------------------+-------------------+-------------------+----+------------------+------------------+
|_id                     |source|timestamp|duration|walk_distance     |start_time         |end_time           |mode|from_name         |to_name           |
+------------------------+------+---------+--------+------------------+-------------------+-------------------+----+------------------+------------------+
|674f69210af0bcf29462cbd8|NULL  |NULL     |738     |308.68238462365167|2024-12-03 20:34:01|2024-12-03 20:34:44|WALK|Origin            |Asemapäällikönkatu|
|674f69210af0bcf29462cbd8|NULL  |NULL     |738     |308.68238462365167|2024-12-03 20:34:44|2024-12-03 20:42:22|BUS |Asemapäällikönkatu|Tupasaari         |
|674f69210af0bcf29462cbd8|NULL  |NULL     |738     |308.68238462365167|2024-12-03 20:42:22|2024-12-03 20:46:19|WALK|Tupasaari         |Destination       |
|674f69210af0bcf29462cbd8|NULL  |NULL     |1302    |703.0343077081648 

In [None]:
from pyspark.sql.functions import unix_timestamp, col

# start_time / end_time are formatted timestamp strings at this point (they were
# produced by from_unixtime), so they are parsed back to epoch seconds before
# subtracting. The result is the leg duration in seconds.
processed_df = final_df.withColumn(
    "leg_duration",
    unix_timestamp(col("end_time")) - unix_timestamp(col("start_time")),
)

# Columns handed to the modelling cells; the itinerary duration is the target.
model_df = processed_df.select(
    col("source"),
    col("mode"),
    col("from_name"),
    col("to_name"),
    col("start_time"),
    col("walk_distance"),
    col("duration").alias("target_duration")
)


model_df.show(truncate=False)

+------+----+------------------+------------------+-------------------+------------------+---------------+
|source|mode|from_name         |to_name           |start_time         |walk_distance     |target_duration|
+------+----+------------------+------------------+-------------------+------------------+---------------+
|NULL  |WALK|Origin            |Asemapäällikönkatu|2024-12-03 20:34:01|308.68238462365167|738            |
|NULL  |BUS |Asemapäällikönkatu|Tupasaari         |2024-12-03 20:34:44|308.68238462365167|738            |
|NULL  |WALK|Tupasaari         |Destination       |2024-12-03 20:42:22|308.68238462365167|738            |
|NULL  |WALK|Origin            |Pasilan asema     |2024-12-03 20:27:20|703.0343077081648 |1302           |
|NULL  |BUS |Pasilan asema     |Tupasaari         |2024-12-03 20:34:41|703.0343077081648 |1302           |
|NULL  |WALK|Tupasaari         |Destination       |2024-12-03 20:45:05|703.0343077081648 |1302           |
|NULL  |WALK|Origin            |Asema

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline


# Categorical inputs: a missing value becomes the explicit "unknown" level so the
# indexer can encode it like any other category.
model_df = model_df.fillna({"source": "unknown", "mode": "unknown", "from_name": "unknown", "to_name": "unknown"})

# Rows without a label are dropped rather than filled with 0: a fabricated
# zero-duration target would be learned (and scored) as if it were real.
model_df = model_df.dropna(subset=["target_duration"])

# A missing walking distance is treated as no walking.
model_df = model_df.fillna({"walk_distance": 0.0})

# Encode each categorical column as a numeric index.
indexer = StringIndexer(
    inputCols=["source", "mode", "from_name", "to_name"],
    outputCols=["source_indexed", "mode_indexed", "from_indexed", "to_indexed"],
    handleInvalid="skip"
)

# Pack the indexed categories and the walking distance into one feature vector.
assembler = VectorAssembler(
    inputCols=["source_indexed", "mode_indexed", "from_indexed", "to_indexed", "walk_distance"],
    outputCol="features"
)

# Pipeline: fit the encoders on model_df, transform it, then split 80/20.
pipeline = Pipeline(stages=[indexer, assembler])
prepared_data = pipeline.fit(model_df).transform(model_df)
train_data, test_data = prepared_data.randomSplit([0.8, 0.2], seed=123)
prepared_data.show(5)


+-------+----+------------------+------------------+-------------------+------------------+---------------+--------------+------------+------------+----------+--------------------+
| source|mode|         from_name|           to_name|         start_time|     walk_distance|target_duration|source_indexed|mode_indexed|from_indexed|to_indexed|            features|
+-------+----+------------------+------------------+-------------------+------------------+---------------+--------------+------------+------------+----------+--------------------+
|unknown|WALK|            Origin|Asemapäällikönkatu|2024-12-03 20:34:01|308.68238462365167|            738|           0.0|         0.0|         0.0|       2.0|(5,[3,4],[2.0,308...|
|unknown| BUS|Asemapäällikönkatu|         Tupasaari|2024-12-03 20:34:44|308.68238462365167|            738|           0.0|         1.0|         2.0|       1.0|[0.0,1.0,2.0,1.0,...|
|unknown|WALK|         Tupasaari|       Destination|2024-12-03 20:42:22|308.68238462365167|    

In [None]:
# Preview the training split produced by randomSplit.
train_data.show()

+-------+----+------------------+---------+-------------------+------------------+---------------+--------------+------------+------------+----------+--------------------+
| source|mode|         from_name|  to_name|         start_time|     walk_distance|target_duration|source_indexed|mode_indexed|from_indexed|to_indexed|            features|
+-------+----+------------------+---------+-------------------+------------------+---------------+--------------+------------+------------+----------+--------------------+
|unknown| BUS|Asemapäällikönkatu|Tupasaari|2024-12-03 20:32:24|308.68238462365167|            742|           0.0|         1.0|         2.0|       1.0|[0.0,1.0,2.0,1.0,...|
|unknown| BUS|Asemapäällikönkatu|Tupasaari|2024-12-03 20:32:24|308.68238462365167|            742|           0.0|         1.0|         2.0|       1.0|[0.0,1.0,2.0,1.0,...|
|unknown| BUS|Asemapäällikönkatu|Tupasaari|2024-12-03 20:32:24|308.68238462365167|            777|           0.0|         1.0|         2.0| 

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator



# Input columns and label for the linear-regression baseline.
features = ['walk_distance', 'source_indexed', 'mode_indexed', 'from_indexed', 'to_indexed']
target = 'target_duration'

# Stage 1 assembles the inputs into "new_features"; stage 2 regresses the label on it.
assembler = VectorAssembler(
    inputCols=features,
    outputCol="new_features",
)
lr = LinearRegression(
    featuresCol="new_features",
    labelCol=target,
)
pipeline = Pipeline(stages=[assembler, lr])

# RMSE evaluator comparing the "prediction" column with the label.
evaluator = RegressionEvaluator(
    labelCol=target,
    predictionCol="prediction",
    metricName="rmse",
)

# Fit on the training split and score the held-out split.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

predictions.select('prediction', target).show(10)

Root Mean Squared Error (RMSE): 79.26075431582646
+-----------------+---------------+
|       prediction|target_duration|
+-----------------+---------------+
|799.7724026071422|            777|
|799.7724026071422|            777|
|799.7724026071422|            804|
|799.7724026071422|            804|
|799.7724026071422|            880|
|799.7724026071422|            834|
|799.7724026071422|            837|
|799.7724026071422|            837|
|799.7724026071422|            802|
|799.7724026071422|            768|
+-----------------+---------------+
only showing top 10 rows



In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# Random forest fitted directly on the pre-assembled "features" vector.
rf = RandomForestRegressor(featuresCol="features", labelCol="target_duration")
rf_model = rf.fit(train_data)
predictions = rf_model.transform(test_data)

# RMSE on the held-out split.
evaluator = RegressionEvaluator(labelCol="target_duration", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# The label column is spelled out (as everywhere else in this cell) so the cell
# does not depend on a `target` variable defined by a different cell.
predictions.select('prediction', 'target_duration').show(10)

Root Mean Squared Error (RMSE): 70.52056784263608
+-----------------+---------------+
|       prediction|target_duration|
+-----------------+---------------+
|791.9867907164725|            777|
|791.9867907164725|            777|
|791.9867907164725|            804|
|791.9867907164725|            804|
|791.9867907164725|            880|
|791.9867907164725|            834|
|791.9867907164725|            837|
|791.9867907164725|            837|
|791.9867907164725|            802|
|791.9867907164725|            768|
+-----------------+---------------+
only showing top 10 rows



In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline


# Wrap the existing feature vector and min-max scale it before the booster.
assembler = VectorAssembler(inputCols=['features'], outputCol='assembled_features')
scaler = MinMaxScaler(inputCol='assembled_features', outputCol='scaled_features')

# Carve a holdout out of the training split under its own names. Reassigning
# train_data / test_data here would shrink the training set on every re-run of
# this cell and silently change the data seen by every later cell.
gbt_train_data, gbt_holdout_data = train_data.randomSplit([0.8, 0.2], seed=42)

gbt = GBTRegressor(featuresCol="scaled_features", labelCol="target_duration", maxIter=50)
pipeline = Pipeline(stages=[assembler, scaler, gbt])

gbt_model = pipeline.fit(gbt_train_data)
predictions = gbt_model.transform(gbt_holdout_data)

# NOTE: this RMSE is measured on the holdout taken from train_data, not on
# test_data, so it is not directly comparable with a score computed on test_data.
evaluator = RegressionEvaluator(
    labelCol="target_duration", predictionCol="prediction", metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")


RMSE: 50.70121901786601


In [None]:
# Bring ten (prediction, label) rows back to the driver for a side-by-side look.
predictions_and_labels = predictions.select("prediction", "target_duration").take(10)

for sample in predictions_and_labels:
    predicted = sample['prediction']
    actual = sample['target_duration']
    print(f"Prediction: {predicted:.2f}, Actual: {actual:.2f}")


Prediction: 772.10, Actual: 777.00
Prediction: 772.10, Actual: 804.00
Prediction: 772.10, Actual: 804.00
Prediction: 772.10, Actual: 880.00
Prediction: 772.10, Actual: 834.00
Prediction: 772.10, Actual: 733.00
Prediction: 772.10, Actual: 784.00
Prediction: 772.10, Actual: 767.00
Prediction: 772.10, Actual: 835.00
Prediction: 772.10, Actual: 835.00


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import numpy as np


# Collect the training split once and build features and labels from the same
# rows, so the two arrays are aligned and only one Spark job is run.
train_rows = train_data.collect()
X = np.array([row['features'].toArray() for row in train_rows])  # dense 2-D array
y = np.array([row['target_duration'] for row in train_rows])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simple neural network: two hidden ReLU layers and a single linear output.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, batch_size=32)

# RMSE. predict() returns shape (n, 1) while y_test is (n,); subtracting them
# directly would broadcast to an (n, n) matrix, so the predictions are flattened
# for the metric only. `predictions` itself stays 2-D.
predictions = model.predict(X_test)
rmse = np.sqrt(np.mean((predictions.ravel() - y_test) ** 2))
print(f"Root Mean Squared Error (RMSE): {rmse}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 837701.0625
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 489696.5938
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 220602.7500 
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 55966.9922
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 31636.0391
Epoch 6/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 28874.7266 
Epoch 7/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 29322.6348
Epoch 8/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 28922.8086
Epoch 9/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 29801.6113
Epoch 10/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms

In [None]:
# Display the first 10 predictions next to their actual values.
for idx in range(10):
    predicted, actual = predictions[idx][0], y_test[idx]
    print(f"Prediction: {predicted}, Actual: {actual}")

Prediction: 1399.716796875, Actual: 1283
Prediction: 1389.3519287109375, Actual: 1336
Prediction: 1399.716796875, Actual: 1302
Prediction: 618.2598266601562, Actual: 820
Prediction: 628.9579467773438, Actual: 820
Prediction: 618.2598266601562, Actual: 781
Prediction: 618.2598266601562, Actual: 1079
Prediction: 618.2598266601562, Actual: 1062
Prediction: 1399.716796875, Actual: 1257
Prediction: 628.9579467773438, Actual: 789


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Collect the training split once and build features and labels from the same
# rows, so the two arrays are aligned and only one Spark job is run.
train_rows = train_data.collect()
X = np.array([row['features'].toArray() for row in train_rows])  # dense 2-D array
y = np.array([row['target_duration'] for row in train_rows])

# Min-max scale the inputs, then hold out 20% of the rows.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.2),  # Dropout to reduce overfitting
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')
model.fit(X_train, y_train, epochs=150, batch_size=32, validation_data=(X_test, y_test))
predictions = model.predict(X_test)

# RMSE. predict() returns shape (n, 1) while y_test is (n,); subtracting them
# directly would broadcast to an (n, n) matrix, so the predictions are flattened
# for the metric only. `predictions` itself stays 2-D for the loop below.
rmse = np.sqrt(np.mean((predictions.ravel() - y_test) ** 2))
print(f"Root Mean Squared Error (RMSE): {rmse}")

for pred, actual in zip(predictions[:10], y_test[:10]):
    print(f"Prediction: {pred[0]}, Actual: {actual}")

Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1038942.7500 - val_loss: 1041652.7500
Epoch 2/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1057510.6250 - val_loss: 1031469.1250
Epoch 3/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1033321.1250 - val_loss: 980534.7500
Epoch 4/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 962700.2500 - val_loss: 809743.7500
Epoch 5/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 735751.9375 - val_loss: 454913.6562
Epoch 6/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 364721.5938 - val_loss: 99715.1719
Epoch 7/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 74994.4844 - val_loss: 44258.1250
Epoch 8/150
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 39525.0391 - v

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

# l2 is used for the first layer's kernel regularizer but is not imported by this
# cell's import block; import it here so the cell runs on a fresh kernel.
from tensorflow.keras.regularizers import l2

# Collect the training split once and build features and labels from the same
# rows, so the two arrays are aligned and only one Spark job is run.
train_rows = train_data.collect()
X = np.array([row['features'].toArray() for row in train_rows])  # dense 2-D array
y = np.array([row['target_duration'] for row in train_rows])

# Scale inputs and target separately; scaler_y is kept so predictions can be
# mapped back to the original target units afterwards.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# The model: L2-regularised first layer, dropout after the first two hidden layers.
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.005)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])
optimizer = Adam(learning_rate=0.0003)
model.compile(optimizer=optimizer, loss='mse')

# Stop once val_loss has not improved for 20 epochs and keep the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=300,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Map predictions and held-out labels back to the original target scale; both
# results are column vectors of shape (n, 1).
predictions_scaled = model.predict(X_test)
predictions = scaler_y.inverse_transform(predictions_scaled)
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

for pred, actual in zip(predictions[:10], y_test_original[:10]):
    print(f"Prediction: {pred[0]:.2f}, Actual: {actual[0]:.2f}")

Epoch 1/300


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 0.3315 - val_loss: 0.1436
Epoch 2/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1437 - val_loss: 0.0768
Epoch 3/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0834 - val_loss: 0.0546
Epoch 4/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0646 - val_loss: 0.0468
Epoch 5/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0614 - val_loss: 0.0440
Epoch 6/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0558 - val_loss: 0.0419
Epoch 7/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0554 - val_loss: 0.0403
Epoch 8/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0523 - val_loss: 0.0385
Epoch 9/300
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [None]:
# Root of the mean squared difference between predictions and y_test_original.
squared_errors = np.square(predictions - y_test_original)
rmse = np.sqrt(squared_errors.mean())
print(f"RMSE: {rmse:.2f}")

RMSE: 49.32


In [None]:
# Save the current `model` to final_model.h5, a path relative to the working directory.
model.save('final_model.h5')



In [None]:
# Save the current `model` to the absolute path /content/final_model.h5, which is
# the file the download cell reads.
# NOTE(review): this may duplicate the relative-path save above if the working
# directory is /content — confirm and keep only one.
model.save('/content/final_model.h5')




In [None]:
from google.colab import files
# Download the saved model file via google.colab's files helper (Colab-only).
files.download('/content/final_model.h5')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>