Import libraries

In [1]:
import os

import pandas as pd
from pymongo import MongoClient
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

Load CO2 data from MongoDB

In [2]:
# The connection string carries a username and password, so it must not live
# in the notebook. It is read from the MONGODB_URI environment variable.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )
client = MongoClient(mongo_uri)

db = client["climate_data"]
collection = db["co2_levels"]

# Project only the two fields used downstream; "_id": 0 excludes the document id.
co2_records = list(collection.find({}, {"_id": 0, "decimal_date": 1, "monthly_average": 1}))
if not co2_records:
    # Without this guard an empty result surfaces later as a KeyError on the
    # column lookups below, which hides the real cause.
    raise ValueError("No documents returned from climate_data.co2_levels")

df = pd.DataFrame(co2_records)

# Coerce both columns to numeric dtypes (presumably some values are stored as
# strings in the collection -- TODO confirm against the stored documents).
df["decimal_date"] = pd.to_numeric(df["decimal_date"])
df["monthly_average"] = pd.to_numeric(df["monthly_average"])

print(df.head())

   decimal_date  monthly_average
0     1958.2027           315.71
1     1958.2877           317.45
2     1958.3699           317.51
3     1958.4548           317.27
4     1958.5370           315.87


Connect to Spark

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
session_builder = SparkSession.builder.appName("CO2_Prediction")
spark = session_builder.getOrCreate()

# Convert the pandas frame into a Spark DataFrame and preview the first 20 rows.
spark_df = spark.createDataFrame(data=df)
spark_df.show(n=20)


25/03/03 21:17:24 WARN Utils: Your hostname, Pranavs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.131 instead (on interface en0)
25/03/03 21:17:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/03 21:17:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+------------+---------------+
|decimal_date|monthly_average|
+------------+---------------+
|   1958.2027|         315.71|
|   1958.2877|         317.45|
|   1958.3699|         317.51|
|   1958.4548|         317.27|
|    1958.537|         315.87|
|   1958.6219|         314.93|
|   1958.7068|         313.21|
|    1958.789|         312.42|
|    1958.874|         313.33|
|   1958.9562|         314.67|
|   1959.0411|         315.58|
|    1959.126|         316.49|
|   1959.2027|         316.65|
|   1959.2877|         317.72|
|   1959.3699|         318.29|
|   1959.4548|         318.15|
|    1959.537|         316.54|
|   1959.6219|          314.8|
|   1959.7068|         313.84|
|    1959.789|         313.33|
+------------+---------------+
only showing top 20 rows



Vector assembler for ML

In [4]:
# Pack the single predictor column into the "features" vector column that
# the regression stage reads.
assembler = VectorAssembler(outputCol="features", inputCols=["decimal_date"])
assembled = assembler.transform(spark_df)

# Keep only the feature vector and the label column.
data = assembled.select("features", "monthly_average")
data.show()


+-----------+---------------+
|   features|monthly_average|
+-----------+---------------+
|[1958.2027]|         315.71|
|[1958.2877]|         317.45|
|[1958.3699]|         317.51|
|[1958.4548]|         317.27|
| [1958.537]|         315.87|
|[1958.6219]|         314.93|
|[1958.7068]|         313.21|
| [1958.789]|         312.42|
| [1958.874]|         313.33|
|[1958.9562]|         314.67|
|[1959.0411]|         315.58|
| [1959.126]|         316.49|
|[1959.2027]|         316.65|
|[1959.2877]|         317.72|
|[1959.3699]|         318.29|
|[1959.4548]|         318.15|
| [1959.537]|         316.54|
|[1959.6219]|          314.8|
|[1959.7068]|         313.84|
| [1959.789]|         313.33|
+-----------+---------------+
only showing top 20 rows



In [5]:
# Random 80/20 split of the rows into train and test sets; the fixed seed
# makes the split repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=42)

Simple Linear Regression

In [6]:
# Ordinary linear regression of monthly_average on the assembled feature vector.
lr = LinearRegression(
    labelCol="monthly_average",
    featuresCol="features",
)
model = lr.fit(train_data)

# Report the fitted slope(s) and intercept.
for line in (
    f"Coefficients: {model.coefficients}",
    f"Intercept: {model.intercept}",
):
    print(line)

25/03/03 21:17:27 WARN Instrumentation: [a48d0d7f] regParam is zero, which might cause numerical instability and overfitting.
25/03/03 21:17:28 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/03/03 21:17:28 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Coefficients: [1.3144306041821148]
Intercept: -2263.0762294641427


Calculate RMSE

In [7]:
# Score the held-out rows; transform() appends a "prediction" column.
predictions = model.transform(test_data)

# Compare predictions with the true label using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="monthly_average",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 2.7848995201157676


Predict CO2 levels for 2025–2030 (six yearly points)

In [8]:
# Yearly time points to extrapolate to: 2025 through 2030 inclusive (six rows).
first_year, last_year = 2025, 2030
future_rows = [(year,) for year in range(first_year, last_year + 1)]
future_dates = spark.createDataFrame(future_rows, ["decimal_date"])

# Reuse the same assembler so the feature column is built the same way as for training.
future_features = assembler.transform(future_dates)

# Apply the fitted model to the future dates and display the predictions.
future_predictions = model.transform(future_features)
future_predictions.show()

+------------+--------+------------------+
|decimal_date|features|        prediction|
+------------+--------+------------------+
|        2025|[2025.0]| 398.6457440046397|
|        2026|[2026.0]|399.96017460882194|
|        2027|[2027.0]| 401.2746052130042|
|        2028|[2028.0]|402.58903581718596|
|        2029|[2029.0]| 403.9034664213682|
|        2030|[2030.0]| 405.2178970255504|
+------------+--------+------------------+

