In [9]:
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder.appName("practice").getOrCreate()

In [11]:
df = spark.read.csv("data.csv", header=True, inferSchema=True)

In [12]:
df.show(5)

+----+----+----------+-----------+
|city|year|weekofyear|total_cases|
+----+----+----------+-----------+
|  sj|1990|        18|          4|
|  sj|1990|        19|          5|
|  sj|1990|        20|          4|
|  sj|1990|        21|          3|
|  sj|1990|        22|          6|
+----+----+----------+-----------+
only showing top 5 rows



In [13]:
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- total_cases: integer (nullable = true)



In [14]:
from pyspark.ml.feature import VectorAssembler
feature_assmebler = VectorAssembler(inputCols=["year", "weekofyear"], outputCol="independent_features")

In [15]:
output = feature_assmebler.transform(df)

In [16]:
output.show(5)

+----+----+----------+-----------+--------------------+
|city|year|weekofyear|total_cases|independent_features|
+----+----+----------+-----------+--------------------+
|  sj|1990|        18|          4|       [1990.0,18.0]|
|  sj|1990|        19|          5|       [1990.0,19.0]|
|  sj|1990|        20|          4|       [1990.0,20.0]|
|  sj|1990|        21|          3|       [1990.0,21.0]|
|  sj|1990|        22|          6|       [1990.0,22.0]|
+----+----+----------+-----------+--------------------+
only showing top 5 rows



In [17]:
final_data = output.select("independent_features", "total_cases")

In [18]:
final_data.show(5)

+--------------------+-----------+
|independent_features|total_cases|
+--------------------+-----------+
|       [1990.0,18.0]|          4|
|       [1990.0,19.0]|          5|
|       [1990.0,20.0]|          4|
|       [1990.0,21.0]|          3|
|       [1990.0,22.0]|          6|
+--------------------+-----------+
only showing top 5 rows



In [19]:
from pyspark.ml.regression import LinearRegression
train_data, test_data = final_data.randomSplit([0.7, 0.3])
regressor = LinearRegression(featuresCol="independent_features", labelCol="total_cases")
regressor = regressor.fit(train_data)

24/04/29 17:31:33 WARN Instrumentation: [61a1bc64] regParam is zero, which might cause numerical instability and overfitting.


In [20]:
regressor.coefficients

DenseVector([-2.6046, 0.6275])

In [21]:
regressor.intercept

5220.809180220915

In [22]:
pred = regressor.evaluate(test_data)

In [23]:
pred.predictions.show(5)

+--------------------+-----------+------------------+
|independent_features|total_cases|        prediction|
+--------------------+-----------+------------------+
|       [1990.0,18.0]|          4| 48.85452311170138|
|       [1990.0,20.0]|          4|50.109554448317795|
|       [1990.0,27.0]|          6|  54.5021641264766|
|       [1990.0,28.0]|          8| 55.12967979478526|
|       [1990.0,30.0]|          6| 56.38471113140258|
+--------------------+-----------+------------------+
only showing top 5 rows



In [24]:
pred.meanAbsoluteError

20.405235142692383

In [25]:
pred.meanSquaredError

948.3048841491413

In [26]:
pred.rootMeanSquaredError

30.794559327081487