# Machine Learning - Linear Regression

In [2]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [3]:
# Obtain the Spark session used by every cell below: an existing session is
# reused if there is one, otherwise a new one is created.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [4]:
# Directory (relative path) that holds the input data.
data_path = '../Data'
# Full path of the server-utilization CSV inside that directory.
file_path = f'{data_path}/utilization.csv'

In [5]:
# The CSV is read without a header row, so Spark assigns positional column
# names (_c0, _c1, ...); column types are inferred from the data.
df = spark.read.csv(file_path, header=False, inferSchema=True)

# rename columns: positional name -> descriptive name
column_names = {
    '_c0': 'event_datetime',
    '_c1': 'server_id',
    '_c2': 'cpu_utilization',
    '_c3': 'free_memory',
    '_c4': 'session_count',
}
for positional_name, descriptive_name in column_names.items():
    df = df.withColumnRenamed(positional_name, descriptive_name)

In [6]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
+-------------------+---------+---------------+-----------+-------------+
only showing top 5 rows



---------

# Vectorizing Features: transform using VectorAssembler

In [7]:
# Assemble the single predictor column (cpu_utilization) into a vector column
# named 'features'.
vector_assembler = VectorAssembler(
    outputCol='features',
    inputCols=['cpu_utilization'],
)

In [8]:
# Produce a new DataFrame that carries the assembled 'features' column
# alongside the original columns.
df_vutil = vector_assembler.transform(dataset=df)

In [9]:
# Preview the first five rows, including the new 'features' column.
df_vutil.show(n=5)

+-------------------+---------+---------------+-----------+-------------+--------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|features|
+-------------------+---------+---------------+-----------+-------------+--------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|  [0.57]|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|  [0.47]|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|  [0.56]|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|  [0.57]|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|  [0.35]|
+-------------------+---------+---------------+-----------+-------------+--------+
only showing top 5 rows



# Model Creation and Prediction

In [22]:
# Configure a linear regression that predicts session_count (the label) from
# the 'features' vector column.
linear_regression = LinearRegression(
    labelCol='session_count',
    featuresCol='features',
)

In [23]:
# Fit the regression on the vectorized DataFrame.
lr_model = linear_regression.fit(dataset=df_vutil)

# Coefficient and Y Intercept

In [24]:
# Fitted weight per feature; a single entry here because the 'features' vector
# was assembled from cpu_utilization only.
lr_model.coefficients

DenseVector([47.024])

In [25]:
# Fitted intercept: the predicted session_count when cpu_utilization is 0.
lr_model.intercept

40.41695103550495

# Checking RMSE (on the training data)

In [26]:
# The model summary describes the fit on the training set, so this RMSE is
# measured on the same rows the model was fitted to (no held-out data).
training_summary = lr_model.summary
training_summary.rootMeanSquaredError

12.837990225931527

--------