In [1]:
# Environment bootstrap (Java, Spark 3.3.0, findspark). The /content paths suggest
# it targets Google Colab - TODO confirm. It is commented out, so only the two
# findspark lines at the bottom of this cell actually run.
# !apt update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# !tar -xvf spark-3.3.0-bin-hadoop3.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
import findspark
# Must run before the pyspark imports in the next cell: findspark.init() makes the
# Spark installation's pyspark package importable.
findspark.init()

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# %cd '/content/gdrive/My Drive/LDS9/Practice/Chapter6/'

/content/gdrive/My Drive/LDS9/Practice/Chapter6


In [2]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel

In [3]:
# Reuse the active SparkContext if one already exists. Calling SparkContext()
# directly raises "ValueError: Cannot run multiple SparkContexts at once" when
# this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()

In [4]:
# Wrap the SparkContext in a SparkSession; `spark` is the entry point used to
# read the CSV file below.
spark = SparkSession(sc)

## Chuẩn bị, chuẩn hóa dữ liệu, xác định input, output

In [5]:
# Load the Ecommerce Customers CSV: the first line is treated as the header
# and column types are inferred from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Ecommerce_Customers.csv")
)

In [6]:
# Print the column names and inferred types of the loaded DataFrame.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [None]:
# Report the DataFrame shape as a (row count, column count) tuple.
n_rows = data.count()
n_cols = len(data.columns)
print((n_rows, n_cols))

(500, 8)


In [None]:
# head() on a Spark DataFrame displays Row objects (unlike head() on a pandas DataFrame).
data.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)]

In [None]:
# head() with no argument returns a single Row; iterating over it yields the
# field values in column order, printed one per line.
first_row = data.head()
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names; the numeric ones are used as model features below.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [11]:
# Numeric predictor columns that are packed into the single 'features' vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
# Apply the assembler: the result is the original data plus the 'features' vector column.
data_pre = assembler.transform(data)

In [14]:
# Preview the first two assembled feature vectors without truncating the cell text.
data_pre.select('features').show(n=2, truncate=False)

+--------------------------------------------------------------------------+
|features                                                                  |
+--------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|
+--------------------------------------------------------------------------+
only showing top 2 rows



In [15]:
# Keep only the model input ('features') and the regression label ('Yearly Amount Spent').
final_data = data_pre.select('features','Yearly Amount Spent')

In [17]:
# 70/30 train/test split. The seed is fixed so that the split - and every metric
# computed from it further down - is repeatable across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                348|
|   mean|  496.0865102640647|
| stddev|  81.65120935054968|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [21]:
# Summary statistics of the label in the test split, for comparison with the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                152|
|   mean| 506.70337866711077|
| stddev|  73.41168339420653|
|    min|  314.4385182951061|
|    max|  684.1634310159512|
+-------+-------------------+



In [22]:
# Configure the regression estimator: which column holds the inputs, which holds
# the label, and the name of the column that will receive the predictions.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='Predict_Yearly Amount Spent',
)

In [23]:
# Fit the linear regression on the training split only; the test split is held out for evaluation.
lrModel = lr.fit(train_data)

In [24]:
# Show the fitted parameters: one coefficient per feature (in assembler input
# order) and the intercept. Fixes the "Coefficiets" typo in the printed label.
print(f'Coefficients: {lrModel.coefficients} Intercept: {lrModel.intercept}')

Coefficiets: [25.650894872751756,38.86225838653607,0.3801518951830216,61.76022824385161] Intercept: -1049.1141554746655


In [25]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and error metrics used in the following cells.
test_result =lrModel.evaluate(test_data)

In [26]:
# Show the first five residuals from the test-set evaluation.
test_result.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| 10.209696181128834|
|  6.307193946518964|
|-13.296535255314382|
| 0.5111930294265221|
|-18.312982781657638|
+-------------------+
only showing top 5 rows



In [28]:
# Print the test-set error metrics, one per line, in the order RMSE, MSE, r2.
metric_rows = (
    ('RMSE:', test_result.rootMeanSquaredError),
    ('MSE:', test_result.meanSquaredError),
    ('r2:', test_result.r2),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

RMSE: 9.629661007530153
MSE: 92.73037111994664
r2: 0.9826795858337282


In [29]:
# Score the test split; the result carries the 'Predict_Yearly Amount Spent'
# prediction column next to the original columns.
test_model = lrModel.transform(test_data)

In [32]:
# Compare predicted and actual yearly spend side by side for the first five test rows.
test_model.select('Predict_Yearly Amount Spent','Yearly Amount Spent').show(5)

+---------------------------+-------------------+
|Predict_Yearly Amount Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|         451.57104601510105|  461.7807421962299|
|         488.33141581037376|  494.6386097568927|
|         462.22982846298873| 448.93329320767435|
|         409.55841803055637|  410.0696110599829|
|          564.2584749230625|  545.9454921414049|
+---------------------------+-------------------+
only showing top 5 rows



In [33]:
# Persist the fitted model. write().overwrite() replaces a model saved by a
# previous run; a plain save() raises an error when the target path already
# exists, which breaks re-running the notebook from the top.
lrModel.write().overwrite().save('lrModel_Ecommerce_Customers')

In [34]:
# LinearRegressionModel is already imported in the imports cell; the repeated
# import here is redundant but harmless.
from pyspark.ml.regression import LinearRegressionModel
# Reload the model from the directory written by the save cell above.
lrModel2 = LinearRegressionModel.load('lrModel_Ecommerce_Customers')

In [35]:
# Keep only the 'features' column of the test split, i.e. drop the label, to
# stand in for new, unlabeled data.
unlabeled_data = test_data.select('features')

In [37]:
# Score the unlabeled rows with the reloaded model.
predictions = lrModel2.transform(unlabeled_data)

In [38]:
# Display the feature vectors alongside the predicted yearly amount spent.
predictions.show()

+--------------------+---------------------------+
|            features|Predict_Yearly Amount Spent|
+--------------------+---------------------------+
|[30.7377203726281...|         451.57104601510105|
|[30.9716756438877...|         488.33141581037376|
|[31.0662181616375...|         462.22982846298873|
|[31.3895854806643...|         409.55841803055637|
|[31.5702008293202...|          564.2584749230625|
|[31.5761319713222...|          543.7225862249902|
|[31.6610498227460...|          417.4199418533685|
|[31.7216523605090...|         349.47196093116463|
|[31.8209982016720...|         417.21897934964363|
|[31.8293464559211...|          384.1294209318296|
|[31.8530748017465...|         461.91362228654316|
|[31.8627411090001...|          558.7623419892514|
|[31.8648325480987...|          450.5290615648105|
|[31.8745516945853...|         398.05252963956286|
|[31.9096268275227...|          552.4008482087479|
|[31.9453957483445...|           663.540963795514|
|[31.9563005605233...|         