# Goal: To predict yearly amount spent by customers using Linear Regression in PySpark

## install pyspark

In [2]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 63kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 20.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=9a06ecc043e21cffda7893cde6b5ff48ebca6d8ea2e295ad683376c8fcbc788a
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


## importing necessary pyspark libraries

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name 'lr_ex'.
spark = (
    SparkSession.builder
    .appName('lr_ex')
    .getOrCreate()
)

In [5]:
from pyspark.ml.regression import LinearRegression

## loading the dataset

In [6]:
from google.colab import files
import io
# Prompt for a file through Colab's upload helper. In the recorded run the chosen
# file was saved as 'Ecommerce_Customers.csv', the name the next cell reads.
# `uploaded` holds the helper's return value; it is not used again in the visible
# part of the notebook.
uploaded = files.upload()


Saving Ecommerce_Customers.csv to Ecommerce_Customers.csv


In [7]:
# Load the uploaded CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
df_customers = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Check the inferred column types: three string columns and five double columns.
df_customers.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
# Preview the first five rows of the raw customer data.
df_customers.show(n=5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

## Converting the necessary features into the vector format accepted by PySpark MLlib

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# List the column names so the feature columns can be picked out by name below.
df_customers.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

## Email, Address, and Avatar are string (categorical) columns that contribute nothing useful to this prediction, so only the numerical columns are selected as features

## As noted above, the numerical columns are combined into a single vector column

In [12]:
# The four numerical predictors that are packed into one vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# The assembler writes the combined vector to a new column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [13]:
# Apply the assembler: `output` has every original column plus 'features'.
output=assembler.transform(df_customers)

In [14]:
# Preview two rows to confirm the 'features' vector column was appended.
output.show(n=2)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+----

## Yearly Amount Spent column is the target column to predict

In [15]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

In [16]:
# Preview the first five (features, label) rows.
final_data.show(n=5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



## Splitting the dataset into training and test sets in a 70:30 ratio

In [17]:
# Split into roughly 70% training / 30% test rows. The seed is fixed so that
# Restart & Run All reproduces the same split and therefore the same metrics.
# NOTE: the outputs recorded further down came from an unseeded run and will
# differ slightly until the notebook is re-executed.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                344|
|   mean|  497.1258676696652|
| stddev|  78.72749787706132|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [19]:
# Same summary for the test split, to check it looks similar to the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                156|
|   mean| 504.13923494186196|
| stddev|  80.63965184066433|
|    min|  304.1355915788555|
|    max|  708.9351848669818|
+-------+-------------------+



## Specifying the target column for Linear Regression model

In [20]:
# Configure the estimator with 'Yearly Amount Spent' as the label column.
# featuresCol is not set, so the estimator's default applies (presumably
# 'features', which is the name the assembler output was given).
lr = LinearRegression(
    labelCol='Yearly Amount Spent',
)

In [21]:
# Fit the linear regression on the training split only.
lr_model=lr.fit(train_data)

In [22]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and R² used in the following cells.
test_results= lr_model.evaluate(test_data)

## Residuals are the differences between the actual and predicted labels on the test data

In [23]:
# Show the per-row residuals on the test split (show() displays the first 20 rows).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  11.15247452536346|
|-3.6986418172420485|
|-6.2680008752670915|
| -1.229959515875521|
| 2.4804901987998846|
|  1.185229012655327|
| -5.345844224420603|
|   4.17742038783382|
|-17.655288802674022|
|-14.043270450113368|
|-6.5685335707445915|
|-11.065526894539744|
|   7.67338866820927|
| -5.818320881486159|
| -9.045561979262118|
|-13.829641982058774|
| -4.743311803929714|
|  4.907406148827363|
|  6.441596362020505|
| 5.3876181967928005|
+-------------------+
only showing top 20 rows



In [24]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

9.364739279567457

In [25]:
# R² (coefficient of determination) on the test split.
test_results.r2

0.9864266516914535

## Predicting on the test data using only the features column

In [26]:
# Drop the label so the model only sees the feature vectors.
unlabeled_data=test_data.select('features')

In [27]:
# Score the unlabeled rows; the result has 'features' plus a 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

In [28]:
# Display the first 20 predictions.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|  397.487876547264|
|[30.8794843441274...| 493.9052418020967|
|[31.0613251567161...| 493.8234589331687|
|[31.2606468698795...| 422.5565907728269|
|[31.3091926408918...|430.24022764113374|
|[31.3895854806643...|408.88438204732756|
|[31.5257524169682...| 449.3114710343025|
|[31.5316044825729...|432.33818534152874|
|[31.5702008293202...| 563.6007809440789|
|[31.5741380228732...| 558.4525426107002|
|[31.7242025238451...| 509.9564208587051|
|[31.8093003166791...| 547.8374262573809|
|[31.8209982016720...| 417.0018923450041|
|[31.8745516945853...|398.10356512775365|
|[31.8854062999117...| 399.1488349517376|
|[31.9365486184489...|441.02902687738697|
|[31.9453957483445...| 661.7632357415816|
|[32.0215955013870...|     516.664768609|
|[32.0305497162129...| 587.8328870565913|
|[32.0542618511847...| 556.4870394721902|
+--------------------+------------

In [30]:
# Display the first 20 test rows with their actual labels, for comparison with the predictions.
test_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.8794843441274...|  490.2065999848547|
|[31.0613251567161...|  487.5554580579016|
|[31.2606468698795...|  421.3266312569514|
|[31.3091926408918...|  432.7207178399336|
|[31.3895854806643...|  410.0696110599829|
|[31.5257524169682...|  443.9656268098819|
|[31.5316044825729...| 436.51560572936256|
|[31.5702008293202...|  545.9454921414049|
|[31.5741380228732...|  544.4092721605869|
|[31.7242025238451...|  503.3878872879605|
|[31.8093003166791...|  536.7718993628412|
|[31.8209982016720...| 424.67528101321335|
|[31.8745516945853...|  392.2852442462675|
|[31.8854062999117...|  390.1032729724755|
|[31.9365486184489...|  427.1993848953282|
|[31.9453957483445...|  657.0199239376519|
|[32.0215955013870...|  521.5721747578274|
|[32.0305497162129...|  594.2744834186118|
|[32.0542618511847...|   561.874657668983|
+----------

## Placing the model's predictions alongside the test_data rows (matched on the features column) to compare the predicted values with the actual values

In [35]:
# Score test_data directly: transform() keeps the input columns (features and the
# label) and appends 'prediction', so actual and predicted values already sit on
# the same row. This avoids joining two DataFrames on the 'features' vector
# column, which would multiply rows whenever two customers share identical
# feature values and adds a needless shuffle.
df_result = (
    lr_model.transform(test_data)
    .withColumnRenamed("prediction", "prediction_by_model")
    .withColumnRenamed("Yearly Amount Spent", "actual_values")
)
df_result.show()

+--------------------+------------------+-------------------+
|            features|     actual_values|prediction_by_model|
+--------------------+------------------+-------------------+
|[29.5324289670579...| 408.6403510726275|   397.487876547264|
|[30.8794843441274...| 490.2065999848547|  493.9052418020967|
|[31.0613251567161...| 487.5554580579016|  493.8234589331687|
|[31.2606468698795...| 421.3266312569514|  422.5565907728269|
|[31.3091926408918...| 432.7207178399336| 430.24022764113374|
|[31.3895854806643...| 410.0696110599829| 408.88438204732756|
|[31.5257524169682...| 443.9656268098819|  449.3114710343025|
|[31.5316044825729...|436.51560572936256| 432.33818534152874|
|[31.5702008293202...| 545.9454921414049|  563.6007809440789|
|[31.5741380228732...| 544.4092721605869|  558.4525426107002|
|[31.7242025238451...| 503.3878872879605|  509.9564208587051|
|[31.8093003166791...| 536.7718993628412|  547.8374262573809|
|[31.8209982016720...|424.67528101321335|  417.0018923450041|
|[31.874