In [1]:
#Machine Learning using Pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Customers').getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression


In [5]:
data = spark.read.csv('/Users/riteshtripathi/Downloads/ecomm.csv', inferSchema = True, header = True)

In [7]:
data

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [8]:
data.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [10]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
features = VectorAssembler(inputCols = ['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'], outputCol = 'Independent Features')

In [19]:
#we need to transform above 'feature' variable
output = features.transform(data)


In [20]:
output.show()
#an extra feature created which is the combination of all the features

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [22]:
#the new feature isnt clearly visible, hence lets see that feature
output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [23]:
#I want to see my column names
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features']

In [24]:
#my model will be having combination of all the independent features and dependent features
finalized = output.select('Independent Features', 'Yearly Amount Spent')

In [25]:
finalized.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [28]:
#Random split for train and test
train_data, test_data = finalized.randomSplit([0.75, 0.25])

In [29]:
#above i can provide to my model
regressor = LinearRegression(featuresCol = 'Independent Features', labelCol = 'Yearly Amount Spent')
regressor = regressor.fit(train_data)

Exception ignored in: <function JavaWrapper.__del__ at 0x7fd44ed7bee0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LinearRegression' object has no attribute '_java_obj'


In [30]:
regressor.coefficients

DenseVector([25.8065, 38.4498, 0.5736, 61.4191])

In [31]:
regressor.intercept

-1055.4161224870015

In [35]:
pred = regressor.evaluate(test_data)

In [36]:
pred.predictions.show()

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.81620065,11.8...|        266.0863409| 283.3205756684506|
|[30.97167564,11.7...|        494.6386098|487.47884184644863|
|[31.04722214,11.1...|        392.4973992|  388.329747722697|
|[31.06132516,12.3...|        487.5554581| 493.2247008827885|
|[31.26064687,13.2...|        421.3266313| 421.8753807502119|
|[31.26810421,12.1...|        423.4705332| 427.0190551626415|
|[31.6005122,12.22...|        479.1728515|460.89070776973904|
|[31.60983957,12.7...|        444.5455497| 427.1983612316096|
|[31.66104982,11.3...|        416.3583536|417.33152125069796|
|[31.7207699,11.75...|        538.7749335| 545.7526461495111|
|[31.82934646,11.2...|         385.152338| 384.3714497000183|
|[31.85125313,12.4...|        472.9922467| 464.4158157651186|
|[31.8530748,12.14...|        459.2851235| 461.6308670122246|
|[31.885