## Initializing Spark Environment

In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook runs on any machine; fall back to the original local install path when
# the variable is not set.
SPARK_HOME = os.environ.get('SPARK_HOME', '/home/shahayush954/spark-3.4.1-bin-hadoop3')
findspark.init(SPARK_HOME)

In [2]:
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('lr_custom_example')
    .getOrCreate()
)

23/08/15 17:12:10 WARN Utils: Your hostname, ubuntu-22 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/08/15 17:12:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/15 17:12:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Load the customer CSV; the first row supplies column names and column types
# are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the loaded rows to sanity-check how the CSV was parsed.
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

## Cleaning and Transforming Data for Linear Regression

In [7]:
# Inspect the column names and the types assigned to them when the CSV was read.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [10]:
# List the column names; the numeric ones are candidates for model features.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [9]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler 

In [12]:
# Combine the four numeric customer-behaviour columns into one vector column
# named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [13]:
# Apply the assembler: the result keeps the original columns and adds the
# 'features' vector column.
data_transformed = assembler.transform(data)

In [14]:
# Confirm that the 'features' vector column is now part of the schema.
data_transformed.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Keep only what the model needs: the feature vector and the target column.
final_data = data_transformed.select('features', 'Yearly Amount Spent')

In [16]:
# Preview the feature/target pairs that will be split into train and test sets.
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [17]:
# Split into training (~70%) and testing (~30%) data.
# A fixed seed keeps the split, and every metric computed from it, reproducible
# across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [18]:
# Preview the training split.
train_data.show()

                                                                                

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.3931845423455...|  319.9288698031936|
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[30.9716756438877...|  494.6386097568927|
|[31.0472221394875...|  392.4973991890214|
|[31.0613251567161...|  487.5554580579016|
|[31.0662181616375...| 448.93329320767435|
|[31.1239743499119...|  486.9470538397658|
|[31.1280900496166...|  557.2526867470547|
|[31.1695067987115...|  427.3565308022928|
|[31.2606468698795...|  421.3266312569514|
|[31.2834474760581...|  591.7810894256675|
|[31.3091926408918...|  432.7207178399336|
|[31.3584771924370...|  495.1759504494754|
|[31.3895854806643...|  410.0696110599829|
|[31.4252268808548...|  530.7667186547619|
|[31.4459724827577...| 484.87696493512857|
|[31.4474464941278...|   418.602742095224|
|[31.5147378578019...|  489.8124879964614|
+----------

## Linear Regression

In [19]:
# Configure the estimator with 'Yearly Amount Spent' as the label column.
# No featuresCol is passed, so the library default is used
# (presumably 'features', matching the assembler's outputCol; verify).
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [20]:
# Fit the regression on the training split only; the test split stays unseen.
lr_model = lr.fit(train_data)

23/08/15 17:49:46 WARN Instrumentation: [1761500f] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [21]:
# Evaluate the fitted model on the held-out test split; the returned object
# exposes the error metrics read below.
test_results = lr_model.evaluate(test_data)

                                                                                

In [22]:
# Root mean squared error of the model on the test split.
test_results.rootMeanSquaredError

9.327079168087508

In [23]:
# Drop the target column to mimic unlabelled data the model would score later.
unlabelled_data = test_data.select('features')

In [25]:
# Score the unlabelled rows with the fitted model; the result is a new DataFrame
# (presumably with an added 'prediction' column, the library default; verify).
predictions = lr_model.transform(unlabelled_data)