In [45]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, udf, lit, unix_timestamp, from_unixtime, date_format, to_timestamp, hour, minute, second, dayofmonth, month, year
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
spark = SparkSession.builder.master("local") \
                    .appName('Bicycle') \
                    .getOrCreate()

In [47]:
##loading train dataset
df_train = spark.read.option("header",True)\
                  .format("csv").load("train.csv")
#columns_to_drop = ['datetime','casual','registered','count']
#df_train = df_train.drop(*columns_to_drop)
print("Train Data:")
df_train.show()
#df_train.createTempView("train")
###loading test dataset
df_test = spark.read.option("header",True)\
                  .format("csv").load("test.csv")
#columns_to_drop_test = ['datetime']
#df_test = df_test.drop(*columns_to_drop_test)
#df_test.createTempView("test")
print("Test Data:")
df_test.show()

Train Data:
+-------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|           datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|
+-------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:00|     1|      0|         0|      1| 9.84|14.395|      81|        0|     3|        13|   16|
|2011-01-01 01:00:00|     1|      0|         0|      1| 9.02|13.635|      80|        0|     8|        32|   40|
|2011-01-01 02:00:00|     1|      0|         0|      1| 9.02|13.635|      80|        0|     5|        27|   32|
|2011-01-01 03:00:00|     1|      0|         0|      1| 9.84|14.395|      75|        0|     3|        10|   13|
|2011-01-01 04:00:00|     1|      0|         0|      1| 9.84|14.395|      75|        0|     0|         1|    1|
|2011-01-01 05:00:00|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     

In [4]:
df_train.dtypes

[('datetime', 'string'),
 ('season', 'string'),
 ('holiday', 'string'),
 ('workingday', 'string'),
 ('weather', 'string'),
 ('temp', 'string'),
 ('atemp', 'string'),
 ('humidity', 'string'),
 ('windspeed', 'string'),
 ('casual', 'string'),
 ('registered', 'string'),
 ('count', 'string')]

In [5]:
df_test.dtypes

[('datetime', 'string'),
 ('season', 'string'),
 ('holiday', 'string'),
 ('workingday', 'string'),
 ('weather', 'string'),
 ('temp', 'string'),
 ('atemp', 'string'),
 ('humidity', 'string'),
 ('windspeed', 'string')]

In [6]:
df_train.describe().show()

+-------+-------------------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|summary|           datetime|            season|            holiday|        workingday|           weather|              temp|            atemp|          humidity|         windspeed|           casual|        registered|             count|
+-------+-------------------+------------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|  count|              10886|             10886|              10886|             10886|             10886|             10886|            10886|             10886|             10886|            10886|             10886|             10886|
|   mean|               null|2.5066139996325556|

In [7]:
df_test.describe().show()

+-------+-------------------+------------------+--------------------+-------------------+------------------+------------------+------------------+-----------------+-----------------+
|summary|           datetime|            season|             holiday|         workingday|           weather|              temp|             atemp|         humidity|        windspeed|
+-------+-------------------+------------------+--------------------+-------------------+------------------+------------------+------------------+-----------------+-----------------+
|  count|               6493|              6493|                6493|               6493|              6493|              6493|              6493|             6493|             6493|
|   mean|               null|  2.49330047743724|0.029108270445094717| 0.6858154936085015|1.4367780686893579|20.620606807330972|24.012864623440585| 64.1252117665178|12.63115720006173|
| stddev|               null|1.0912579418644106| 0.16812296760854603|0.46422601479880

In [8]:
df_train.describe()

DataFrame[summary: string, datetime: string, season: string, holiday: string, workingday: string, weather: string, temp: string, atemp: string, humidity: string, windspeed: string, casual: string, registered: string, count: string]

In [9]:
df_test.describe()

DataFrame[summary: string, datetime: string, season: string, holiday: string, workingday: string, weather: string, temp: string, atemp: string, humidity: string, windspeed: string]

In [10]:
##Checking for Nan/Null values in train data
df_train.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_train.columns]).show()

+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
|datetime|season|holiday|workingday|weather|temp|atemp|humidity|windspeed|casual|registered|count|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+
|       0|     0|      0|         0|      0|   0|    0|       0|        0|     0|         0|    0|
+--------+------+-------+----------+-------+----+-----+--------+---------+------+----------+-----+



In [11]:
##Checking for Nan/Null values in test data
df_test.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_test.columns]).show()

+--------+------+-------+----------+-------+----+-----+--------+---------+
|datetime|season|holiday|workingday|weather|temp|atemp|humidity|windspeed|
+--------+------+-------+----------+-------+----+-----+--------+---------+
|       0|     0|      0|         0|      0|   0|    0|       0|        0|
+--------+------+-------+----------+-------+----+-----+--------+---------+



In [48]:
##Converting Numerical variable of season to Categorical variable using StringIndexer and IndexToString
##TRAINING DATASET
df_train = (df_train.withColumn("season_1", when(col("season")==1,1).otherwise(0))
                    .withColumn("season_2", when(col("season")==2,2).otherwise(0))
                   .withColumn("season_3", when(col("season")==3,3).otherwise(0))
                   .withColumn("season_4", when(col("season")==4,4).otherwise(0)))
df_train = df_train.drop('season')

##TESTING DATASET
df_test = (df_test.withColumn("season_1", when(col("season")==1,1).otherwise(0))
                    .withColumn("season_2", when(col("season")==2,2).otherwise(0))
                   .withColumn("season_3", when(col("season")==3,3).otherwise(0))
                   .withColumn("season_4", when(col("season")==4,4).otherwise(0)))
df_test = df_test.drop('season')

In [49]:
##Converting Numerical variable of weather to Categorical variable using StringIndexer and IndexToString
##TRAINING DATASET
df_train = (df_train.withColumn("weather_1", when(col("weather")==1,1).otherwise(0))
                   .withColumn("weather_2", when(col("weather")==2,2).otherwise(0))
                   .withColumn("weather_3", when(col("weather")==3,3).otherwise(0))
                   .withColumn("weather_4", when(col("weather")==4,4).otherwise(0)))
df_train = df_train.drop('weather')

##TESTING DATASET
df_test = (df_test.withColumn("weather_1", when(col("weather")==1,1).otherwise(0))
                   .withColumn("weather_2", when(col("weather")==2,2).otherwise(0))
                   .withColumn("weather_3", when(col("weather")==3,3).otherwise(0))
                   .withColumn("weather_4", when(col("weather")==4,4).otherwise(0)))
df_test = df_test.drop('weather')

In [50]:
df_train.show()

+-------------------+-------+----------+-----+------+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+
|           datetime|holiday|workingday| temp| atemp|humidity|windspeed|casual|registered|count|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|
+-------------------+-------+----------+-----+------+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+
|2011-01-01 00:00:00|      0|         0| 9.84|14.395|      81|        0|     3|        13|   16|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-01 01:00:00|      0|         0| 9.02|13.635|      80|        0|     8|        32|   40|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-01 02:00:00|      0|         0| 9.02|13.635|      80|        0|     5|        27|   32|       1|       0|       0|       0|   

In [51]:
df_test.show()

+-------------------+-------+----------+-----+------+--------+---------+--------+--------+--------+--------+---------+---------+---------+---------+
|           datetime|holiday|workingday| temp| atemp|humidity|windspeed|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|
+-------------------+-------+----------+-----+------+--------+---------+--------+--------+--------+--------+---------+---------+---------+---------+
|2011-01-20 00:00:00|      0|         1|10.66|11.365|      56|  26.0027|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-20 01:00:00|      0|         1|10.66|13.635|      56|        0|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-20 02:00:00|      0|         1|10.66|13.635|      56|        0|       1|       0|       0|       0|        1|        0|        0|        0|
|2011-01-20 03:00:00|      0|         1|10.66| 12.88|      56|  11.0014|       1|       0|       0|       

In [52]:
## Convernting TRAINING DATASET datatypes from String to int
df_train_new = df_train.withColumn('datetime',to_timestamp("datetime"))\
                       .withColumn('holiday',df_train["holiday"].cast(IntegerType()))\
                       .withColumn('workingday',df_train["workingday"].cast(IntegerType()))\
                       .withColumn('temp',df_train["temp"].cast(IntegerType()))\
                       .withColumn('atemp',df_train["atemp"].cast(IntegerType()))\
                       .withColumn('humidity',df_train["humidity"].cast(IntegerType()))\
                       .withColumn('windspeed',df_train["windspeed"].cast(IntegerType()))\
                       .withColumn('casual',df_train["casual"].cast(IntegerType()))\
                       .withColumn('registered',df_train["registered"].cast(IntegerType()))\
                       .withColumn('count',df_train["count"].cast(IntegerType()))

df_test_new = df_test.withColumn('datetime',to_timestamp("datetime"))\
                       .withColumn('holiday',df_test["holiday"].cast(IntegerType()))\
                       .withColumn('workingday',df_test["workingday"].cast(IntegerType()))\
                       .withColumn('temp',df_test["temp"].cast(IntegerType()))\
                       .withColumn('atemp',df_test["atemp"].cast(IntegerType()))\
                       .withColumn('humidity',df_test["humidity"].cast(IntegerType()))\
                       .withColumn('windspeed',df_test["windspeed"].cast(IntegerType()))

In [53]:
type(df_train_new)
type(df_test_new)

pyspark.sql.dataframe.DataFrame

In [84]:
print(df_train_new.dtypes)
print(df_test_new.dtypes)

[('datetime', 'timestamp'), ('holiday', 'int'), ('workingday', 'int'), ('temp', 'int'), ('atemp', 'int'), ('humidity', 'int'), ('windspeed', 'int'), ('casual', 'int'), ('registered', 'int'), ('count', 'int'), ('season_1', 'int'), ('season_2', 'int'), ('season_3', 'int'), ('season_4', 'int'), ('weather_1', 'int'), ('weather_2', 'int'), ('weather_3', 'int'), ('weather_4', 'int')]
[('datetime', 'timestamp'), ('holiday', 'int'), ('workingday', 'int'), ('temp', 'int'), ('atemp', 'int'), ('humidity', 'int'), ('windspeed', 'int'), ('season_1', 'int'), ('season_2', 'int'), ('season_3', 'int'), ('season_4', 'int'), ('weather_1', 'int'), ('weather_2', 'int'), ('weather_3', 'int'), ('weather_4', 'int')]


In [85]:
##Split TRAINING DATASET datetime into meaning columns such as hour,day,month,year,etc
df_train = df_train_new.withColumn('hour',hour(col("datetime")))\
                       .withColumn('minute',minute(col("datetime")))\
                       .withColumn('second',second(col("datetime")))\
                       .withColumn('day',dayofmonth(col("datetime")))\
                       .withColumn('year',year(col("datetime")))\
                       .withColumn('month',month(col("datetime")))
df_train.show()

##Split TESTING DATASET datetime into meaning columns such as hour,day,month,year,etc
df_test = df_test_new.withColumn('hour',hour(col("datetime")))\
                       .withColumn('minute',minute(col("datetime")))\
                       .withColumn('second',second(col("datetime")))\
                       .withColumn('day',dayofmonth(col("datetime")))\
                       .withColumn('year',year(col("datetime")))\
                       .withColumn('month',month(col("datetime")))
df_test.show()

+-------------------+-------+----------+----+-----+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+------+------+---+----+-----+
|           datetime|holiday|workingday|temp|atemp|humidity|windspeed|casual|registered|count|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|hour|minute|second|day|year|month|
+-------------------+-------+----------+----+-----+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+------+------+---+----+-----+
|2011-01-01 00:00:00|      0|         0|   9|   14|      81|        0|     3|        13|   16|       1|       0|       0|       0|        1|        0|        0|        0|   0|     0|     0|  1|2011|    1|
|2011-01-01 01:00:00|      0|         0|   9|   13|      80|        0|     8|        32|   40|       1|       0|       0|       0|        1|        0|        0|        0|   1|     

In [183]:
## VectorAssembler
assembler_train_f = VectorAssembler(inputCols=['season_1','season_2','season_3','season_4','weather_1','weather_2','weather_3','weather_4','temp','atemp','humidity','windspeed','hour','minute','second','day','year','month','holiday','workingday'],outputCol="features")
assembled_train_f = assembler_train_f.transform(df_train)

(trainingData, testData) = assembled_train_f.randomSplit([0.9, 0.1])

In [184]:
trainingData.count()
testData.count()

1088

In [185]:
trainingData.show()
testData.show()

+-------------------+-------+----------+----+-----+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+------+------+---+----+-----+--------------------+
|           datetime|holiday|workingday|temp|atemp|humidity|windspeed|casual|registered|count|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|hour|minute|second|day|year|month|            features|
+-------------------+-------+----------+----+-----+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+------+------+---+----+-----+--------------------+
|2011-01-01 01:00:00|      0|         0|   9|   13|      80|        0|     8|        32|   40|       1|       0|       0|       0|        1|        0|        0|        0|   1|     0|     0|  1|2011|    1|(20,[0,4,8,9,10,1...|
|2011-01-01 02:00:00|      0|         0|   9|   13|      80|        0|     5|        27|   32|  

In [186]:
lr = LinearRegression(labelCol='count')
lrModel = lr.fit(trainingData)

In [187]:
trainingSummary = lrModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 140.253173
r2: 0.400700


In [188]:
lr_predictions = lrModel.transform(testData)
lr_predictions.select("prediction","count").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----+
|        prediction|count|
+------------------+-----+
| -98.7184679057682|   16|
|-45.69496224165778|    1|
| 40.60745651673642|   20|
| 46.79084519753815|   76|
|160.11800633199164|   30|
+------------------+-----+
only showing top 5 rows

R Squared (R2) on test data = 0.368014


In [190]:
lrModel.write().overwrite().save("trained_model/linearregression.model")

Py4JJavaError: An error occurred while calling o3240.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1090)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1088)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1061)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1026)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1008)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1007)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:964)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:962)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1552)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1552)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1538)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1538)
	at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413)
	at org.apache.spark.ml.regression.InternalLinearRegressionModelWriter.write(LinearRegression.scala:721)
	at org.apache.spark.ml.util.GeneralMLWriter.saveImpl(ReadWrite.scala:260)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 168.0 failed 1 times, most recent failure: Lost task 0.0 in stage 168.0 (TID 162, 192.168.2.14, executor driver): java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\SP.000\Documents\Edureka\Machine Learning Engineer\Python Spark Certification Training using PySpark\Project\trained_model\linearregression.model\metadata\_temporary\0\_temporary\attempt_20210116213211_0841_m_000000_0\part-00000
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:869)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:852)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:804)
	at org.apache.hadoop.mapred.TextOutputFormat.getRecordWriter(TextOutputFormat.java:123)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.initWriter(SparkHadoopWriter.scala:230)
	at org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:120)
	at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2152)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78)
	... 51 more
Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\SP.000\Documents\Edureka\Machine Learning Engineer\Python Spark Certification Training using PySpark\Project\trained_model\linearregression.model\metadata\_temporary\0\_temporary\attempt_20210116213211_0841_m_000000_0\part-00000
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:869)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:852)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225)
	at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209)
	at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296)
	at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:804)
	at org.apache.hadoop.mapred.TextOutputFormat.getRecordWriter(TextOutputFormat.java:123)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.initWriter(SparkHadoopWriter.scala:230)
	at org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:120)
	at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


[Row(features=SparseVector(20, {0: 1.0, 4: 1.0, 8: 10.0, 9: 11.0, 10: 56.0, 11: 26.0, 15: 20.0, 16: 2011.0, 17: 1.0, 19: 1.0}), prediction=-32.43047899598605),
 Row(features=SparseVector(20, {0: 1.0, 4: 1.0, 8: 10.0, 9: 13.0, 10: 56.0, 12: 1.0, 15: 20.0, 16: 2011.0, 17: 1.0, 19: 1.0}), prediction=-34.97443831744022),
 Row(features=SparseVector(20, {0: 1.0, 4: 1.0, 8: 10.0, 9: 13.0, 10: 56.0, 12: 2.0, 15: 20.0, 16: 2011.0, 17: 1.0, 19: 1.0}), prediction=-27.438666224043118),
 Row(features=SparseVector(20, {0: 1.0, 4: 1.0, 8: 10.0, 9: 12.0, 10: 56.0, 11: 11.0, 12: 3.0, 15: 20.0, 16: 2011.0, 17: 1.0, 19: 1.0}), prediction=-16.143902212847024),
 Row(features=SparseVector(20, {0: 1.0, 4: 1.0, 8: 10.0, 9: 12.0, 10: 56.0, 11: 11.0, 12: 4.0, 15: 20.0, 16: 2011.0, 17: 1.0, 19: 1.0}), prediction=-8.60813011942082)]

In [191]:
spark.version

'3.0.1'