In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [2]:
sc= SparkContext()
sqlContext = SQLContext(sc)

In [3]:
df = sqlContext.read.format('csv').option("header", 'true').load("hdfs://nameservice1/user/edureka_294428/bikeSharing.csv")

In [4]:
df.cache()

DataFrame[instant: string, dteday: string, season: string, yr: string, mnth: string, hr: string, holiday: string, weekday: string, workingday: string, weathersit: string, temp: string, atemp: string, hum: string, windspeed: string, casual: string, registered: string, cnt: string]

In [5]:
df.show()

+-------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+----------+---+
|instant|    dteday|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|casual|registered|cnt|
+-------+----------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+------+----------+---+
|      1|2011-01-01|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|        0|     3|        13| 16|
|      2|2011-01-01|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|        0|     8|        32| 40|
|      3|2011-01-01|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|        0|     5|        27| 32|
|      4|2011-01-01|     1|  0|   1|  3|      0|      6|         0|         1|0.24|0.2879|0.75|        0|     3|        10| 13|
|      5|2011-01-01|     1|  0|   1|  4|      0|      6|         0|         1|0.24|0.2879|0.75|        0

In [6]:
print("Our dataset has %d rows." , df.count())

('Our dataset has %d rows.', 1000)


In [7]:
df = df.drop("instant").drop("dteday").drop("casual").drop("registered")
display(df)

DataFrame[season: string, yr: string, mnth: string, hr: string, holiday: string, weekday: string, workingday: string, weathersit: string, temp: string, atemp: string, hum: string, windspeed: string, cnt: string]

In [9]:
df.show()

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+---+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|cnt|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+---+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|        0| 16|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|        0| 40|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|        0| 32|
|     1|  0|   1|  3|      0|      6|         0|         1|0.24|0.2879|0.75|        0| 13|
|     1|  0|   1|  4|      0|      6|         0|         1|0.24|0.2879|0.75|        0|  1|
|     1|  0|   1|  5|      0|      6|         0|         2|0.24|0.2576|0.75|   0.0896|  1|
|     1|  0|   1|  6|      0|      6|         0|         1|0.22|0.2727| 0.8|        0|  2|
|     1|  0|   1|  7|      0|      6|         0|         1| 0.2|0.2576|0.86|        0|  3|

In [10]:
df.printSchema()

root
 |-- season: string (nullable = true)
 |-- yr: string (nullable = true)
 |-- mnth: string (nullable = true)
 |-- hr: string (nullable = true)
 |-- holiday: string (nullable = true)
 |-- weekday: string (nullable = true)
 |-- workingday: string (nullable = true)
 |-- weathersit: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- atemp: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- windspeed: string (nullable = true)
 |-- cnt: string (nullable = true)



In [11]:
from pyspark.sql.functions import col  # for indicating a column using a string in the line below

In [12]:
df = df.select([col(c).cast("double").alias(c) for c in df.columns])
df.printSchema()

root
 |-- season: double (nullable = true)
 |-- yr: double (nullable = true)
 |-- mnth: double (nullable = true)
 |-- hr: double (nullable = true)
 |-- holiday: double (nullable = true)
 |-- weekday: double (nullable = true)
 |-- workingday: double (nullable = true)
 |-- weathersit: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: double (nullable = true)



In [13]:
train, test = df.randomSplit([0.7, 0.3])

In [14]:
print("We have %d training examples and %d test examples." , (train.count(), test.count()))

('We have %d training examples and %d test examples.', (703, 297))


In [15]:
display(train.select("hr", "cnt"))

DataFrame[hr: double, cnt: double]

In [16]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer

In [17]:
featuresCols = df.columns
featuresCols.remove('cnt')
# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features and indexes them.
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

In [18]:
from pyspark.ml.regression import GBTRegressor
# Takes the "features" column and learns to predict "cnt"
gbt = GBTRegressor(labelCol="cnt")


In [19]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [20]:
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small.  In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
paramGrid = ParamGridBuilder().addGrid(gbt.maxDepth, [2, 5]).addGrid(gbt.maxIter, [10, 100]).build()
# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())
# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

In [21]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])

In [22]:
pipelineModel = pipeline.fit(train)


In [23]:
predictions = pipelineModel.transform(test)


In [27]:
display(predictions.select("cnt", "prediction", *featuresCols))
predictions.select("cnt", "prediction", *featuresCols).show()

DataFrame[cnt: double, prediction: double, season: double, yr: double, mnth: double, hr: double, holiday: double, weekday: double, workingday: double, weathersit: double, temp: double, atemp: double, hum: double, windspeed: double]

+----+-------------------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+
| cnt|         prediction|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|
+----+-------------------+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+
|39.0|  37.43853442965849|   1.0|0.0| 1.0|0.0|    0.0|    0.0|       0.0|       1.0|0.26| 0.303|0.56|      0.0|
| 5.0|  8.816734619933277|   1.0|0.0| 1.0|0.0|    0.0|    1.0|       1.0|       1.0|0.12|0.1212| 0.5|   0.2836|
|12.0|  8.793159522974259|   1.0|0.0| 1.0|0.0|    0.0|    2.0|       1.0|       1.0|0.14|0.1667|0.59|   0.1045|
| 3.0|   8.32071156939577|   1.0|0.0| 1.0|0.0|    0.0|    3.0|       1.0|       2.0|0.22|0.2727|0.93|      0.0|
|17.0|  9.097710172414336|   1.0|0.0| 1.0|0.0|    0.0|    5.0|       1.0|       2.0| 0.2| 0.197|0.64|    0.194|
|21.0| 11.025287222603513|   1.0|0.0| 1.0|0.0|    0.0|    5.0|       1.0|       2.0|0.24|0.2273| 0.7|   

In [25]:
rmse = evaluator.evaluate(predictions)
print("RMSE on our test set: %g" , rmse)

('RMSE on our test set: %g', 17.507233577518747)


In [28]:
display(predictions.select("hr", "prediction"))
predictions.select("hr", "prediction").show()

DataFrame[hr: double, prediction: double]

+---+-------------------+
| hr|         prediction|
+---+-------------------+
|0.0|  37.43853442965849|
|0.0|  8.816734619933277|
|0.0|  8.793159522974259|
|0.0|   8.32071156939577|
|0.0|  9.097710172414336|
|0.0| 11.025287222603513|
|0.0| 40.975269256682054|
|0.0|  19.22971088276183|
|1.0| 1.5465771636523742|
|1.0|  4.141639025638635|
|1.0| 3.5805468973796923|
|1.0| 2.9984706347746086|
|1.0|  4.634130848590189|
|1.0| 6.9452589599572265|
|1.0|  9.078335765142175|
|1.0|   21.6158668492124|
|1.0| 16.708631486638716|
|2.0| 15.007497246669434|
|2.0|-1.1502547136504906|
|2.0|-1.7997779635253541|
+---+-------------------+
only showing top 20 rows



In [29]:
import matplotlib.pyplot as plt

In [30]:
display(predictions.select("hr", "prediction"))


DataFrame[hr: double, prediction: double]

In [31]:
plt.show()