In [7]:
from pyspark.sql import SQLContext

In [3]:
# Create the SQLContext used for all DataFrame reads in this notebook.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# SparkContext injected by the PySpark kernel; confirm.
sqlContext = SQLContext(sc)

In [4]:
# Load the daily weather CSV through the spark-csv data source, with the
# header and inferSchema options switched on.
df = (sqlContext.read
      .format('com.databricks.spark.csv')
      .options(header='true', inferSchema='true')
      .load('file:///home/cloudera/Downloads/big-data-4/daily_weather.csv'))

In [5]:
# List the column names of the loaded frame.
df.columns

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [6]:
# Show each column's type and nullability.
df.printSchema()

root
 |-- number: integer (nullable = true)
 |-- air_pressure_9am: double (nullable = true)
 |-- air_temp_9am: double (nullable = true)
 |-- avg_wind_direction_9am: double (nullable = true)
 |-- avg_wind_speed_9am: double (nullable = true)
 |-- max_wind_direction_9am: double (nullable = true)
 |-- max_wind_speed_9am: double (nullable = true)
 |-- rain_accumulation_9am: double (nullable = true)
 |-- rain_duration_9am: double (nullable = true)
 |-- relative_humidity_9am: double (nullable = true)
 |-- relative_humidity_3pm: double (nullable = true)



In [7]:
# Summary statistics for every column, converted to pandas and transposed so
# that each weather column is shown as one row.
df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
number,1095,547.0,316.24357700987383,0,1094
air_pressure_9am,1092,918.8825513138097,3.1841611803868353,907.9900000000024,929.3200000000012
air_temp_9am,1090,64.93300141287075,11.175514003175877,36.752000000000685,98.90599999999992
avg_wind_direction_9am,1091,142.23551070057584,69.13785928889183,15.500000000000046,343.4
avg_wind_speed_9am,1092,5.50828424225493,4.552813465531715,0.69345139999974,23.554978199999763
max_wind_direction_9am,1092,148.9535179651692,67.23801294602951,28.89999999999991,312.19999999999993
max_wind_speed_9am,1091,7.019513529175272,5.59820917078096,1.1855782000000479,29.84077959999996
rain_accumulation_9am,1089,0.20307895225211126,1.5939521253574904,0.0,24.01999999999907
rain_duration_9am,1092,294.1080522756142,1598.078778660148,0.0,17704.0


In [8]:
# Summary statistics restricted to the 9am air pressure column.
df.describe('air_pressure_9am').show()

+-------+------------------+
|summary|  air_pressure_9am|
+-------+------------------+
|  count|              1092|
|   mean| 918.8825513138097|
| stddev|3.1841611803868353|
|    min| 907.9900000000024|
|    max| 929.3200000000012|
+-------+------------------+



In [9]:
# Number of columns in the raw frame.
len(df.columns)

11

In [10]:
# Number of rows in the raw frame.
df.count()

1095

In [11]:
 # Keep only the rows that have a value for 9am air pressure.
 df2 = df.na.drop(subset=['air_pressure_9am'])

In [12]:
# Rows remaining after dropping missing 9am air pressure values.
df2.count()

1092

In [13]:
# Correlation between 9am rain accumulation and 9am rain duration in df2.
df2.stat.corr("rain_accumulation_9am", "rain_duration_9am")

0.7298253479609015

In [14]:
# Row count after dropping rows with missing 9am rain accumulation
# (computed from the raw frame df, not from df2).
df.na.drop(subset=['rain_accumulation_9am']).count()

1089

In [15]:
 # Correlation between 9am and 3pm relative humidity in df2.
 df2.stat.corr("relative_humidity_9am", "relative_humidity_3pm")

0.8823388530772411

In [16]:
# Start of the data preparation assignment.
# Baseline statistics for 9am air temperature on the raw frame, before any
# missing-value handling.
df.describe(['air_temp_9am']).show()

+-------+------------------+
|summary|      air_temp_9am|
+-------+------------------+
|  count|              1090|
|   mean| 64.93300141287075|
| stddev|11.175514003175877|
|    min|36.752000000000685|
|    max| 98.90599999999992|
+-------+------------------+



In [17]:
# Row count of the raw frame, as a reference for the cleaned frames.
df.count()

1095

In [18]:
# Drop every row that has a missing value in any column.
removeAllDF = df.na.drop()

In [19]:
# 9am air temperature statistics after removing rows with missing values.
removeAllDF.describe(['air_temp_9am']).show()

+-------+------------------+
|summary|      air_temp_9am|
+-------+------------------+
|  count|              1064|
|   mean| 65.02260949558739|
| stddev|11.168033449415699|
|    min|36.752000000000685|
|    max| 98.90599999999992|
+-------+------------------+



In [20]:
# Rows remaining once every row with a missing value has been dropped.
removeAllDF.count()

1064

In [21]:
# Impute missing values with the mean value for that column.
from pyspark.sql.functions import avg
# Start from the raw frame; its nulls are filled in the following cell.
imputeDF = df

In [22]:
# Column means are taken from removeAllDF (the rows with no missing values)
# and computed in one aggregation, so Spark runs a single job instead of one
# job per column.
columnMeans = removeAllDF.agg(*[avg(x) for x in imputeDF.columns]).first()
meanByColumn = dict(zip(imputeDF.columns, columnMeans))
for x in imputeDF.columns:
    print(x, meanByColumn[x])
# A single fill with a per-column replacement map replaces the chain of
# one fill call per column.
imputeDF = imputeDF.na.fill(meanByColumn)

number 545.0018796992481
air_pressure_9am 918.9031798641055
air_temp_9am 65.02260949558739
avg_wind_direction_9am 142.30675564934032
avg_wind_speed_9am 5.485793050713691
max_wind_direction_9am 148.48042413321312
max_wind_speed_9am 6.9997136588756925
rain_accumulation_9am 0.18202347650615522
rain_duration_9am 266.3936973996038
relative_humidity_9am 34.07743985327712
relative_humidity_3pm 35.14838093290537


In [23]:
# Compare 9am air temperature statistics before (df) and after (imputeDF)
# mean imputation.
for frame in (df, imputeDF):
    frame.describe(['air_temp_9am']).show()

+-------+------------------+
|summary|      air_temp_9am|
+-------+------------------+
|  count|              1090|
|   mean| 64.93300141287075|
| stddev|11.175514003175877|
|    min|36.752000000000685|
|    max| 98.90599999999992|
+-------+------------------+

+-------+------------------+
|summary|      air_temp_9am|
+-------+------------------+
|  count|              1095|
|   mean| 64.93341058219822|
| stddev|11.149948199920226|
|    min|36.752000000000685|
|    max| 98.90599999999992|
+-------+------------------+



In [24]:
#Classification work starts here
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer


In [25]:
# The eight 9am measurements used as model inputs. Neither relative humidity
# column is listed here.
featureColumns = [
    'air_pressure_9am',
    'air_temp_9am',
    'avg_wind_direction_9am',
    'avg_wind_speed_9am',
    'max_wind_direction_9am',
    'max_wind_speed_9am',
    'rain_accumulation_9am',
    'rain_duration_9am',
]

In [26]:
# Remove the 'number' column from the raw frame; it is not in featureColumns.
droppedDF = df.drop('number')

In [27]:
# Confirm which columns remain after dropping 'number'.
droppedDF.columns


['air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

In [28]:
# Despite the name, nothing is imputed here: rows with any missing value are
# dropped from droppedDF.
imputeDropDF = droppedDF.na.drop()

In [29]:
# (rows, columns) of the raw frame.
df.count(), len(df.columns)

(1095, 11)

In [30]:
# (rows, columns) after dropping 'number' and all rows with missing values.
imputeDropDF.count(), len(imputeDropDF.columns)

(1064, 10)

In [31]:
# Derive a binary "label" column from 3pm relative humidity: values above the
# threshold become 1.0 and the rest 0.0 (see the sample output of the
# transform).
binarizer = Binarizer(threshold=24.99999, inputCol="relative_humidity_3pm", outputCol="label")

In [32]:
# Add the binary label column to the cleaned frame.
binarizedDF = binarizer.transform(imputeDropDF)

In [33]:
# Spot-check the label against the humidity value it was derived from.
binarizedDF.select("relative_humidity_3pm", "label").show(4)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
+---------------------+-----+
only showing top 4 rows



In [34]:
# Combine the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")

In [35]:
# Add the "features" vector column to the labelled frame.
assembledDF = assembler.transform(binarizedDF)

In [52]:
# Split into training and test sets with weights 0.8 / 0.2; the fixed seed
# keeps the split repeatable.
trainingDataDF, testDataDF = assembledDF.randomSplit([0.8, 0.2], seed=13234)

In [53]:
# Row counts of the training and test sets.
trainingDataDF.count(), testDataDF.count()

(854, 210)

In [54]:
# Gini-impurity decision tree over the assembled features, limited to depth 5
# with at least 20 training instances per node.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini",
)

In [55]:
# Single-stage pipeline containing only the decision tree classifier.
pipeline = Pipeline(stages=[dt])

In [56]:
# Fit the pipeline on the training set.
model = pipeline.fit(trainingDataDF)

In [57]:
# Apply the fitted model to the held-out test set.
predictions = model.transform(testDataDF)

In [58]:
# Preview the first 20 prediction/label pairs.
predictions.select("prediction", "label").show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



In [59]:
# Save the prediction/label pairs as CSV through the spark-csv data source.
# mode='overwrite' replaces output left by a previous run; with the default
# mode the write raises when the target path already exists, which breaks
# re-running the notebook.
predictions.select("prediction", "label").write.save(path='file:///home/cloudera/Downloads/big-data-4/predictions1.csv',
                                                    format='com.databricks.spark.csv',
                                                    header='true',
                                                    mode='overwrite')

In [60]:
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [61]:
# Preview the first 20 prediction/label pairs before evaluating them.
predictions.select("prediction", "label").show(20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       0.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



In [62]:
# Evaluator that compares the "prediction" column against "label".
# NOTE(review): the "precision" metric is stored in a variable named
# `accuracy` downstream; newer Spark releases may require
# metricName="accuracy" instead — confirm against the installed version.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="precision",
)

In [63]:
# Score the test-set predictions with the evaluator's configured metric.
accuracy = evaluator.evaluate(predictions)

In [87]:
# Report the evaluation score in several formats.
print("Accuracy = %g" % (accuracy))
# %.0f is used rather than %.2g because %.2g switches to scientific notation
# for three-digit values and would render 100 as "1e+02".
print("Accuracy percentage = %.0f" % (accuracy * 100))
# The 100 in the next two formats is a minimum field width (right-aligned
# padding), not a multiplier, so both lines print the raw fraction.
print("Accuracy percentage1 = %100g" % (accuracy))
print("Accuracy percentage2 = %100.2g" % (accuracy))
print("Error = %g" % (1- accuracy))

Accuracy = 0.809524
Accuracy percentage = 81
Accuracy percentage1 =                                                                                             0.809524
Accuracy percentage2 =                                                                                                 0.81
Error = 0.190476


In [67]:
# Inspect the first two prediction/label rows as Row objects.
predictions.select("prediction", "label").rdd.take(2)

[Row(prediction=1.0, label=1.0), Row(prediction=1.0, label=1.0)]

In [68]:
# Same rows converted to plain (prediction, label) tuples.
predictions.select("prediction", "label").rdd.map(tuple).take(2)

[(1.0, 1.0), (1.0, 1.0)]

In [70]:
# Build MLlib metrics from an RDD of (prediction, label) tuples.
metrics = MulticlassMetrics(predictions.select("prediction", "label").rdd.map(tuple))

In [89]:
# Transposed view of the confusion matrix.
metrics.confusionMatrix().toArray().transpose()

array([[ 87.,  26.],
       [ 14.,  83.]])

In [90]:
# Confusion matrix as a NumPy array.
# NOTE(review): per the MLlib docs, rows should be actual labels and columns
# predicted labels, in ascending class order — confirm.
metrics.confusionMatrix().toArray()

array([[ 87.,  14.],
       [ 26.,  83.]])