# Start Spark session

In [1]:
from pyspark.sql import SparkSession

# Get the session for this notebook via getOrCreate(), registered under the
# application name 'pyspark-test'.
spark = (
    SparkSession.builder
    .appName('pyspark-test')
    .getOrCreate()
)
spark  # last expression of the cell, so the notebook displays the session

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/15 19:25:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/15 19:25:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Get data

In [2]:
import os
import wget

# One path is used for both the existence check and the download target, so
# the two can never disagree about where the dataset lives.
local_path = 'iris.data'

# Download only when there is no local copy yet; re-running the cell is then
# a cheap no-op.
if not os.path.isfile(local_path):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    # Pin the destination explicitly: without `out=` the wget package picks
    # the file name itself (from the URL or the response headers), which is
    # not guaranteed to match the name checked above.
    wget.download(url, out=local_path)

In [8]:
# Read the CSV without a header row and let Spark infer the column types.
# The columns come back with the generated names _c0.._c4 (see the output).
data = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("iris.data")
)
data.show(5)  # preview the first five rows

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



In [10]:
# Confirm what kind of object the CSV read produced (a Spark DataFrame, per the output).
type(data)

pyspark.sql.dataframe.DataFrame

In [9]:
# Check the schema: prints a tree with each column's name, type and nullability.
data.printSchema()

root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: string (nullable = true)



### Selecting, inspecting, and modifying columns

In [13]:
# List the column names as a plain Python list of strings.
data.columns 

['_c0', '_c1', '_c2', '_c3', '_c4']

In [14]:
# Return the first three rows as a list of Row objects.
data.head(3)

[Row(_c0=5.1, _c1=3.5, _c2=1.4, _c3=0.2, _c4='Iris-setosa'),
 Row(_c0=4.9, _c1=3.0, _c2=1.4, _c3=0.2, _c4='Iris-setosa'),
 Row(_c0=4.7, _c1=3.2, _c2=1.3, _c3=0.2, _c4='Iris-setosa')]

In [19]:
# Project a subset of columns by name and preview the first five rows.
data.select('_c0', '_c3').show(5)

+---+---+
|_c0|_c3|
+---+---+
|5.1|0.2|
|4.9|0.2|
|4.7|0.2|
|4.6|0.2|
|5.0|0.2|
+---+---+
only showing top 5 rows



In [26]:
# Indexing by column name yields a Column object (see the output), not the column's values.
data['_c1']

Column<'_c1'>

In [22]:
# Check data types: a list of (column name, type name) pairs.
data.dtypes

[('_c0', 'double'),
 ('_c1', 'double'),
 ('_c2', 'double'),
 ('_c3', 'double'),
 ('_c4', 'string')]

In [25]:
# Describe summary statistics: count, mean, stddev, min and max for every column.
# For the string column _c4 the mean and stddev come back as null (see the output).
data.describe().show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|               _c0|                _c1|               _c2|               _c3|           _c4|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [30]:
# Add a column computed from an existing one: new_col = _c1 + 2.0.
# `data` is rebound to the returned DataFrame, so cells run after this one
# see the extra column.
data = data.withColumn('new_col', data['_c1'] + 2.0)
data.show(n=5)

+---+---+---+---+-----------+-------+
|_c0|_c1|_c2|_c3|        _c4|new_col|
+---+---+---+---+-----------+-------+
|5.1|3.5|1.4|0.2|Iris-setosa|    5.5|
|4.9|3.0|1.4|0.2|Iris-setosa|    5.0|
|4.7|3.2|1.3|0.2|Iris-setosa|    5.2|
|4.6|3.1|1.5|0.2|Iris-setosa|    5.1|
|5.0|3.6|1.4|0.2|Iris-setosa|    5.6|
+---+---+---+---+-----------+-------+
only showing top 5 rows



In [31]:
# Drop the 'new_col' column; `data` is rebound to the result, which has the
# five columns _c0.._c4 again (see the output).
data = data.drop('new_col')
data.show(n=5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



In [32]:
# Rename column '_c0' to 'new_col_name'.
# NOTE(review): `data` is rebound here, so from this point on the frame no
# longer has a '_c0' column. Re-running an earlier cell that selects '_c0'
# after this one will not find it unless `data` is reloaded first.
data = data.withColumnRenamed('_c0', 'new_col_name')
data.show(n=5)

+------------+---+---+---+-----------+
|new_col_name|_c1|_c2|_c3|        _c4|
+------------+---+---+---+-----------+
|         5.1|3.5|1.4|0.2|Iris-setosa|
|         4.9|3.0|1.4|0.2|Iris-setosa|
|         4.7|3.2|1.3|0.2|Iris-setosa|
|         4.6|3.1|1.5|0.2|Iris-setosa|
|         5.0|3.6|1.4|0.2|Iris-setosa|
+------------+---+---+---+-----------+
only showing top 5 rows

