# Reading and Writing Data with Spark

The data set is read in from a local JSON file and then written back out as a CSV file.

First, let's import SparkConf and SparkSession.

In [2]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

Since we're using Spark locally, we already have both a SparkContext and a SparkSession running. We can update some of the parameters, such as our application's name. Let's just call it "Our first Python Spark SQL example".

In [3]:
## Get the session (creating it if needed) and label it with our application name
spark = (
    SparkSession.builder
    .appName("Our first Python Spark SQL example")
    .getOrCreate()
)

Let's create our first DataFrame from a fairly small sample data set: Titanic. I converted the Titanic train CSV file to JSON beforehand.

In [17]:
## Load the Titanic records from the local JSON file into a Spark DataFrame
path = "titanic.json"
user_log = spark.read.load(path, format="json")

In [16]:
## Print the schema as a tree: one line per column with its type and nullability
user_log.printSchema()

root
 |-- Age: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Parch: long (nullable = true)
 |-- PassengerId: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Sex: string (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Ticket: string (nullable = true)



In [20]:
## describe() computes summary statistics (count, mean, ...) per column, much like describe() in pandas.
## String columns are included as well; their mean is reported as null in the output.
user_log.describe().show()

+-------+------------------+-----+--------+-----------------+--------------------+-------------------+-----------------+------------------+------+------------------+-------------------+------------------+
|summary|               Age|Cabin|Embarked|             Fare|                Name|              Parch|      PassengerId|            Pclass|   Sex|             SibSp|           Survived|            Ticket|
+-------+------------------+-----+--------+-----------------+--------------------+-------------------+-----------------+------------------+------+------------------+-------------------+------------------+
|  count|               714|  204|     889|              891|                 891|                891|              891|               891|   891|               891|                891|               891|
|   mean| 29.69911764705882| null|    null| 32.2042079685746|                null|0.38159371492704824|            446.0| 2.308641975308642|  null|0.5230078563411896| 0.383838383838

In [21]:
## Restrict the summary statistics to a single column
user_log.describe('Age').show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|               714|
|   mean| 29.69911764705882|
| stddev|14.526497332334035|
|    min|              0.42|
|    max|              80.0|
+-------+------------------+



In [23]:
## show() prints rows as a text table, similar to printing a DataFrame in pandas.
## It prints 20 rows by default; n=5 limits the output to the first five.
user_log.show(n=5)

+----+-----+--------+-------+--------------------+-----+-----------+------+------+-----+--------+----------------+
| Age|Cabin|Embarked|   Fare|                Name|Parch|PassengerId|Pclass|   Sex|SibSp|Survived|          Ticket|
+----+-----+--------+-------+--------------------+-----+-----------+------+------+-----+--------+----------------+
|22.0| null|       S|   7.25|Braund, Mr. Owen ...|    0|          1|     3|  male|    1|       0|       A/5 21171|
|38.0|  C85|       C|71.2833|Cumings, Mrs. Joh...|    0|          2|     1|female|    1|       1|        PC 17599|
|26.0| null|       S|  7.925|Heikkinen, Miss. ...|    0|          3|     3|female|    0|       1|STON/O2. 3101282|
|35.0| C123|       S|   53.1|Futrelle, Mrs. Ja...|    0|          4|     1|female|    1|       1|          113803|
|35.0| null|       S|   8.05|Allen, Mr. Willia...|    0|          5|     3|  male|    0|       0|          373450|
+----+-----+--------+-------+--------------------+-----+-----------+------+-----

In [24]:
## take(5) returns the first five records as a list of Row objects
user_log.take(5)

[Row(Age=22.0, Cabin=None, Embarked='S', Fare=7.25, Name='Braund, Mr. Owen Harris', Parch=0, PassengerId=1, Pclass=3, Sex='male', SibSp=1, Survived=0, Ticket='A/5 21171'),
 Row(Age=38.0, Cabin='C85', Embarked='C', Fare=71.2833, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Parch=0, PassengerId=2, Pclass=1, Sex='female', SibSp=1, Survived=1, Ticket='PC 17599'),
 Row(Age=26.0, Cabin=None, Embarked='S', Fare=7.925, Name='Heikkinen, Miss. Laina', Parch=0, PassengerId=3, Pclass=3, Sex='female', SibSp=0, Survived=1, Ticket='STON/O2. 3101282'),
 Row(Age=35.0, Cabin='C123', Embarked='S', Fare=53.1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Parch=0, PassengerId=4, Pclass=1, Sex='female', SibSp=1, Survived=1, Ticket='113803'),
 Row(Age=35.0, Cabin=None, Embarked='S', Fare=8.05, Name='Allen, Mr. William Henry', Parch=0, PassengerId=5, Pclass=3, Sex='male', SibSp=0, Survived=0, Ticket='373450')]

In [29]:
## count() returns the number of records in the DataFrame
user_log.count()

891

In [34]:
## columns holds the column names, so len() gives the number of columns
len(user_log.columns)

12

In [42]:
## limit(6) gives a new DataFrame containing only the first six rows
user_log1 = user_log.limit(6)
user_log1.show()

+----+-----+--------+-------+--------------------+-----+-----------+------+------+-----+--------+----------------+
| Age|Cabin|Embarked|   Fare|                Name|Parch|PassengerId|Pclass|   Sex|SibSp|Survived|          Ticket|
+----+-----+--------+-------+--------------------+-----+-----------+------+------+-----+--------+----------------+
|22.0| null|       S|   7.25|Braund, Mr. Owen ...|    0|          1|     3|  male|    1|       0|       A/5 21171|
|38.0|  C85|       C|71.2833|Cumings, Mrs. Joh...|    0|          2|     1|female|    1|       1|        PC 17599|
|26.0| null|       S|  7.925|Heikkinen, Miss. ...|    0|          3|     3|female|    0|       1|STON/O2. 3101282|
|35.0| C123|       S|   53.1|Futrelle, Mrs. Ja...|    0|          4|     1|female|    1|       1|          113803|
|35.0| null|       S|   8.05|Allen, Mr. Willia...|    0|          5|     3|  male|    0|       0|          373450|
|null| null|       Q| 8.4583|    Moran, Mr. James|    0|          6|     3|  mal

Saving the DataFrame to a CSV file

In [27]:
## Output location for the CSV copy of the DataFrame
out_path = "titanic.csv"

In [53]:
## Spark creates a folder named titanic.csv and writes the data inside it as a part file, e.g.
## part-00000-b447870f-7167-41b1-987b-f4b570950c64-c000.csv
## mode="overwrite" replaces output left by a previous run; with the default mode the
## save fails once the folder already exists, so this cell could not be re-run.
user_log.write.save(out_path, format="csv", header=True, mode="overwrite")

Read the same CSV file back in to cross-check that everything worked fine.

In [43]:
## Read the CSV folder back; header=True takes the column names from the first line.
## No schema is supplied here, and every column comes back as a string
## (see the schema printed in the next cell).
user_log_2 = spark.read.load(out_path, format="csv", header=True)

In [44]:
## Check the column types of the DataFrame read back from the CSV
user_log_2.printSchema()

root
 |-- Age: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Ticket: string (nullable = true)



In [45]:
## Peek at the first two records read back from the CSV
user_log_2.take(2)

[Row(Age='22.0', Cabin=None, Embarked='S', Fare='7.25', Name='Braund, Mr. Owen Harris', Parch='0', PassengerId='1', Pclass='3', Sex='male', SibSp='1', Survived='0', Ticket='A/5 21171'),
 Row(Age='38.0', Cabin='C85', Embarked='C', Fare='71.2833', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Parch='0', PassengerId='2', Pclass='1', Sex='female', SibSp='1', Survived='1', Ticket='PC 17599')]

In [51]:
## Display a single column; show() prints only the first 20 rows
user_log_2.select("PassengerId").show()

+-----------+
|PassengerId|
+-----------+
|          1|
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
|         10|
|         11|
|         12|
|         13|
|         14|
|         15|
|         16|
|         17|
|         18|
|         19|
|         20|
+-----------+
only showing top 20 rows



In [49]:
## First four records of the re-read data, for comparison with the take(5) output of the original DataFrame
user_log_2.take(4)

[Row(Age='22.0', Cabin=None, Embarked='S', Fare='7.25', Name='Braund, Mr. Owen Harris', Parch='0', PassengerId='1', Pclass='3', Sex='male', SibSp='1', Survived='0', Ticket='A/5 21171'),
 Row(Age='38.0', Cabin='C85', Embarked='C', Fare='71.2833', Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Parch='0', PassengerId='2', Pclass='1', Sex='female', SibSp='1', Survived='1', Ticket='PC 17599'),
 Row(Age='26.0', Cabin=None, Embarked='S', Fare='7.925', Name='Heikkinen, Miss. Laina', Parch='0', PassengerId='3', Pclass='3', Sex='female', SibSp='0', Survived='1', Ticket='STON/O2. 3101282'),
 Row(Age='35.0', Cabin='C123', Embarked='S', Fare='53.1', Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Parch='0', PassengerId='4', Pclass='1', Sex='female', SibSp='1', Survived='1', Ticket='113803')]