In [1]:
import pyspark

In [2]:
# Import SparkSession and SparkConf
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [3]:
# Start (or reuse) a SparkSession registered under the app name "TestSpark"
spark = (
    SparkSession.builder
    .appName("TestSpark")
    .getOrCreate()
)

In [4]:
# Bare last expression: the notebook displays the session object's repr
spark

In [10]:
# Read the CSV using the reader's default options (no header/schema settings)
df = spark.read.csv(path='fakefriends.csv')

In [11]:
# Render the first 20 rows (show's default) as a text table.
# With no header option the columns get placeholder names (_c0, _c1, ...)
# and the file's header line appears as the first data row.
df.show(n=20)

+---+--------+---+------+
|_c0|     _c1|_c2|   _c3|
+---+--------+---+------+
| Id|    Name|Age|Salary|
|  0|    Will| 33|   385|
|  1|Jean-Luc| 26|     2|
|  2|    Hugh| 55|   221|
|  3|  Deanna| 40|   465|
|  4|   Quark| 68|    21|
|  5|  Weyoun| 59|   318|
|  6|  Gowron| 37|   220|
|  7|    Will| 54|   307|
|  8|  Jadzia| 38|   380|
|  9|    Hugh| 27|   181|
| 10|     Odo| 53|   191|
| 11|     Ben| 57|   372|
| 12|   Keiko| 54|   253|
| 13|Jean-Luc| 56|   444|
| 14|    Hugh| 43|    49|
| 15|     Rom| 36|    49|
| 16|  Weyoun| 22|   323|
| 17|     Odo| 35|    13|
| 18|Jean-Luc| 45|   455|
+---+--------+---+------+
only showing top 20 rows



In [13]:
# Take the header into account: use the file's first line as column names
df = (
    spark.read
    .option('header', 'true')
    .csv('fakefriends.csv')
)

In [14]:
# head(3) returns the first three rows as a list of Row objects;
# every field is still a string at this point
df.head(3)

[Row(Id='0', Name='Will', Age='33', Salary='385'),
 Row(Id='1', Name='Jean-Luc', Age='26', Salary='2'),
 Row(Id='2', Name='Hugh', Age='55', Salary='221')]

In [15]:
# Inspect the schema.
# Note that, without schema inference, every column is read as a string.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Salary: string (nullable = true)



In [17]:
# Infer column types from the data instead of defaulting everything to string.
# NOTE(review): outputs captured earlier in this notebook name the fourth column
# 'Salary', while this cell's output shows 'NumberOfFriends' -- the CSV appears to
# have changed between runs; restart the kernel and re-run all cells to confirm.
df = (
    spark.read
    .option('header', 'true')
    .csv('fakefriends.csv', inferSchema=True)
)
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- NumberOfFriends: integer (nullable = true)



In [18]:
# After schema inference the numeric fields come back as ints, not strings
df.head(3)

[Row(Id=0, Name='Will', Age=33, NumberOfFriends=385),
 Row(Id=1, Name='Jean-Luc', Age=26, NumberOfFriends=2),
 Row(Id=2, Name='Hugh', Age=55, NumberOfFriends=221)]

In [20]:
# Header handling and schema inference in a single csv() call via keyword arguments
df = spark.read.csv(
    'fakefriends.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()
df.head(3)

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- NumberOfFriends: integer (nullable = true)



[Row(Id=0, Name='Will', Age=33, NumberOfFriends=385),
 Row(Id=1, Name='Jean-Luc', Age=26, NumberOfFriends=2),
 Row(Id=2, Name='Hugh', Age=55, NumberOfFriends=221)]

In [22]:
# Get the column names as a plain Python list of strings
df.columns

['Id', 'Name', 'Age', 'NumberOfFriends']

In [24]:
# Project only the columns of interest.
# select() returns a new DataFrame, so head() can be chained onto it.
df.select('Name', 'Age').head(3)

[Row(Name='Will', Age=33),
 Row(Name='Jean-Luc', Age=26),
 Row(Name='Hugh', Age=55)]

In [25]:
# Check column data types: a list of (column name, type name) tuples
df.dtypes

[('Id', 'int'), ('Name', 'string'), ('Age', 'int'), ('NumberOfFriends', 'int')]

In [27]:
# Describe DataFrame stats: count, mean, stddev, min and max for every column.
# describe() returns another DataFrame rather than printing anything, so call
# show() on the result to render the table. (A bare describe() in the middle of
# a cell is discarded without being displayed, so it is not called separately.)
df.describe().show()

+-------+-----------------+----+------------------+-----------------+
|summary|               Id|Name|               Age|  NumberOfFriends|
+-------+-----------------+----+------------------+-----------------+
|  count|              500| 500|               500|              500|
|   mean|            249.5|null|            43.708|          248.532|
| stddev|144.4818327679989|null|14.864340996711995|147.2217288680643|
|    min|                0| Ben|                18|                1|
|    max|              499|Worf|                69|              499|
+-------+-----------------+----+------------------+-----------------+



In [30]:
# Add a derived column with withColumn; the resulting DataFrame is
# assigned back to df
df = df.withColumn(
    'Age after 2 years',
    df.Age + 2,
)
df.head(3)

[Row(Id=0, Name='Will', Age=33, NumberOfFriends=385, Age after 2 years=35),
 Row(Id=1, Name='Jean-Luc', Age=26, NumberOfFriends=2, Age after 2 years=28),
 Row(Id=2, Name='Hugh', Age=55, NumberOfFriends=221, Age after 2 years=57)]

In [34]:
# Drop the column using drop.
# The result is assigned back to df, so the 'Age after 2 years' column
# no longer appears in the rows shown below.
df = df.drop('Age after 2 years')
df.head(3)

[Row(Id=0, Name='Will', Age=33, NumberOfFriends=385),
 Row(Id=1, Name='Jean-Luc', Age=26, NumberOfFriends=2),
 Row(Id=2, Name='Hugh', Age=55, NumberOfFriends=221)]

In [36]:
# Rename a column. The result is not assigned, so df itself keeps the
# original 'Name' column; only the displayed rows use 'User Name'.
(
    df
    .withColumnRenamed('Name', 'User Name')
    .head(3)
)

[Row(Id=0, User Name='Will', Age=33, NumberOfFriends=385),
 Row(Id=1, User Name='Jean-Luc', Age=26, NumberOfFriends=2),
 Row(Id=2, User Name='Hugh', Age=55, NumberOfFriends=221)]