In [1]:
#### Start (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("demographics")  # application name attached to this session
    .getOrCreate()            # returns the already-running session if one exists
)

In [2]:
### Load the demographics.csv file: the first line is used as the header
### (column names) and Spark infers each column's data type
df = spark.read.csv('demographics.csv', header=True, inferSchema=True)

In [3]:
### Show the column names of the loaded DataFrame as a Python list
df.columns

['id',
 'name',
 'age',
 'height_meter',
 'weight_kg',
 'children',
 'occupation',
 'academic_degree',
 'salary',
 'location']

In [4]:
# Fetch the first 10 rows. head(10) returns them as a list of Row objects,
# which the notebook echoes as this cell's output (nothing is printed as a table).
df.head(10)

[Row(id=0, name='Darlena Avila', age=58, height_meter=1.87, weight_kg=53, children=1, occupation='Choreographer', academic_degree='PhD', salary=68, location='South Dakota'),
 Row(id=1, name='Yan Boyd', age=65, height_meter=1.8, weight_kg=40, children=0, occupation='Cellarman', academic_degree='Bachelor', salary=73, location='Delaware'),
 Row(id=2, name='Joette Lane', age=32, height_meter=1.8, weight_kg=73, children=1, occupation='Veterinary Surgeon', academic_degree='Master', salary=69, location='South Dakota'),
 Row(id=3, name='Jazmine Hunt', age=61, height_meter=1.79, weight_kg=89, children=0, occupation='Hawker', academic_degree='PhD', salary=88, location='Louisiana'),
 Row(id=4, name='Remedios Gomez', age=23, height_meter=1.64, weight_kg=51, children=2, occupation='Choreographer', academic_degree='Bachelor', salary=83, location='West Virginia'),
 Row(id=5, name='Myung Brewer', age=20, height_meter=1.68, weight_kg=60, children=4, occupation='Window Dresser', academic_degree='Bachelo

In [5]:
# Summary statistics (count, mean, stddev, min, max) for the three numeric
# columns age, height_meter and weight_kg
(
    df.select("age", "height_meter", "weight_kg")
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+
|summary|               age|      height_meter|         weight_kg|
+-------+------------------+------------------+------------------+
|  count|              1000|              1000|              1000|
|   mean|            42.933|1.7519499999999995|            64.011|
| stddev|14.255445581556843|0.1436897499623555|15.005733939099779|
|    min|                18|               1.5|                38|
|    max|                67|               2.0|                90|
+-------+------------------+------------------+------------------+



In [6]:
# Print the schema as a tree: one line per column with its data type and
# nullability
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height_meter: double (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- children: integer (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)



In [7]:
# Rename the `salary` column to `Salary (1k)` and show only this new column.
# The source column is spelled exactly as it appears in the schema (`salary`):
# a differently-cased name only matches while Spark's column resolution is
# case-insensitive (the default); otherwise the rename would silently do
# nothing and the select below would fail.
df = df.withColumnRenamed('salary', 'Salary (1k)')
df.select("Salary (1k)").show()

+-----------+
|Salary (1k)|
+-----------+
|         68|
|         73|
|         69|
|         88|
|         83|
|         65|
|         72|
|         65|
|         87|
|         72|
|         73|
|         90|
|         78|
|         69|
|         75|
|         77|
|         76|
|         90|
|         79|
|         77|
+-----------+
only showing top 20 rows



In [8]:
# Derive a `Salary` column by multiplying `Salary (1k)` by 1000,
# then display `Salary` next to `Salary (1k)` for comparison
df = df.withColumn(
    "Salary",
    df["Salary (1k)"] * 1000,
)
df.select("Salary", "Salary (1k)").show()

+------+-----------+
|Salary|Salary (1k)|
+------+-----------+
| 68000|         68|
| 73000|         73|
| 69000|         69|
| 88000|         88|
| 83000|         83|
| 65000|         65|
| 72000|         72|
| 65000|         65|
| 87000|         87|
| 72000|         72|
| 73000|         73|
| 90000|         90|
| 78000|         78|
| 69000|         69|
| 75000|         75|
| 77000|         77|
| 76000|         76|
| 90000|         90|
| 79000|         79|
| 77000|         77|
+------+-----------+
only showing top 20 rows

