# Census ETL 

In [None]:
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the project-local `helpers` package importable -- confirm repo layout.
sys.path.append("..")
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

## Create Spark Session

In [None]:
# Build the SparkSession for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("Census ETL").getOrCreate()

# Restrict Spark's log output to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

## Extract: Load Census Data

In [None]:
# Resolve the location of the census JSON file via the project's path helper
# and read it into a DataFrame.
inputFile = translate_to_file_string("../data/census_2010.json")
df = spark.read.format("json").load(inputFile)

# Preview the loaded data with the project's print helper.
print_df(df, 10)

## Transformation

In [None]:
# Keep only the rows whose "age" value is greater than 60.
is_senior = df["age"] > 60
seniors = df.filter(is_senior)
print_df(seniors, 10)

In [None]:
# Add a "total" column holding the sum of the "females" and "males" columns.
total_expr = col("females") + col("males")
seniors_with_total = seniors.withColumn("total", total_expr)
print_df(seniors_with_total, 10)

## Load

In [None]:
# No running database is available, so the result is exported as CSV instead.
# Any existing output at the target path is overwritten.
seniors_with_total.write.csv("/tmp/seniors", mode="overwrite")

In [None]:
# Stop the SparkSession now that the export step has run.
spark.stop()