In [0]:
# Spark User Defined Functions
# Partitioning DataFrames
import datetime
import pyspark.sql.functions as F
from pyspark.sql import Row
import pandas as pd
from pyspark.sql.types import *

In [0]:
# Print the built-in help text for `spark.udf.register` (interactive reference;
# the call's result is not used by any later cell).
help(spark.udf.register)

In [0]:
# Read the orders data set (JSON) into the DataFrame used by the cells below.
orders_path = '/public/retail_db_json/orders'
df = spark.read.json(orders_path)

In [0]:
# Print a preview of the orders DataFrame.
df.show()

In [0]:
# Register `date_convert` so it is callable from SQL (by name) and from the
# DataFrame API (through the returned `dc` object).
#
# The UDF keeps the first 10 characters of the input (the `yyyy-MM-dd` part),
# drops the dashes and converts the result to an integer such as 20140101.
#
# - The return type is declared as 'int' explicitly: when `register` is given a
#   plain Python function without a return type, the UDF is typed as string,
#   so the "integer" date would come back as a string column.
# - Null inputs are passed through as null instead of raising TypeError when
#   the lambda tries to slice `None`.
dc = spark.udf.register(
    'date_convert',
    lambda d: int(d[:10].replace('-', '')) if d is not None else None,
    'int'
)

In [0]:
# Display the object returned by `spark.udf.register`.
dc

In [0]:
# Apply the UDF through the DataFrame API, keeping the original column name.
converted_order_date = dc('order_date').alias('order_date')
df.select(converted_order_date).show()

In [0]:
# Keep only the orders whose converted order date equals 20140101.
on_target_date = dc('order_date') == 20140101
df.filter(on_target_date).show()

In [0]:
# Count orders per converted order date and expose the count as `order_count`.
orders_per_date = (
    df
    .groupBy(dc('order_date').alias('order_date'))
    .count()
    .withColumnRenamed('count', 'order_count')
)
orders_per_date.show()

In [0]:
# Call the UDF by its registered SQL name inside a SQL expression string.
converted_date_expr = 'date_convert(order_date) AS order_date'
df.selectExpr(converted_date_expr).show()

In [0]:
# Expose the orders DataFrame to Spark SQL under the view name `orders`
# (queried by the `spark.sql` cells that follow).
df.createOrReplaceTempView('orders')

In [0]:
# Select every order column plus the UDF-converted order date.
orders_with_int_date_sql = """
    SELECT o.*, date_convert(order_date) AS order_date_as_int FROM orders AS o          
"""
spark.sql(orders_with_int_date_sql).show()

In [0]:
# Same projection as the previous query, restricted to orders whose converted
# date equals 20140101 (the UDF is used in both SELECT and WHERE).
orders_on_date_sql = """
    SELECT o.*, date_convert(order_date) AS order_date_as_int FROM orders AS o WHERE date_convert(order_date) = 20140101        
"""
spark.sql(orders_on_date_sql).show()

In [0]:
# Order count per converted date; `GROUP BY 1` groups by the first SELECT column.
order_count_by_date_sql = """
    SELECT date_convert(order_date) AS order_date, count(*) AS order_count FROM orders AS o
    GROUP BY 1      
"""
spark.sql(order_count_by_date_sql).show()

In [0]:
# Sample course data, one list per column. Note that `course_id` holds
# strings here, and every other column is a string as well.
courses = dict(
    course_id=['1', '2', '3'],
    course_name=['Python', 'Spark', 'SQL'],
    course_author=['John', 'Mary', 'Alice'],
    course_status=['active', 'active', 'active'],
    course_published_at=['2018-01-01', '2019-01-01', '2017-01-01'],
)

# Go through pandas so Spark receives a tabular object built from the dict.
courses_df = spark.createDataFrame(pd.DataFrame(data=courses))

In [0]:
# Sample user data, one list per column; `user_id` holds integers.
users = dict(
    user_id=[1, 2, 3],
    user_name=['Alice', 'Bob', 'Charlie'],
    user_email=['alice@', 'bob@', 'charlie@'],
    user_gender=['F', 'M', 'M'],
)

# Go through pandas so Spark receives a tabular object built from the dict.
users_df = spark.createDataFrame(pd.DataFrame(data=users))

In [0]:
# Sample enrollment data, one list per column.
# NOTE(review): `course_id` is an integer here but a string in `courses`, and
# `user_id` 4 has no counterpart in `users` -- presumably intentional for
# join examples; confirm before relying on it.
course_enrollments = dict(
    course_id=[1, 1, 2, 3],
    user_id=[1, 2, 3, 4],
    enrollment_id=[1, 2, 3, 4],
    grade=['A', 'B', 'C', 'D'],
    department=['CS', 'CS', 'Math', 'Math'],
)

# Go through pandas so Spark receives a tabular object built from the dict.
course_enrollments_df = spark.createDataFrame(pd.DataFrame(data=course_enrollments))

In [0]:
def data_cleanse(c):
    """Trim a string value and turn the backslash-N marker into None.

    Args:
        c: The string to clean, or None.

    Returns:
        None when ``c`` is None or when the trimmed value is exactly the
        two-character text backslash + ``N``; otherwise ``c`` with leading
        and trailing whitespace removed.
    """
    # A null cell reaches a Python UDF as None; pass it through rather than
    # failing on `None.strip()`.
    if c is None:
        return None
    # Strip once and reuse the result for both the comparison and the return.
    trimmed = c.strip()
    return trimmed if trimmed != '\\N' else None

In [0]:
# Register the function under the SQL name 'data_cleanse'. The Python name
# `data_cleanse` is rebound to the object returned by `register`, which is
# what the later DataFrame cells call with Column arguments.
data_cleanse = spark.udf.register(name='data_cleanse', f=data_cleanse)

In [0]:
# Run the cleansing UDF over the selected columns, keeping each column's name.
cleansed_columns = [
    data_cleanse(F.col(column_name)).alias(column_name)
    for column_name in ('course_id', 'course_status')
]
courses_df.select(*cleansed_columns).show()

In [0]:
# Expose the courses DataFrame to Spark SQL under the view name `courses`.
courses_df.createOrReplaceTempView('courses')

In [0]:
# Call the cleansing UDF by its registered SQL name against the `courses` view.
cleansed_courses_sql = """
    SELECT course_id, data_cleanse(course_status) AS course_status
    FROM courses          
"""
spark.sql(cleansed_courses_sql).show()