In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import sklearn.datasets
import pandas as pd
import os

# Creating Spark Session

In [None]:
# Build the session used by every Spark call below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('SQLTutorial')
    .getOrCreate()
)

# Extracting Test Data

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn.
breast_cancer = sklearn.datasets.load_breast_cancer()

# Map each integer label to its class name. `target_names` is indexed by the
# label value itself (position 0 names label 0, position 1 names label 1), so
# enumerate() gives the right pairing. Zipping against [1, 0] swapped the two
# class names.
target_map = dict(enumerate(breast_cancer['target_names']))

# Label dataframe: a single `cancer_type` column holding the class name of
# each sample, on the default 0..n-1 index.
df_target = pd.DataFrame(
    {'cancer_type': pd.Series(breast_cancer['target']).map(target_map)}
)

# Feature dataframe: one column per feature, on the same default index so the
# two frames line up row by row.
df = pd.DataFrame(breast_cancer['data'], columns=breast_cancer['feature_names'])

In [None]:
# Dump flat data for the pyspark load tutorial.
# Create the output directory first: the pandas writers do not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('../tutorial-data', exist_ok=True)

# CSV: the index is written as a `datakey` column.
df_target.to_csv('../tutorial-data/breast_cancer_target.csv', index_label='datakey')
df.to_csv('../tutorial-data/breast_cancer_features.csv', index_label='datakey')

# JSON: pandas' default orientation.
df_target.to_json('../tutorial-data/breast_cancer_target.json')
df.to_json('../tutorial-data/breast_cancer_features.json')

# Parquet: keep the index in the file.
df_target.to_parquet('../tutorial-data/breast_cancer_target.parquet', index=True)
df.to_parquet('../tutorial-data/breast_cancer_features.parquet', index=True)

# Combining Both Dataframes

## Example Join Type 1: pd.concat

In [None]:
# Place labels and features side by side (column-wise concat aligned on the index).
pd.concat([df_target, df], axis='columns').head()

## Example Join Type 2: df.join

In [None]:
# Index-on-index join; 'left' is the default join type, spelled out here.
df_target.join(df, how='left').head()

In [None]:
# Keep the joined frame (labels + features); it is the input handed to Spark below.
df_complete = df_target.join(other=df, how='left')

In [None]:
# Preview the first five rows of the combined frame.
df_complete.head(n=5)

# Create Spark DataFrame (built on Resilient Distributed Datasets)

In [None]:
# Enable Arrow for the pandas -> Spark conversion. Spark 3.0 renamed the
# setting and deprecated the old key, so choose the key from the running
# Spark major version.
spark_major_version = int(spark.version.split('.')[0])
arrow_conf_key = (
    "spark.sql.execution.arrow.pyspark.enabled"
    if spark_major_version >= 3
    else "spark.sql.execution.arrow.enabled"
)
spark.conf.set(arrow_conf_key, "true")

# Despite the name, `df_rdd` is what spark.createDataFrame returns: a Spark
# DataFrame built from the pandas frame.
df_rdd = spark.createDataFrame(df_complete)

## Creating Temp SQL Table Reference

In [None]:
# Register the frame under a name that the spark.sql(...) queries below can use.
df_rdd.createOrReplaceTempView(name='SqlTutorial')

# Query Comparisons: SQL and Dot Notation

## Query 1: Limit

In [None]:
# Dot notation: cap the result at 10 rows in Spark, then pull it into pandas
# and preview it.
(
    df_rdd
    .limit(10)
    .toPandas()
    .head()
)

In [None]:
# SQL notation: the same 10-row cap, expressed as a query on the temp view.
limit_query = """
SELECT *
FROM SqlTutorial
LIMIT 10
"""
spark.sql(limit_query).toPandas().head()

## Query 2: Group By

In [None]:
# Dot notation: average `mean radius` per cancer type. The rename replaces the
# 'avg(mean radius)' column coming out of the dict-style agg with a friendlier name.
(
    df_rdd
    .groupBy('cancer_type')
    .agg({'mean radius': 'mean'})
    .withColumnRenamed('avg(mean radius)', 'AvgMeanRadius')
    .show()
)

In [None]:
# SQL notation: average `mean radius` per cancer type. The alias is
# AvgMeanRadius because the value is an AVG of the `mean radius` column (no
# median is computed), and it matches the name used by the dot-notation query.
spark.sql("""
SELECT 
    cancer_type AS CancerType,
    AVG(`mean radius`) AS AvgMeanRadius
FROM SqlTutorial
GROUP BY cancer_type
""").show()

## Query 3: Window and Aliases

In [None]:
from pyspark.sql.functions import col, lead
from pyspark.sql.window import Window

In [None]:
# Partition rows by cancer type and order each partition by mean radius, so
# lead() can look at the following row within the same type.
window = Window.partitionBy('cancer_type').orderBy('mean radius')

# `mean radius` of the row one position ahead in the window.
next_radius = lead('mean radius', 1).over(window)

(
    df_rdd
    .withColumn('next radius', next_radius)
    .select(['mean radius', 'next radius'])
    .show()
)

In [None]:
# SQL notation: the same look-ahead, using LEAD(...) OVER a window partitioned
# by cancer type and ordered by `mean radius`.
lead_query = """
SELECT
    `mean radius`,
    LEAD(`mean radius`) 
        OVER(PARTITION BY cancer_type ORDER BY `mean radius`) AS `next radius`
FROM SqlTutorial
"""
spark.sql(lead_query).show()