In [9]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import sklearn.datasets
import pandas as pd
import os

# Creating Spark Session

In [4]:
spark = SparkSession.builder.appName('SQLTutorial').getOrCreate()

# Extracting Test Data

In [5]:
# extracting from sklearn
breast_cancer = sklearn.datasets.load_breast_cancer()

# mapping binary labels to target nambes
target_map = dict(zip([1, 0], breast_cancer['target_names']))

# creating the label names dataframe
df_target = pd.DataFrame(breast_cancer['target'], columns=['cancer_type']).apply(lambda x: x.map(target_map))

# creating the feature dataframe
df = pd.DataFrame(breast_cancer['data'], columns=breast_cancer['feature_names'])

In [6]:
# dump flat data for pyspark load tutorial
df_target.to_csv('../tutorial-data/breast_cancer_target.csv', index_label='datakey')
df.to_csv('../tutorial-data/breast_cancer_features.csv', index_label='datakey')
df_target.to_json('../tutorial-data/breast_cancer_target.json')
df.to_json('../tutorial-data/breast_cancer_features.json')
df_target.to_parquet('../tutorial-data/breast_cancer_target.parquet', index=True)
df.to_parquet('../tutorial-data/breast_cancer_features.parquet', index=True)

# Combining Both Dataframes

## Example Join Type 1: pd.concat

In [4]:
pd.concat([df_target, df], axis=1).head()

Unnamed: 0,cancer_type,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,benign,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,benign,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,benign,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,benign,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,benign,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Example Join Type 2: df.join

In [5]:
df_target.join(df).head()

Unnamed: 0,cancer_type,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,benign,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,benign,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,benign,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,benign,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,benign,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
df_complete = df_target.join(df)

In [7]:
df_complete.head()

Unnamed: 0,cancer_type,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,benign,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,benign,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,benign,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,benign,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,benign,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Create Resilient Distributed Dataset

In [12]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df_rdd = spark.createDataFrame(df_complete)

## Creating Temp SQL Table Reference

In [14]:
df_rdd.createOrReplaceTempView('SqlTutorial')

# Query Comparisons: SQL and Dot Notation

## Query 1: Limit

In [15]:
df_rdd.limit(10).toPandas().head()



Unnamed: 0,cancer_type,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,benign,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,benign,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,benign,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,benign,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,benign,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [16]:
spark.sql("""
SELECT *
FROM SqlTutorial
LIMIT 10
""").toPandas().head()

Unnamed: 0,cancer_type,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,benign,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,benign,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,benign,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,benign,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,benign,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Query 2: Group By

In [17]:
df_rdd.groupBy('cancer_type')\
    .agg({'mean radius':'mean'})\
    .withColumnRenamed('avg(mean radius)', 'AvgMeanRadius')\
    .show()

+-----------+------------------+
|cancer_type|     AvgMeanRadius|
+-----------+------------------+
|  malignant|12.146523809523808|
|     benign|17.462830188679245|
+-----------+------------------+



In [18]:
spark.sql("""
SELECT 
    cancer_type AS CancerType,
    AVG(`mean radius`) AS AvgMedianRadius
FROM SqlTutorial
GROUP BY cancer_type
""").show()

+----------+------------------+
|CancerType|   AvgMedianRadius|
+----------+------------------+
| malignant|12.146523809523808|
|    benign|17.462830188679245|
+----------+------------------+



## Query 3: Window and Aliases

In [19]:
from pyspark.sql.functions import col, lead
from pyspark.sql.window import Window

In [20]:
window = Window.partitionBy('cancer_type').orderBy('mean radius')

df_rdd.withColumn(
    'next radius', 
    lead('mean radius', 1).over(window)
).select(['mean radius', 'next radius']).show()

+-----------+-----------+
|mean radius|next radius|
+-----------+-----------+
|      6.981|      7.691|
|      7.691|      7.729|
|      7.729|       7.76|
|       7.76|      8.196|
|      8.196|      8.219|
|      8.219|      8.571|
|      8.571|      8.597|
|      8.597|      8.598|
|      8.598|      8.618|
|      8.618|      8.671|
|      8.671|      8.726|
|      8.726|      8.734|
|      8.734|      8.878|
|      8.878|      8.888|
|      8.888|       8.95|
|       8.95|        9.0|
|        9.0|      9.029|
|      9.029|      9.042|
|      9.042|      9.173|
|      9.173|      9.268|
+-----------+-----------+
only showing top 20 rows



In [21]:
spark.sql("""
SELECT
    `mean radius`,
    LEAD(`mean radius`) 
        OVER(PARTITION BY cancer_type ORDER BY `mean radius`) AS `next radius`
FROM SqlTutorial
""").show()

+-----------+-----------+
|mean radius|next radius|
+-----------+-----------+
|      6.981|      7.691|
|      7.691|      7.729|
|      7.729|       7.76|
|       7.76|      8.196|
|      8.196|      8.219|
|      8.219|      8.571|
|      8.571|      8.597|
|      8.597|      8.598|
|      8.598|      8.618|
|      8.618|      8.671|
|      8.671|      8.726|
|      8.726|      8.734|
|      8.734|      8.878|
|      8.878|      8.888|
|      8.888|       8.95|
|       8.95|        9.0|
|        9.0|      9.029|
|      9.029|      9.042|
|      9.042|      9.173|
|      9.173|      9.268|
+-----------+-----------+
only showing top 20 rows

