In [1]:
# Execute the shared paths script inside this kernel so the names it defines are
# available to later cells (presumably including `spark_warehouse_path`, which the
# Spark session setup reads -- confirm in Includes/paths.py).
%run ./Includes/paths.py

In [3]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *
import pandas as pd

# Start Spark with Delta Lake support and a Hive-backed catalog.
# Hive support must be requested on the builder *before* the session is created:
# calling `enableHiveSupport()` on a builder after `getOrCreate()` only changes that
# builder and has no effect on the session that is already running.
# `spark_warehouse_path` is not defined in this cell (presumably it comes from the
# `%run ./Includes/paths.py` cell -- confirm).
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
                .config("spark.sql.warehouse.dir", spark_warehouse_path)
                # Hive catalog, so that we can register UDFs in SQL
                .enableHiveSupport())

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# last expression of the cell: show the session summary
spark

In [3]:
def display(spark_df, rows=10):
    """Return a small pandas preview of a Spark DataFrame.

    Note: defining this name replaces IPython's built-in `display` helper in the
    notebook namespace for every cell that runs afterwards.

    Args:
        spark_df: object exposing `limit(n)` and `toPandas()` (a Spark DataFrame).
        rows: maximum number of rows to bring back to the driver (default 10).

    Returns:
        A pandas DataFrame holding at most `rows` rows.
    """
    # Cut the data down on the Spark side first so only `rows` rows are collected.
    limited = spark_df.limit(rows)
    preview = limited.toPandas()
    return preview.head(rows)

In [4]:
# Load the `sparksql_magic` IPython extension into this kernel
# (presumably to enable a Spark SQL cell magic in later cells -- confirm).
%load_ext sparksql_magic

In [92]:
# Read the Titanic training CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
csv_reader = spark.read.option('header', True).option('inferSchema', True)
df = csv_reader.csv('../Data/titanic/train.csv')

# Preview the first rows as a pandas frame.
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [93]:
# Print the column names, the types Spark inferred from the CSV, and nullability.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [94]:
# Keep only the rows where both Age and Sex are present; per the original note,
# RFormula won't work on this data unless it is cleaned like this first.
# `df` is rebound here, so later cells see the filtered frame.
not_null_predicate = 'Age is not null and Sex is not null'
df = df.where(not_null_predicate)

## Vectors

Features passed to an ML model must be represented as vectors of doubles.

In [8]:
from pyspark.ml.linalg import Vectors

# dense vector: every component is given explicitly
# (the integer arguments come back as doubles, as the cell output shows)
dense_vec = Vectors.dense(1, 2, 3.5)
dense_vec

DenseVector([1.0, 2.0, 3.5])

In [10]:
# sparse vector of size 10 built from an {index: value} dict;
# only the entries at indices 2 and 8 are specified
sparse_vec = Vectors.sparse(10, {2 : 18.1, 8 : -1.3})
sparse_vec

SparseVector(10, {2: 18.1, 8: -1.3})

In [11]:
# sparse vector of size 10 built from parallel lists: indices first, then values
# (rebinds `sparse_vec`; the cell output is identical to the dict-based form)
sparse_vec = Vectors.sparse(10, [2, 8], [18.1, -1.3])
sparse_vec

SparseVector(10, {2: 18.1, 8: -1.3})

## RFormula

In [95]:
from pyspark.ml.feature import RFormula

# R-style formula: `Survived` is the label, `Age` and `Sex` are the feature terms.
survival_formula = 'Survived ~ Age + Sex'
rform = RFormula(formula=survival_formula)
rform

RFormula_1855eb6ebaa9

In [96]:
# Fit the formula against the cleaned data; the resulting model reports the
# resolved label and terms (see the cell output).
fitted_rform = rform.fit(df)
fitted_rform

RFormulaModel: uid=RFormula_1855eb6ebaa9, resolvedFormula=ResolvedRFormula(label=Survived, terms=[Age,Sex], hasIntercept=true)

In [97]:
# create `features` column from Age and Sex
# (the output also gains a `label` column holding Survived as a double;
#  in the rows shown, Sex appears in `features` as 1.0 for male and 0.0 for female)
df_transformed = fitted_rform.transform(df)

display(df_transformed)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,features,label
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,"[22.0, 1.0]",0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,"[38.0, 0.0]",1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,"[26.0, 0.0]",1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,"[35.0, 0.0]",1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,"[35.0, 1.0]",0.0
5,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,"[54.0, 1.0]",0.0
6,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,"[2.0, 1.0]",0.0
7,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,"[27.0, 0.0]",1.0
8,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,"[14.0, 0.0]",1.0
9,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,"[4.0, 0.0]",1.0


## Train-Test Split

In [104]:
# Seed the 70/30 split so that Restart & Run All reproduces the same train/test
# rows, and therefore the same fitted model and metrics downstream.
SPLIT_SEED = 42
train, test = df_transformed.randomSplit([0.7, 0.3], seed=SPLIT_SEED)


def survival_summary(split_df):
    """Summarise how many passengers in `split_df` survived.

    `sum` and `count` here are the Spark column functions brought in by
    `from pyspark.sql.functions import *`, not the Python builtins.

    Args:
        split_df: Spark DataFrame with a `Survived` column.

    Returns:
        A one-row Spark DataFrame with `survived_count`, `total` and
        `survived_prop` (= survived_count / total).
    """
    totals = split_df.agg(sum('Survived').alias('survived_count'), count('*').alias('total'))
    return totals.selectExpr('survived_count', 'total', 'survived_count/total as survived_prop')


# Compare the survival rate of both splits to check they are similarly balanced.
survival_summary(train).show()
survival_summary(test).show()

+--------------+-----+-------------------+
|survived_count|total|      survived_prop|
+--------------+-----+-------------------+
|           207|  507|0.40828402366863903|
+--------------+-----+-------------------+

+--------------+-----+-------------------+
|survived_count|total|      survived_prop|
+--------------+-----+-------------------+
|            83|  207|0.40096618357487923|
+--------------+-----+-------------------+



## Estimators

In [105]:
from pyspark.ml.classification import LogisticRegression

# Unfitted logistic-regression estimator; it reads the `features` vector and the
# `label` column, which are the columns the RFormula step added.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
lr

LogisticRegression_bee414672ed7

In [75]:
# `explainParams()` returns one long string; printing it renders the newlines,
# whereas leaving it as the cell's last expression shows the raw repr with
# literal "\n" escapes.
print(lr.explainParams())

"aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nfamily: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)\nfeaturesCol: features column name. (default: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label)\nlowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\nlowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size must be

In [106]:
# Fit the estimator on the training split only; the result is a separate model
# object (the cell output reports 2 classes and 2 features).
fitted_lr = lr.fit(train)
fitted_lr

LogisticRegressionModel: uid=LogisticRegression_bee414672ed7, numClasses=2, numFeatures=2

In [107]:
# Apply the fitted model to the rows it was trained on; the output gains
# `rawPrediction`, `probability` and `prediction` columns.
# Note: these are predictions on the training split, not on the held-out `test` split.
train_transformed = fitted_lr.transform(train)

display(train_transformed)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,features,label,rawPrediction,probability,prediction
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,"[22.0, 1.0]",0.0,"[1.2647888779910677, -1.2647888779910677]","[0.7798493848033273, 0.22015061519667267]",0.0
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,"[26.0, 0.0]",1.0,"[-1.260387208836935, 1.260387208836935]","[0.2209072435768506, 0.7790927564231493]",1.0
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,"[35.0, 0.0]",1.0,"[-1.1666556660351481, 1.1666556660351481]","[0.23746002025284987, 0.7625399797471502]",1.0
3,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,"[35.0, 1.0]",0.0,"[1.4001788842603151, -1.4001788842603151]","[0.8022122732546563, 0.19778772674534373]",0.0
4,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,"[2.0, 1.0]",0.0,"[1.0564965606537635, -1.0564965606537635]","[0.7420204644399355, 0.2579795355600645]",0.0
5,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,,S,"[20.0, 1.0]",0.0,"[1.2439596462573372, -1.2439596462573372]","[0.7762524958856047, 0.22374750411439526]",0.0
6,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.275,,S,"[39.0, 1.0]",0.0,"[1.441837347727776, -1.441837347727776]","[0.8087390140193111, 0.19126098598068886]",0.0
7,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S,"[14.0, 0.0]",0.0,"[-1.3853625992393175, 1.3853625992393175]","[0.20014912357440146, 0.7998508764255985]",1.0
8,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S,"[55.0, 0.0]",1.0,"[-0.9583633486978441, 0.9583633486978441]","[0.2772059993650834, 0.7227940006349166]",1.0
9,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q,"[2.0, 1.0]",0.0,"[1.0564965606537635, -1.0564965606537635]","[0.7420204644399355, 0.2579795355600645]",0.0
