# Multiclass Classification on Spark

The aim of this document is to execute a simple multiclass-classification on Spark

In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=[x[:-5] for x in iris.feature_names])
df['species'] = [iris.target_names[x] for x in iris.target]
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(df.drop('species',axis=1));

In [4]:
dg = pd.DataFrame(scaler.transform(df.drop('species',axis=1)), columns=df.columns[:-1])
dg['label'] = df['species']
dg.tail()

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
145,1.038005,-0.124958,0.819624,1.447956,virginica
146,0.553333,-1.281972,0.705893,0.922064,virginica
147,0.795669,-0.124958,0.819624,1.053537,virginica
148,0.432165,0.800654,0.933356,1.447956,virginica
149,0.068662,-0.124958,0.762759,0.790591,virginica


In [5]:
from sklearn.model_selection import train_test_split
from itertools import product
from collections import defaultdict

def validation(df,target,estimator,grid_param,seed=42):
    X_train, X_val, y_train, y_val = train_test_split(df.drop(target, axis=1), df[target],
                                                      test_size=0.4, random_state=seed)
    param_names = sorted(grid_param.keys())
    df_score = defaultdict(list)

    for param_values in product(*[grid_param[name] for name in param_names]):
        param_tmp = {theta:val for theta,val in zip(param_names,param_values)}
        for theta in param_names:
            df_score[theta].append(param_tmp[theta])

        model = estimator(**param_tmp)
        model.fit(X_train,y_train)
        df_score['accuracy'].append(np.mean(model.predict(X_val) == y_val))

    return pd.DataFrame(df_score, columns=param_names+['accuracy'])

In [6]:
from sklearn.ensemble import RandomForestClassifier
param_rf = {'n_estimators': [10,20,30], 'max_depth': [3,5,7]}

validation(dg,'label',RandomForestClassifier,param_rf)

Unnamed: 0,max_depth,n_estimators,accuracy
0,3,10,0.983333
1,3,20,0.983333
2,3,30,0.983333
3,5,10,0.983333
4,5,20,0.983333
5,5,30,0.983333
6,7,10,0.983333
7,7,20,1.0
8,7,30,0.983333


In [7]:
df.reset_index().to_csv('/tmp/iris.csv', index=False)

Spark

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
                    .appName("Multiclass classification on Spark")\
                    .config("spark.some.config.option", "some-value")\
                    .getOrCreate()

In [9]:
## load the data
df = spark.read.csv('/tmp/iris.csv', header=True)
print(df.columns)

['index', 'sepal length', 'sepal width', 'petal length', 'petal width', 'species']


In [10]:
df.printSchema()

root
 |-- index: string (nullable = true)
 |-- sepal length: string (nullable = true)
 |-- sepal width: string (nullable = true)
 |-- petal length: string (nullable = true)
 |-- petal width: string (nullable = true)
 |-- species: string (nullable = true)



In [11]:
for col in df.columns[:-1]:
    df = df.withColumn(col, df[col].cast('float'))

In [12]:
df.printSchema()

root
 |-- index: float (nullable = true)
 |-- sepal length: float (nullable = true)
 |-- sepal width: float (nullable = true)
 |-- petal length: float (nullable = true)
 |-- petal width: float (nullable = true)
 |-- species: string (nullable = true)



In [13]:
from pyspark.ml.feature import RFormula
formula = RFormula(formula='species ~ . - index', featuresCol="features", labelCol="label")

## check
output1 = formula.fit(df).transform(df)
output1.select(['index','features','label']).show(5, truncate=False)

+-----+----------------------------------------------------------------------------+-----+
|index|features                                                                    |label|
+-----+----------------------------------------------------------------------------+-----+
|0.0  |[5.099999904632568,3.5,1.399999976158142,0.20000000298023224]               |2.0  |
|1.0  |[4.900000095367432,3.0,1.399999976158142,0.20000000298023224]               |2.0  |
|2.0  |[4.699999809265137,3.200000047683716,1.2999999523162842,0.20000000298023224]|2.0  |
|3.0  |[4.599999904632568,3.0999999046325684,1.5,0.20000000298023224]              |2.0  |
|4.0  |[5.0,3.5999999046325684,1.399999976158142,0.20000000298023224]              |2.0  |
+-----+----------------------------------------------------------------------------+-----+
only showing top 5 rows



In [14]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

## check
output2 = scaler.fit(output1).transform(output1)
output2.select(['index','scaledFeatures','label']).show(5, truncate=False)

+-----+----------------------------------------------------------------------------------+-----+
|index|scaledFeatures                                                                    |label|
+-----+----------------------------------------------------------------------------------+-----+
|0.0  |[-0.8976740030829902,1.0286112757883925,-1.3367940314975197,-1.3085928211141895]  |2.0  |
|1.0  |[-1.1392003814247418,-0.12454038525827196,-1.3367940314975197,-1.3085928211141895]|2.0  |
|2.0  |[-1.380727335610802,0.33672038913350605,-1.3934698798519933,-1.3085928211141895]  |2.0  |
|3.0  |[-1.5014905247816779,0.10608972700483654,-1.2801181831430464,-1.3085928211141895] |2.0  |
|4.0  |[-1.018437192253866,1.259241388051501,-1.3367940314975197,-1.3085928211141895]    |2.0  |
+-----+----------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [15]:
from pyspark.ml.classification import RandomForestClassifier
estimator_rf = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='label')

## check
output3 = estimator_rf.fit(output2).transform(output2)
output3.select(['index','species','label','rawPrediction','probability','prediction']).show()

+-----+-------+-----+--------------+---------------+----------+
|index|species|label| rawPrediction|    probability|prediction|
+-----+-------+-----+--------------+---------------+----------+
|  0.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  1.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  2.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  3.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  4.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  5.0| setosa|  2.0|[1.0,0.0,19.0]|[0.05,0.0,0.95]|       2.0|
|  6.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  7.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  8.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
|  9.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
| 10.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
| 11.0| setosa|  2.0|[0.0,0.0,20.0]|  [0.0,0.0,1.0]|       2.0|
| 12.0| setosa|  2.0|[0.0,0.0,20.0]|  [0

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator

formula = RFormula(formula='species ~ . - index', featuresCol="features", labelCol="label")
scaler = StandardScaler(inputCol=formula.getFeaturesCol(), outputCol="scaledFeatures",
                        withStd=True, withMean=True)

In [17]:
estimator_rf = RandomForestClassifier(featuresCol=scaler.getOutputCol(), labelCol='label', seed=3)
grid_rf = ParamGridBuilder().addGrid(estimator_rf.maxDepth, [3,5,7])\
                            .addGrid(estimator_rf.numTrees, [10,20,30])\
                            .build()

pipeline_rf = Pipeline(stages=[formula, scaler, estimator_rf])

In [18]:
tvs_rf = TrainValidationSplit(estimator=pipeline_rf,
                              estimatorParamMaps=grid_rf,
                              evaluator=BinaryClassificationEvaluator(),
                              trainRatio=0.7)

In [19]:
model_rf = tvs_rf.fit(df)

In [20]:
model_rf.bestModel

PipelineModel_45bf9477d9f612c99000

In [21]:
from collections import defaultdict

def validation_result(grid,metrics,value='score',sort=False):
    df = defaultdict(list)

    for param in grid:
        for param_obj, param_val in param.items():
            df[param_obj.name].append(param_val)

    df = pd.DataFrame(df)
    df[value] = metrics

    if sort:
        return df.sort_values(by=value, ascending=False)
    else:
        return df

validation_result(grid_rf, model_rf.validationMetrics,sort=True)

Unnamed: 0,maxDepth,numTrees,score
0,3,10,0.900744
6,3,30,0.868486
3,3,20,0.856079
7,5,30,0.760546
8,7,30,0.756824
1,5,10,0.755583
2,7,10,0.754342
4,5,20,0.715881
5,7,20,0.715881
