In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the session already running in this kernel, or start a new one.
# The application name makes this notebook identifiable in the Spark UI;
# the underlying SparkContext is reachable as spark.sparkContext.
spark = SparkSession.builder.appName("chp03").getOrCreate()

In [2]:
# Small hand-written sample for the outlier demo. Row id 3 carries a weight
# and an age far above every other row.
outlier_rows = [
    (1, 143.5, 5.3, 28),
    (2, 154.2, 5.5, 45),
    (3, 342.3, 5.1, 99),
    (4, 144.5, 5.5, 33),
    (5, 133.2, 5.4, 54),
    (6, 124.1, 5.1, 21),
    (7, 129.2, 5.3, 42),
]
outlier_schema = ['id', 'weight', 'height', 'age']

df_outliers = spark.createDataFrame(outlier_rows, outlier_schema)

In [3]:
# Echo the DataFrame's repr (column names and their types) as the cell output.
df_outliers

DataFrame[id: bigint, weight: double, height: double, age: bigint]

In [5]:
# For every numeric feature, compute the acceptance range used to flag
# outliers: [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], where IQR = Q3 - Q1.
# The result is stored in `bounds` as {feature: [lower, upper]}.
cols = ['weight', 'height', 'age']
bounds = {}

for feature in cols:
    print(feature, ':')

    # Approximate first and third quartiles; the last argument is the
    # relative error tolerated by the approximation.
    quantiles = df_outliers.approxQuantile(feature, [0.25, 0.75], 0.05)
    print('quantiles:', quantiles)

    q1 = quantiles[0]
    q3 = quantiles[1]
    iqr = q3 - q1
    print('IQR:', iqr)

    # Distance beyond the quartiles that is still considered normal.
    margin = 1.5 * iqr
    bounds[feature] = [q1 - margin, q3 + margin]

    print('-------------------------------------------------------')

print(bounds)

weight :
quantiles: [129.2, 154.2]
IQR: 25.0
-------------------------------------------------------
height :
quantiles: [5.1, 5.5]
IQR: 0.40000000000000036
-------------------------------------------------------
age :
quantiles: [28.0, 54.0]
IQR: 26.0
-------------------------------------------------------
{'weight': [91.69999999999999, 191.7], 'height': [4.499999999999999, 6.1000000000000005], 'age': [-11.0, 93.0]}


In [6]:
# One boolean column per feature, named '<feature>_o': True when the value
# lies below the lower bound or above the upper bound stored in `bounds`.
outlier_flags = [
    (
        (df_outliers[name] < bounds[name][0])
        | (df_outliers[name] > bounds[name][1])
    ).alias(name + '_o')
    for name in cols
]

# Keep the id so the flags can be matched back to the original rows.
outliers = df_outliers.select('id', *outlier_flags)
outliers.show()

+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
|  1|   false|   false|false|
|  2|   false|   false|false|
|  3|    true|   false| true|
|  4|   false|   false|false|
|  5|   false|   false|false|
|  6|   false|   false|false|
|  7|   false|   false|false|
+---+--------+--------+-----+



In [7]:
# Attach the outlier flags to the original rows, matching on 'id'.
#
# This cell reassigns df_outliers, so a plain join would add a second copy of
# every flag column each time the cell is re-run, and later references such as
# filter('weight_o') would then be ambiguous. Dropping the flag columns first
# keeps the cell idempotent: drop() ignores names that are not present, so on
# the first run it changes nothing.
flag_cols = [name for name in outliers.columns if name != 'id']
df_outliers = df_outliers.drop(*flag_cols).join(outliers, on='id')
df_outliers.show()

+---+------+------+---+--------+--------+-----+
| id|weight|height|age|weight_o|height_o|age_o|
+---+------+------+---+--------+--------+-----+
|  7| 129.2|   5.3| 42|   false|   false|false|
|  6| 124.1|   5.1| 21|   false|   false|false|
|  5| 133.2|   5.4| 54|   false|   false|false|
|  1| 143.5|   5.3| 28|   false|   false|false|
|  3| 342.3|   5.1| 99|    true|   false| true|
|  2| 154.2|   5.5| 45|   false|   false|false|
|  4| 144.5|   5.5| 33|   false|   false|false|
+---+------+------+---+--------+--------+-----+



In [8]:
# Rows whose weight was flagged as an outlier.
(
    df_outliers
    .where('weight_o')
    .select('id', 'weight')
    .show()
)

# Rows whose age was flagged as an outlier.
(
    df_outliers
    .where('age_o')
    .select('id', 'age')
    .show()
)

+---+------+
| id|weight|
+---+------+
|  3| 342.3|
+---+------+

+---+---+
| id|age|
+---+---+
|  3| 99|
+---+---+

