In [1]:
# Make the local Spark installation importable from the Jupyter kernel.
import os

import findspark

# Prefer SPARK_HOME when it is set so the notebook is not tied to one machine;
# otherwise fall back to the original hard-coded install location.
findspark.init(os.environ.get('SPARK_HOME', 'C:/spark/spark-3.0.1-bin-hadoop2.7'))

In [2]:
# pyspark import
import pyspark
from pyspark import SparkConf,SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql import SparkSession

In [3]:
# Connect to PySpark.
# getOrCreate() reuses an already running context/session, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once" and a
# kernel restart is not needed just to recover from that error.
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [6]:
# Toy data set that contains deliberate duplicates:
#   - ids 1 and 4 carry identical values in every column except id,
#   - the row with id 3 appears twice,
#   - id 5 is used by two different rows.
df = spark.createDataFrame(
    data=[
        (1, 144.5, 5.9, 33, 'M'),
        (2, 167.2, 5.4, 45, 'M'),
        (3, 124.1, 5.2, 23, 'F'),
        (4, 144.5, 5.9, 33, 'M'),
        (5, 133.2, 5.7, 54, 'F'),
        (3, 124.1, 5.2, 23, 'F'),
        (5, 129.2, 5.3, 42, 'M'),
    ],
    schema=['id', 'weight', 'height', 'age', 'gender'],
)

In [5]:
# Total number of rows, then the number of fully distinct rows, one per line.
print(df.count(), df.distinct().count(), sep='\n')

7
6


# 중복된 값 처리하기

In [6]:
# Drop rows that are identical in every column. No `subset` is passed here, so
# `id` takes part in the comparison: rows that differ only in `id` are NOT
# treated as duplicates and survive this step. Restricting the comparison to
# selected columns requires the `subset` parameter.
df = df.dropDuplicates()

In [7]:
# Display the DataFrame after the fully identical rows were dropped.
df.show()

+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  5| 133.2|   5.7| 54|     F|
|  5| 129.2|   5.3| 42|     M|
|  1| 144.5|   5.9| 33|     M|
|  4| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
+---+------+------+---+------+



In [8]:
# Number of distinct rows when the id column is left out of the comparison.
df.drop('id').distinct().count()

5

In [9]:
# Drop rows that repeat another row in every column other than id.
# NOTE(review): which of the matching rows is kept is presumably not
# guaranteed by Spark -- confirm before relying on a specific surviving id.
df = df.dropDuplicates(
    subset=[name for name in df.columns if name != 'id']
)

In [10]:
# Display the DataFrame after dropping rows that duplicate another row in
# every column except id.
df.show()

+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  5| 133.2|   5.7| 54|     F|
|  1| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
|  5| 129.2|   5.3| 42|     M|
+---+------+------+---+------+



In [7]:
import pyspark.sql.functions as fn

# Compare how many ids there are with how many of them are distinct.
df.agg(
    fn.count(fn.col('id')).alias('count'),  # output column named 'count'
    fn.countDistinct(fn.col('id')).alias('distinct'),  # output column named 'distinct'
).show()

+-----+--------+
|count|distinct|
+-----+--------+
|    7|       5|
+-----+--------+



In [12]:
# Give every row a fresh id so that duplicated ids no longer clash.
# The generated values are increasing but, as the output shows, not consecutive.
df.select('*', fn.monotonically_increasing_id().alias('new_id')).show()

+---+------+------+---+------+-------------+
| id|weight|height|age|gender|       new_id|
+---+------+------+---+------+-------------+
|  5| 133.2|   5.7| 54|     F|  25769803776|
|  1| 144.5|   5.9| 33|     M| 171798691840|
|  2| 167.2|   5.4| 45|     M| 592705486848|
|  3| 124.1|   5.2| 23|     F|1236950581248|
|  5| 129.2|   5.3| 42|     M|1365799600128|
+---+------+------+---+------+-------------+



# 결측치 처리하기

In [4]:
# Toy data set with missing values (None) scattered over several columns.
df_miss = spark.createDataFrame(
    data=[
        (1, 143.5, 5.6, 28, 'M', 100000),
        (2, 167.2, 5.4, 45, 'M', None),
        (3, None, 5.2, None, None, None),
        (4, 144.5, 5.9, 33, 'M', None),
        (5, 133.2, 5.7, 54, 'F', None),
        (6, 124.1, 5.2, None, 'F', None),
        (7, 129.2, 5.3, 42, 'M', 76000),
    ],
    schema=['id', 'weight', 'height', 'age', 'gender', 'income'],
)

In [8]:
# Count the missing (None) values in each row, returned as (id, n_missing) pairs.
# `is None` is the proper identity test for null, and the generator avoids
# building a temporary list just to sum it.
df_miss.rdd.map(
    lambda row: (row['id'], sum(value is None for value in row))
).collect()

[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]

In [9]:
# Inspect the row with id 3, the one with the most missing values.
df_miss.filter(df_miss['id'] == 3).show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  3|  null|   5.2|null|  null|  null|
+---+------+------+----+------+------+



In [10]:
# Fraction of missing values in each column.
# Passing '*' as the column name to count() counts every row, whereas
# count(<column>) only counts that column's non-null values.
# The * in front of the sequence is different: it unpacks the sequence so that
# agg() receives each element as a separate argument.
df_miss.agg(*(
    (1 - fn.count(name) / fn.count('*')).alias(name + '_missing')
    for name in df_miss.columns
)).show()

+----------+------------------+--------------+------------------+------------------+------------------+
|id_missing|    weight_missing|height_missing|       age_missing|    gender_missing|    income_missing|
+----------+------------------+--------------+------------------+------------------+------------------+
|       0.0|0.1428571428571429|           0.0|0.2857142857142857|0.1428571428571429|0.7142857142857143|
+----------+------------------+--------------+------------------+------------------+------------------+



In [11]:
# Remove the income feature, which is missing in most rows.
df_miss_no_income = df_miss.drop('income')

In [12]:
# Keep only the rows that have at least 3 non-null values.
df_miss_no_income.na.drop(thresh=3).show()

+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
|  1| 143.5|   5.6|  28|     M|
|  2| 167.2|   5.4|  45|     M|
|  4| 144.5|   5.9|  33|     M|
|  5| 133.2|   5.7|  54|     F|
|  6| 124.1|   5.2|null|     F|
|  7| 129.2|   5.3|  42|     M|
+---+------+------+----+------+



In [14]:
# Impute missing values: numeric columns get their column mean, gender gets
# the placeholder category 'missing'.
# The aggregate is a single row, so Row.asDict() yields the fill map directly
# without a pandas round-trip. 'id' is skipped because it is an identifier,
# not a measurement that should be imputed.
means = df_miss_no_income.agg(
    *[fn.mean(c).alias(c)
      for c in df_miss_no_income.columns if c not in ('id', 'gender')]
).first().asDict()

means['gender'] = 'missing'

# NOTE: age is built from Python ints, so its fractional mean shows up as a
# whole number (40) in the filled column.
df_miss_no_income.fillna(means).show()

+---+------------------+------+---+-------+
| id|            weight|height|age| gender|
+---+------------------+------+---+-------+
|  1|             143.5|   5.6| 28|      M|
|  2|             167.2|   5.4| 45|      M|
|  3|140.28333333333333|   5.2| 40|missing|
|  4|             144.5|   5.9| 33|      M|
|  5|             133.2|   5.7| 54|      F|
|  6|             124.1|   5.2| 40|      F|
|  7|             129.2|   5.3| 42|      M|
+---+------------------+------+---+-------+



In [15]:
# Remove rows with missing data via dropna(): only rows that have at least
# 3 non-null values are kept.
df_miss_no_income.na.drop(thresh=3).show()

+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
|  1| 143.5|   5.6|  28|     M|
|  2| 167.2|   5.4|  45|     M|
|  4| 144.5|   5.9|  33|     M|
|  5| 133.2|   5.7|  54|     F|
|  6| 124.1|   5.2|null|     F|
|  7| 129.2|   5.3|  42|     M|
+---+------+------+----+------+



In [None]:
# Impute missing values: numeric columns get their column mean, gender gets
# the placeholder category 'missing'.
# The aggregate is a single row, so Row.asDict() yields the fill map directly
# without a pandas round-trip. 'id' is skipped because it is an identifier,
# not a measurement that should be imputed.
means = df_miss_no_income.agg(
    *[fn.mean(c).alias(c)
      for c in df_miss_no_income.columns if c not in ('id', 'gender')]
).first().asDict()

means['gender'] = 'missing'

# NOTE(review): age is built from Python ints, so its fractional mean is
# presumably cast to a whole number when filled -- confirm in the output.
df_miss_no_income.fillna(means).show()