Jan 11, 2021

In [5]:
import numpy as np
import pandas as pd

# Spark statistics / aggregate functions
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import col, regexp_extract, regexp_replace, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [2]:
# Reuse the running SparkContext when this cell is executed again; calling
# SparkContext('local') a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel this still starts a context with master 'local'.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))
# SQL entry point bound to that context.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession; kept so the existing name sqlCtx stays available.
sqlCtx = SQLContext(sc)

In [3]:
# Sample (ID, Notes) records used by the regex examples in this notebook.
data = [
    ('2345', 'Checked by John'),
    ('2398', 'Verified by Stacy'),
    ('2328', 'Verified by Srinivas than some random text'),
    ('3983', 'Double Checked on 2/23/17 by Marsha'),
]
# Distribute the records as an RDD, then name the columns to get a DataFrame.
notes_rdd = sc.parallelize(data)
df = notes_rdd.toDF(['ID', 'Notes'])
df.show()

+----+--------------------+
|  ID|               Notes|
+----+--------------------+
|2345|     Checked by John|
|2398|   Verified by Stacy|
|2328|Verified by Srini...|
|3983|Double Checked on...|
+----+--------------------+



# 문자열 정규식
- rlike
- regexp_extract
- regexp_replace

# rlike
- where절과 함께
- 데이터 검색

In [4]:
# rlike() does a regex search, so every row whose Notes contains 'John' is kept.
mentions_john = col('Notes').rlike('John')
df.filter(mentions_john).show()

+----+---------------+
|  ID|          Notes|
+----+---------------+
|2345|Checked by John|
+----+---------------+



# regexp_extract
- withColumn절과 함께
- 데이터 추출

In [6]:
# withColumn(): replaces the column if it already exists, otherwise appends it.
# [...]  : matches any single character from the set inside the brackets
# +      : one or more repetitions of the preceding item
# idx 0  : group index 0 returns the entire matched text, not a sub-group
by_name = regexp_extract(col('Notes'), 'by [a-zA-Z]+', 0)
df.withColumn('fname', by_name).show()

+----+--------------------+-----------+
|  ID|               Notes|      fname|
+----+--------------------+-----------+
|2345|     Checked by John|    by John|
|2398|   Verified by Stacy|   by Stacy|
|2328|Verified by Srini...|by Srinivas|
|3983|Double Checked on...|  by Marsha|
+----+--------------------+-----------+



## 그룹 지정

In [8]:
# (...) marks a capture group; groups are numbered left to right starting at 1.
# Here group 1 is 'by' and group 2 is the name, so index 2 returns only the name.
name_only = regexp_extract(col('Notes'), '(by) ([a-zA-Z]+)', 2)
df.withColumn('fname', name_only).show()

+----+--------------------+--------+
|  ID|               Notes|   fname|
+----+--------------------+--------+
|2345|     Checked by John|    John|
|2398|   Verified by Stacy|   Stacy|
|2328|Verified by Srini...|Srinivas|
|3983|Double Checked on...|  Marsha|
+----+--------------------+--------+



In [12]:
# [a-zA-Z0-9_/]+ : one or more letters, digits, '_' or '/'.
# Including '/' lets the token before ' by ' be a date such as 2/23/17.
word_by_name = regexp_extract(col('Notes'), '([a-zA-Z0-9_/]+) by ([a-zA-Z]+)', 0)
df.withColumn('fname', word_by_name).show()

+----+--------------------+--------------------+
|  ID|               Notes|               fname|
+----+--------------------+--------------------+
|2345|     Checked by John|     Checked by John|
|2398|   Verified by Stacy|   Verified by Stacy|
|2328|Verified by Srini...|Verified by Srinivas|
|3983|Double Checked on...|   2/23/17 by Marsha|
+----+--------------------+--------------------+



In [15]:
# \w     : shorthand for [a-zA-Z0-9_]
# [\w/]+ : same as [a-zA-Z0-9_/]+
# Raw string (r'...') so Python hands the backslash in \w to the regex engine
# unchanged instead of treating it as an (invalid) string escape sequence.
df.withColumn( 'fname', regexp_extract(df['Notes'], r'([\w/]+) by ([a-zA-Z]+)', 0) ).show()

+----+--------------------+--------------------+
|  ID|               Notes|               fname|
+----+--------------------+--------------------+
|2345|     Checked by John|     Checked by John|
|2398|   Verified by Stacy|   Verified by Stacy|
|2328|Verified by Srini...|Verified by Srinivas|
|3983|Double Checked on...|   2/23/17 by Marsha|
+----+--------------------+--------------------+



# regexp_replace

In [19]:
# Substitute every regex match of 'by' in Notes with 'and'.
by_to_and = regexp_replace(col('Notes'), 'by', 'and')
df.withColumn('fname', by_to_and).show()

+----+--------------------+--------------------+
|  ID|               Notes|               fname|
+----+--------------------+--------------------+
|2345|     Checked by John|    Checked and John|
|2398|   Verified by Stacy|  Verified and Stacy|
|2328|Verified by Srini...|Verified and Srin...|
|3983|Double Checked on...|Double Checked on...|
+----+--------------------+--------------------+



In [20]:
# The pattern also consumes the name after 'by', so 'by <name>' collapses to 'and'.
drop_by_name = regexp_replace(col('Notes'), 'by [a-zA-Z]+', 'and')
df.withColumn('fname', drop_by_name).show()

+----+--------------------+--------------------+
|  ID|               Notes|               fname|
+----+--------------------+--------------------+
|2345|     Checked by John|         Checked and|
|2398|   Verified by Stacy|        Verified and|
|2328|Verified by Srini...|Verified and than...|
|3983|Double Checked on...|Double Checked on...|
+----+--------------------+--------------------+

