Jan 11, 2021

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
import pandas as pd
import numpy as np

from pyspark.sql.functions import when, udf, col, regexp_extract, regexp_replace,count,sum,avg,round
from pyspark.sql.types import DoubleType,IntegerType, StringType

# 스파크 통계
import pyspark.sql.functions as F

In [2]:
sc = SparkContext( 'local' ) 
sqlCtx = SQLContext( sc )

In [280]:
df = sqlCtx.read.csv( '../data/airline-passengers.csv',
                    header = True, inferSchema = True)

df.toPandas()

Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [281]:
df.printSchema()
# Month를 데이터 타입으로

root
 |-- Month: string (nullable = true)
 |-- Passengers: integer (nullable = true)



# 시계열 데이터로 전환

In [282]:
df1 = df.withColumn( 'parsed', F.to_timestamp( 'Month', format='yyyy-MM') )

In [31]:
df1.where(df1['parsed'] >= '1950' ).show() # 1950년도 이후 데이터 필터링
# df1.where(" parsed >= '1950' ").show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
|1950-09|       158|1950-09-01 00:00:00|
|1950-10|       133|1950-10-01 00:00:00|
|1950-11|       114|1950-11-01 00:00:00|
|1950-12|       140|1950-12-01 00:00:00|
|1951-01|       145|1951-01-01 00:00:00|
|1951-02|       150|1951-02-01 00:00:00|
|1951-03|       178|1951-03-01 00:00:00|
|1951-04|       163|1951-04-01 00:00:00|
|1951-05|       172|1951-05-01 00:00:00|
|1951-06|       178|1951-06-01 00:00:00|
|1951-07|       199|1951-07-01 00:00:00|
|1951-08|       199|1951-08-01 00:00:00|
+-------+----------+-------------------+
only showing top

In [32]:
df1.where(df1['parsed'] >= '1960-03' ).show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1960-03|       419|1960-03-01 00:00:00|
|1960-04|       461|1960-04-01 00:00:00|
|1960-05|       472|1960-05-01 01:00:00|
|1960-06|       535|1960-06-01 00:00:00|
|1960-07|       622|1960-07-01 00:00:00|
|1960-08|       606|1960-08-01 00:00:00|
|1960-09|       508|1960-09-01 00:00:00|
|1960-10|       461|1960-10-01 00:00:00|
|1960-11|       390|1960-11-01 00:00:00|
|1960-12|       432|1960-12-01 00:00:00|
+-------+----------+-------------------+



In [34]:
df1.groupBy( )

<bound method DataFrame.groupBy of DataFrame[Month: string, Passengers: int, parsed: timestamp]>

In [36]:
df1.withColumn( 'y', F.year( 'parsed') ).show() # 연도만 추출

+-------+----------+-------------------+----+
|  Month|Passengers|             parsed|   y|
+-------+----------+-------------------+----+
|1949-01|       112|1949-01-01 00:00:00|1949|
|1949-02|       118|1949-02-01 00:00:00|1949|
|1949-03|       132|1949-03-01 00:00:00|1949|
|1949-04|       129|1949-04-01 00:00:00|1949|
|1949-05|       121|1949-05-01 00:00:00|1949|
|1949-06|       135|1949-06-01 00:00:00|1949|
|1949-07|       148|1949-07-01 00:00:00|1949|
|1949-08|       148|1949-08-01 00:00:00|1949|
|1949-09|       136|1949-09-01 00:00:00|1949|
|1949-10|       119|1949-10-01 00:00:00|1949|
|1949-11|       104|1949-11-01 00:00:00|1949|
|1949-12|       118|1949-12-01 00:00:00|1949|
|1950-01|       115|1950-01-01 00:00:00|1950|
|1950-02|       126|1950-02-01 00:00:00|1950|
|1950-03|       141|1950-03-01 00:00:00|1950|
|1950-04|       135|1950-04-01 01:00:00|1950|
|1950-05|       125|1950-05-01 00:00:00|1950|
|1950-06|       149|1950-06-01 00:00:00|1950|
|1950-07|       170|1950-07-01 00:

In [39]:
df1.withColumn( 'm', F.month('parsed')).select('m').show() # 월만 추출

+---+
|  m|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+
only showing top 20 rows



# F.date_format()

In [41]:
df1.withColumn( 'f', F.date_format('parsed', 'yyyy년 MM월 dd일' )).show()

+-------+----------+-------------------+----------------+
|  Month|Passengers|             parsed|               f|
+-------+----------+-------------------+----------------+
|1949-01|       112|1949-01-01 00:00:00|1949년 01월 01일|
|1949-02|       118|1949-02-01 00:00:00|1949년 02월 01일|
|1949-03|       132|1949-03-01 00:00:00|1949년 03월 01일|
|1949-04|       129|1949-04-01 00:00:00|1949년 04월 01일|
|1949-05|       121|1949-05-01 00:00:00|1949년 05월 01일|
|1949-06|       135|1949-06-01 00:00:00|1949년 06월 01일|
|1949-07|       148|1949-07-01 00:00:00|1949년 07월 01일|
|1949-08|       148|1949-08-01 00:00:00|1949년 08월 01일|
|1949-09|       136|1949-09-01 00:00:00|1949년 09월 01일|
|1949-10|       119|1949-10-01 00:00:00|1949년 10월 01일|
|1949-11|       104|1949-11-01 00:00:00|1949년 11월 01일|
|1949-12|       118|1949-12-01 00:00:00|1949년 12월 01일|
|1950-01|       115|1950-01-01 00:00:00|1950년 01월 01일|
|1950-02|       126|1950-02-01 00:00:00|1950년 02월 01일|
|1950-03|       141|1950-03-01 00:00:00|1950년 03월 01일|
|

In [79]:
df1.select( F.year('parsed').alias('year'), 'Passengers').\
    groupBy( 'year' ).mean('Passengers').\
    orderBy('year').withColumn('avg(Passengers)', F.round('avg(Passengers)',2)).show()

+----+---------------+
|year|avg(Passengers)|
+----+---------------+
|1949|         126.67|
|1950|         139.67|
|1951|         170.17|
|1952|          197.0|
|1953|          225.0|
|1954|         238.92|
|1955|          284.0|
|1956|         328.25|
|1957|         368.42|
|1958|          381.0|
|1959|         428.33|
|1960|         476.17|
+----+---------------+



In [76]:
df1.selectExpr( 'year(parsed) as year','Passengers').show()

+----+----------+
|year|Passengers|
+----+----------+
|1949|       112|
|1949|       118|
|1949|       132|
|1949|       129|
|1949|       121|
|1949|       135|
|1949|       148|
|1949|       148|
|1949|       136|
|1949|       119|
|1949|       104|
|1949|       118|
|1950|       115|
|1950|       126|
|1950|       141|
|1950|       135|
|1950|       125|
|1950|       149|
|1950|       170|
|1950|       170|
+----+----------+
only showing top 20 rows



In [69]:
df1.groupBy( F.year('parsed') ).mean().orderBy('year(parsed)').show()

+------------+------------------+
|year(parsed)|   avg(Passengers)|
+------------+------------------+
|        1949|126.66666666666667|
|        1950|139.66666666666666|
|        1951|170.16666666666666|
|        1952|             197.0|
|        1953|             225.0|
|        1954|238.91666666666666|
|        1955|             284.0|
|        1956|            328.25|
|        1957| 368.4166666666667|
|        1958|             381.0|
|        1959| 428.3333333333333|
|        1960| 476.1666666666667|
+------------+------------------+



# 연습문제

In [None]:
1. 3개의 데이터를 병합하시요.

====이후문제는 병합한 데이터프레임을 이용 ===========


2. 년도 및 월별 평균 사망자를 보여주시요

3. 2015년 1월 ~ 7월 데이터를 보여주시요.

4.  2016년 사고대비 사망율을 구하시요

5. 2015년 대비  사망이 가장 많이 증가한
2016년 도 월을 구하시요


In [94]:
df1 = sqlCtx.read.csv( path   = '../data/2014년 졸음운전 교통사고.csv',
                      header = True,
                      inferSchema = True, # shcema 자동 추정,
                      encoding = 'euc-kr')
df2 = sqlCtx.read.csv( path   = '../data/2015년 졸음운전 교통사고.csv',
                      header = True,
                      inferSchema = True,# shcema 자동 추정,
                      encoding = 'euc-kr')
df3 = sqlCtx.read.csv( path   = '../data/2016년 졸음운전 교통사고.csv',
                      header = True,
                      inferSchema = True,# shcema 자동 추정,
                      encoding = 'euc-kr')
df1.show()

+----------+--------+--------+--------+
|      구분|사고(건)|사망(명)|부상(명)|
+----------+--------+--------+--------+
| 2014년1월|     169|       7|     347|
| 2014년2월|     145|       3|     268|
| 2014년3월|     206|      12|     358|
| 2014년4월|     182|       4|     360|
| 2014년5월|     218|      17|     474|
| 2014년6월|     233|      13|     457|
| 2014년7월|     229|      14|     431|
| 2014년8월|     222|       9|     447|
| 2014년9월|     225|      15|     430|
|2014년10월|     210|      14|     392|
|2014년11월|     209|      10|     386|
|2014년12월|     178|      12|     329|
+----------+--------+--------+--------+



In [95]:
df = df1.unionAll( df2 ).unionAll( df3 )
df.show()
df.printSchema()

+----------+--------+--------+--------+
|      구분|사고(건)|사망(명)|부상(명)|
+----------+--------+--------+--------+
| 2014년1월|     169|       7|     347|
| 2014년2월|     145|       3|     268|
| 2014년3월|     206|      12|     358|
| 2014년4월|     182|       4|     360|
| 2014년5월|     218|      17|     474|
| 2014년6월|     233|      13|     457|
| 2014년7월|     229|      14|     431|
| 2014년8월|     222|       9|     447|
| 2014년9월|     225|      15|     430|
|2014년10월|     210|      14|     392|
|2014년11월|     209|      10|     386|
|2014년12월|     178|      12|     329|
| 2015년1월|     205|       8|     375|
| 2015년2월|     175|       8|     375|
| 2015년3월|     237|       4|     492|
| 2015년4월|     227|      10|     486|
| 2015년5월|     231|       9|     504|
| 2015년6월|     207|       8|     401|
| 2015년7월|     273|       7|     556|
| 2015년8월|     268|      19|     534|
+----------+--------+--------+--------+
only showing top 20 rows

root
 |-- 구분: string (nullable = true)
 |-- 사고(건): integer (nulla

In [160]:
df.show()

+-------------------+--------+--------+--------+
|               구분|사고(건)|사망(명)|부상(명)|
+-------------------+--------+--------+--------+
|2014-01-01 00:00:00|     169|       7|     347|
|2014-02-01 00:00:00|     145|       3|     268|
|2014-03-01 00:00:00|     206|      12|     358|
|2014-04-01 00:00:00|     182|       4|     360|
|2014-05-01 00:00:00|     218|      17|     474|
|2014-06-01 00:00:00|     233|      13|     457|
|2014-07-01 00:00:00|     229|      14|     431|
|2014-08-01 00:00:00|     222|       9|     447|
|2014-09-01 00:00:00|     225|      15|     430|
|2014-10-01 00:00:00|     210|      14|     392|
|2014-11-01 00:00:00|     209|      10|     386|
|2014-12-01 00:00:00|     178|      12|     329|
|2015-01-01 00:00:00|     205|       8|     375|
|2015-02-01 00:00:00|     175|       8|     375|
|2015-03-01 00:00:00|     237|       4|     492|
|2015-04-01 00:00:00|     227|      10|     486|
|2015-05-01 00:00:00|     231|       9|     504|
|2015-06-01 00:00:00|     207| 

In [None]:
5. 2015년 대비  사망이 가장 많이 증가한 2016년 도 월을 구하시요


In [96]:
# 시계열 데이터로 변환
df = df.withColumn( '구분', F.to_timestamp( '구분', format='yyyy년MM월') )

In [248]:
# 2번
df.groupBy( F.year('구분'), F.month('구분') ).mean( '사망(명)').orderBy('year(구분)').show()

+----------+-----------+-------------+
|year(구분)|month(구분)|avg(사망(명))|
+----------+-----------+-------------+
|      2014|          6|         13.0|
|      2014|         11|         10.0|
|      2014|          1|          7.0|
|      2014|          8|          9.0|
|      2014|         10|         14.0|
|      2014|          2|          3.0|
|      2014|         12|         12.0|
|      2014|          4|          4.0|
|      2014|          5|         17.0|
|      2014|          9|         15.0|
|      2014|          3|         12.0|
|      2014|          7|         14.0|
|      2015|         10|         14.0|
|      2015|         11|          3.0|
|      2015|          7|          7.0|
|      2015|          9|         10.0|
|      2015|         12|          8.0|
|      2015|          4|         10.0|
|      2015|          8|         19.0|
|      2015|          3|          4.0|
+----------+-----------+-------------+
only showing top 20 rows



# 3번 해결 방법
- 4가지

In [233]:
# 3번
df.where( (F.year('구분') == '2015' ) & ( F.month('구분') <= '7' ) ).show()

+-------------------+--------+--------+--------+
|               구분|사고(건)|사망(명)|부상(명)|
+-------------------+--------+--------+--------+
|2015-01-01 00:00:00|     205|       8|     375|
|2015-02-01 00:00:00|     175|       8|     375|
|2015-03-01 00:00:00|     237|       4|     492|
|2015-04-01 00:00:00|     227|      10|     486|
|2015-05-01 00:00:00|     231|       9|     504|
|2015-06-01 00:00:00|     207|       8|     401|
|2015-07-01 00:00:00|     273|       7|     556|
+-------------------+--------+--------+--------+



In [255]:
# 3번
df.where( (df['구분'] >= '2015-01' ) & (df['구분'] < '2015-08' ) ).show()

+-------------------+--------+--------+--------+
|               구분|사고(건)|사망(명)|부상(명)|
+-------------------+--------+--------+--------+
|2015-01-01 00:00:00|     205|       8|     375|
|2015-02-01 00:00:00|     175|       8|     375|
|2015-03-01 00:00:00|     237|       4|     492|
|2015-04-01 00:00:00|     227|      10|     486|
|2015-05-01 00:00:00|     231|       9|     504|
|2015-06-01 00:00:00|     207|       8|     401|
|2015-07-01 00:00:00|     273|       7|     556|
+-------------------+--------+--------+--------+



In [259]:
# 3번
df.where( "`구분` between '2015-01' and '2015-08' ").show()

+-------------------+--------+--------+--------+
|               구분|사고(건)|사망(명)|부상(명)|
+-------------------+--------+--------+--------+
|2015-01-01 00:00:00|     205|       8|     375|
|2015-02-01 00:00:00|     175|       8|     375|
|2015-03-01 00:00:00|     237|       4|     492|
|2015-04-01 00:00:00|     227|      10|     486|
|2015-05-01 00:00:00|     231|       9|     504|
|2015-06-01 00:00:00|     207|       8|     401|
|2015-07-01 00:00:00|     273|       7|     556|
+-------------------+--------+--------+--------+



In [254]:
# 3번
df.where( "`구분` >= '2015-01' and `구분` < '2015-08' ").show()

+-------------------+--------+--------+--------+
|               구분|사고(건)|사망(명)|부상(명)|
+-------------------+--------+--------+--------+
|2015-01-01 00:00:00|     205|       8|     375|
|2015-02-01 00:00:00|     175|       8|     375|
|2015-03-01 00:00:00|     237|       4|     492|
|2015-04-01 00:00:00|     227|      10|     486|
|2015-05-01 00:00:00|     231|       9|     504|
|2015-06-01 00:00:00|     207|       8|     401|
|2015-07-01 00:00:00|     273|       7|     556|
+-------------------+--------+--------+--------+



# 성능 확인
- timemodule확인
- 보통 sql문 사용할 때 사용한다

In [258]:
# 4번
import time
start = time.time()

df.where(F.year('구분') == '2016').\
    withColumn( '사고 대비 사망율',df['사망(명)']/df['사고(건)']).\
    withColumn('사고 대비 사망율', round('사고 대비 사망율', 2)).show()

end = time.time()
print( end - start )

+-------------------+--------+--------+--------+----------------+
|               구분|사고(건)|사망(명)|부상(명)|사고 대비 사망율|
+-------------------+--------+--------+--------+----------------+
|2016-01-01 00:00:00|     192|       5|     387|            0.03|
|2016-02-01 00:00:00|     174|       6|     328|            0.03|
|2016-03-01 00:00:00|     217|       7|     435|            0.03|
|2016-04-01 00:00:00|     216|       7|     419|            0.03|
|2016-05-01 00:00:00|     239|      13|     522|            0.05|
|2016-06-01 00:00:00|     200|      12|     362|            0.06|
|2016-07-01 00:00:00|     227|       9|     460|            0.04|
|2016-08-01 00:00:00|     230|       7|     490|            0.03|
|2016-09-01 00:00:00|     187|      13|     347|            0.07|
|2016-10-01 00:00:00|     183|      10|     367|            0.05|
|2016-11-01 00:00:00|     200|       5|     418|            0.03|
|2016-12-01 00:00:00|     168|       4|     364|            0.02|
+-------------------+-------

In [None]:
# 5. 2015년 대비  사망이 가장 많이 증가한 2016년 도 월을 구하시요
df2.rdd.ma[( lambda x: x[''])]

# Window
- duration 
    - 시간, 분, 단위로 집계 가능
    - pandas resample보다는 불편하지만 구체적 시간 단위로 집계 가능

In [283]:
df1.show()

+-------+----------+-------------------+
|  Month|Passengers|             parsed|
+-------+----------+-------------------+
|1949-01|       112|1949-01-01 00:00:00|
|1949-02|       118|1949-02-01 00:00:00|
|1949-03|       132|1949-03-01 00:00:00|
|1949-04|       129|1949-04-01 00:00:00|
|1949-05|       121|1949-05-01 00:00:00|
|1949-06|       135|1949-06-01 00:00:00|
|1949-07|       148|1949-07-01 00:00:00|
|1949-08|       148|1949-08-01 00:00:00|
|1949-09|       136|1949-09-01 00:00:00|
|1949-10|       119|1949-10-01 00:00:00|
|1949-11|       104|1949-11-01 00:00:00|
|1949-12|       118|1949-12-01 00:00:00|
|1950-01|       115|1950-01-01 00:00:00|
|1950-02|       126|1950-02-01 00:00:00|
|1950-03|       141|1950-03-01 00:00:00|
|1950-04|       135|1950-04-01 01:00:00|
|1950-05|       125|1950-05-01 00:00:00|
|1950-06|       149|1950-06-01 00:00:00|
|1950-07|       170|1950-07-01 00:00:00|
|1950-08|       170|1950-08-01 00:00:00|
+-------+----------+-------------------+
only showing top

In [287]:
df1.groupBy( F.year('parsed') ).mean().show()

+------------+------------------+
|year(parsed)|   avg(Passengers)|
+------------+------------------+
|        1959| 428.3333333333333|
|        1955|             284.0|
|        1952|             197.0|
|        1956|            328.25|
|        1951|170.16666666666666|
|        1950|139.66666666666666|
|        1949|126.66666666666667|
|        1957| 368.4166666666667|
|        1960| 476.1666666666667|
|        1953|             225.0|
|        1958|             381.0|
|        1954|238.91666666666666|
+------------+------------------+



In [291]:
df1.groupBy( F.window('parsed', '1 day') ).mean().show()

+--------------------+---------------+
|              window|avg(Passengers)|
+--------------------+---------------+
|[1956-07-31 09:30...|          405.0|
|[1957-03-31 08:30...|          348.0|
|[1949-06-30 10:00...|          148.0|
|[1960-07-31 09:30...|          606.0|
|[1949-04-30 10:00...|          121.0|
|[1959-12-31 08:30...|          417.0|
|[1952-09-30 09:00...|          191.0|
|[1951-01-31 09:00...|          150.0|
|[1952-08-31 09:00...|          209.0|
|[1958-05-31 09:30...|          435.0|
|[1950-04-30 10:00...|          125.0|
|[1949-01-31 09:00...|          118.0|
|[1951-11-30 09:00...|          166.0|
|[1954-11-30 08:30...|          229.0|
|[1950-03-31 09:00...|          135.0|
|[1951-04-30 09:00...|          172.0|
|[1956-02-29 08:30...|          317.0|
|[1955-06-30 09:30...|          364.0|
|[1958-03-31 08:30...|          348.0|
|[1953-07-31 09:00...|          272.0|
+--------------------+---------------+
only showing top 20 rows



In [296]:
from pandas_datareader import data
samsung_df = data.get_data_yahoo('005930.KS', '2017/01/01')
samsung_df = samsung_df.reset_index()

In [297]:
sDF = sqlCtx.createDataFrame( samsung_df )
sDF.show(5)

+-------------------+-------+-------+-------+-------+----------+---------------+
|               Date|   High|    Low|   Open|  Close|    Volume|      Adj Close|
+-------------------+-------+-------+-------+-------+----------+---------------+
|2017-01-02 00:00:00|36240.0|35880.0|35980.0|36100.0| 4650600.0|  32702.4609375|
|2017-01-03 00:00:00|36620.0|36020.0|36280.0|36480.0| 7357650.0|   33046.703125|
|2017-01-04 00:00:00|36520.0|36100.0|36500.0|36160.0| 7971750.0|32756.814453125|
|2017-01-05 00:00:00|36060.0|35540.0|36060.0|35560.0|1.096745E7|32213.283203125|
|2017-01-06 00:00:00|36440.0|36040.0|36180.0|36200.0| 8880950.0| 32793.05078125|
+-------------------+-------+-------+-------+-------+----------+---------------+
only showing top 5 rows



In [298]:
sDF.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Adj Close: double (nullable = true)



In [302]:
sDF.groupBy( F.window('Date', '1 week')).mean('Close').orderBy('window').show()

+--------------------+------------------+
|              window|        avg(Close)|
+--------------------+------------------+
|[2016-12-29 09:00...|           36075.0|
|[2017-01-05 09:00...|           37548.0|
|[2017-01-12 09:00...|           37100.0|
|[2017-01-19 09:00...|           38544.0|
|[2017-01-26 09:00...|39313.333333333336|
|[2017-02-02 09:00...|           38928.0|
|[2017-02-09 09:00...|           37928.0|
|[2017-02-16 09:00...|           38788.0|
|[2017-02-23 09:00...|           38610.0|
|[2017-03-02 09:00...|           40060.0|
|[2017-03-09 09:00...|           41076.0|
|[2017-03-16 09:00...|           42224.0|
|[2017-03-23 09:00...|           41588.0|
|[2017-03-30 09:00...|           41740.0|
|[2017-04-06 09:00...|           41892.0|
|[2017-04-13 09:00...|           41252.0|
|[2017-04-20 09:00...|           42268.0|
|[2017-04-27 09:00...|45013.333333333336|
|[2017-05-04 09:00...|           46040.0|
|[2017-05-11 09:00...|           46116.0|
+--------------------+------------