# 데이터 재구성(재구조화)

## 1. 정돈된 데이터

### 1.1. Stack

In [1]:
import pandas as pd
import numpy as np

<img src='https://t1.daumcdn.net/cfile/tistory/25475A3A585FAAF42C'>

- stack [stæk]
- ~ (sth) (up) (깔끔하게 정돈하여) 쌓다[포개다]; 쌓이다, 포개지다
- ~ sth (with sth) (어떤 곳에 물건을 쌓아서) 채우다

In [2]:
# Load the wide-format fruit table, using the State column as the row index.
fruit = pd.read_csv('data/fruit.csv', index_col='State')
fruit

Unnamed: 0_level_0,Apple,Orange,Banana
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [3]:
# stack() moves the column labels into an inner index level, giving one long Series
# indexed by (State, fruit name).
fruit.stack()

State          
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [4]:
# reset_index() turns both index levels into ordinary columns; the unnamed pieces
# get the default labels 'level_1' and 0.
fruit_tidy = fruit.stack().reset_index()
fruit_tidy

Unnamed: 0,State,level_1,0
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [5]:
# Relabels fruit_tidy in place, so the frame displayed by the previous cell is changed too.
fruit_tidy.columns = ['state','fruit','weight'] # replace the default labels (State, level_1, 0) with descriptive ones
fruit_tidy

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


### 1.2. Melt

<img src='https://t1.daumcdn.net/cfile/tistory/25177F4E5863D58A0C'/>

In [6]:
# Reload the same file without index_col, so State stays an ordinary column.
# NOTE: this rebinds `fruit`; the State-indexed frame loaded earlier is replaced.
fruit = pd.read_csv('data/fruit.csv')
fruit

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [7]:
# id_vars are kept as identifier columns; every value_vars column is unpivoted into
# (variable, value) rows.
fruit.melt(id_vars=['State'], value_vars=['Apple','Orange','Banana'])

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [8]:
# var_name / value_name replace the default 'variable' / 'value' column labels.
fruit.melt(id_vars=['State'], value_vars=['Apple','Orange','Banana'], var_name='Fruit', value_name='weight')

Unnamed: 0,State,Fruit,weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


### 1.3. 스택된 데이터 되돌리기

In [9]:
# The wide table again; at this point State is an ordinary column and the index is 0..2.
fruit

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [10]:
# Because State is a regular column here, stack() folds it in with the fruit columns,
# so the resulting Series mixes strings and integers (dtype: object).
fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [11]:
fruit.stack().unstack()
# unstack() widens the data again: the innermost index level moves back out to the columns

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [12]:
# Long (tidy) form of the wide table: one row per (State, Fruit) pair with its weight.
fruit_tidy2 = fruit.melt(
    id_vars=['State'],
    value_vars=['Apple', 'Orange', 'Banana'],
    var_name='Fruit',
    value_name='weight',
)
fruit_tidy2

Unnamed: 0,State,Fruit,weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [24]:
# pivot() reverses the melt: one row per State, one column per Fruit.
# Row and column labels come back sorted alphabetically, not in the original order.
fruit_tidy2.pivot(index='State',columns='Fruit', values='weight')

Fruit,Apple,Banana,Orange
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arizona,9,12,7
Florida,0,190,14
Texas,12,40,10


In [25]:
# Same expression as earlier in this section, shown again as the starting point
# for the unstack(level=...) examples that follow.
fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [26]:
# Default unstack(): the innermost index level (the column names) goes back to the columns.
fruit.stack().unstack()

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [27]:
# level=0 unstacks the outer index level (the row numbers 0-2), so those become the columns.
fruit.stack().unstack(level=0)

Unnamed: 0,0,1,2
State,Texas,Arizona,Florida
Apple,12,9,0
Orange,10,7,14
Banana,40,12,190


In [28]:
# level=1 is the inner level of this two-level index, so the result matches the default unstack().
fruit.stack().unstack(level=1)

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


### 1.4. Groupby 후 unstacking

In [8]:
# Load the employee table and preview its first three rows.
employee = pd.read_csv('data/employee.csv')
employee.head(3)

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03


In [9]:
# Mean base salary per RACE; astype(int) drops the fractional part (truncates, does not round).
employee.groupby('RACE')['BASE_SALARY'].mean().astype(int)

RACE
American Indian or Alaskan Native    60272
Asian/Pacific Islander               61660
Black or African American            50137
Hispanic/Latino                      52345
Others                               51278
White                                64419
Name: BASE_SALARY, dtype: int64

In [11]:
# Mean base salary for every (RACE, GENDER) pair, truncated to whole numbers.
# The result is a Series with a two-level index.
agg = (
    employee
    .groupby(['RACE','GENDER'])['BASE_SALARY']
    .mean()
    .astype(int)
)
agg

RACE                               GENDER
American Indian or Alaskan Native  Female    60238
                                   Male      60305
Asian/Pacific Islander             Female    63226
                                   Male      61033
Black or African American          Female    48915
                                   Male      51082
Hispanic/Latino                    Female    46503
                                   Male      54782
Others                             Female    63785
                                   Male      38771
White                              Female    66793
                                   Male      63940
Name: BASE_SALARY, dtype: int64

In [12]:
# Move the inner index level (GENDER) to the columns: one row per RACE, one column per gender.
agg.unstack()

GENDER,Female,Male
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
American Indian or Alaskan Native,60238,60305
Asian/Pacific Islander,63226,61033
Black or African American,48915,51082
Hispanic/Latino,46503,54782
Others,63785,38771
White,66793,63940


In [13]:
# Mean, max and min base salary for every (RACE, GENDER) group, truncated to integers.
agg2 = (
    employee
    .groupby(['RACE','GENDER'])['BASE_SALARY']
    .agg(['mean','max','min'])
    .astype(int)
)
agg2

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,min
RACE,GENDER,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American Indian or Alaskan Native,Female,60238,98536,26125
American Indian or Alaskan Native,Male,60305,81239,26125
Asian/Pacific Islander,Female,63226,130416,26125
Asian/Pacific Islander,Male,61033,163228,27914
Black or African American,Female,48915,150416,24960
Black or African American,Male,51082,275000,26125
Hispanic/Latino,Female,46503,126115,26125
Hispanic/Latino,Male,54782,165216,26104
Others,Female,63785,63785,63785
Others,Male,38771,38771,38771


In [14]:
agg2.unstack()
# unstack() defaults to the innermost index level, which is GENDER here, so the
# explicit form agg2.unstack('GENDER') names the same level.

Unnamed: 0_level_0,mean,mean,max,max,min,min
GENDER,Female,Male,Female,Male,Female,Male
RACE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
American Indian or Alaskan Native,60238,60305,98536,81239,26125,26125
Asian/Pacific Islander,63226,61033,130416,163228,26125,27914
Black or African American,48915,51082,150416,275000,24960,26125
Hispanic/Latino,46503,54782,126115,165216,26125,26104
Others,63785,38771,63785,38771,63785,38771
White,66793,63940,178331,210588,27955,26125


### 1.5. 같은 셀에 여러 값이 저장된 경우

In [15]:
# Each Geolocation cell packs latitude and longitude (value plus hemisphere letter) into one string.
city = pd.read_csv('data/city.csv')
city

Unnamed: 0,City,Geolocation
0,Houston,"29.7604° N, 95.3698° W"
1,Dallas,"32.7767° N, 96.7970° W"
2,Austin,"30.2672° N, 97.7431° W"


In [16]:
# pat='. ' acts as a regular expression (any one character followed by a space).
# Without expand=True every row becomes a Python list of the pieces.
g = city.Geolocation.str.split(pat='. ')
g

0    [29.7604, N, 95.3698, W]
1    [32.7767, N, 96.7970, W]
2    [30.2672, N, 97.7431, W]
Name: Geolocation, dtype: object

In [17]:
g = city.Geolocation.str.split(pat='. ', expand=True)
# Split rule: pat is a regex in which '.' matches any single character and is followed
# by a literal space, so the text is cut at '° ' and ', '. expand=True spreads the
# pieces into separate columns instead of a list.
g

Unnamed: 0,0,1,2,3
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


# 날짜를 다루는 도구

## 1. Datetime 모듈

In [18]:
import datetime

In [19]:
# Build the full datetime once, then take its calendar-date and time-of-day parts.
dt = datetime.datetime(2013, 6, 7, 12, 30, 19, 463198)
date = dt.date()
time = dt.time()
# One value per line: date, then time, then the combined datetime.
print(date, time, dt, sep='\n')

2013-06-07
12:30:19.463198
2013-06-07 12:30:19.463198


In [20]:
# A duration of 2 weeks + 5 days (19 days in total) plus 10 h 20 min 6.73 s.
td = datetime.timedelta(
    weeks=2,
    days=5,
    hours=10,
    minutes=20,
    seconds=6.73,
)
print(td)

19 days, 10:20:06.730000


In [21]:
# Adding a timedelta to a date moves it by whole days only; the 10:20:06.73 part of td is ignored.
print(date + td)

2013-06-26


In [22]:
# With a datetime, the hours/minutes/seconds of td are applied as well.
print(dt + td)

2013-06-26 22:50:26.193198


## 2. Timestamp

### 2.1. 유연한 입력체계

In [23]:
# Build a Timestamp from individual date and time fields.
pd.Timestamp(year=2018, month=11, day=10, hour=9, minute=0, second=0)

Timestamp('2018-11-10 09:00:00')

In [24]:
# ISO-style date string; the time part defaults to midnight.
pd.Timestamp('2018-11-10')

Timestamp('2018-11-10 00:00:00')

In [25]:
# Slash-separated dates are accepted as well.
pd.Timestamp('2018/11/10')

Timestamp('2018-11-10 00:00:00')

In [26]:
# Even mixed separators parse to the same date.
pd.Timestamp('2018-11/10')

Timestamp('2018-11-10 00:00:00')

In [27]:
# Month-name formats are recognised too.
pd.Timestamp('Nov 10, 2018')

Timestamp('2018-11-10 00:00:00')

In [28]:
# A bare integer is read as nanoseconds since the Unix epoch (1970-01-01).
pd.Timestamp(500)

Timestamp('1970-01-01 00:00:00.000000500')

In [29]:
pd.Timestamp(1000000000000000000) # 1 followed by 18 zeros: 10**18 ns = 10**9 s after the epoch

Timestamp('2001-09-09 01:46:40')

In [30]:
# unit='D' makes the integer a number of days since the epoch instead of nanoseconds.
pd.Timestamp(500, unit='D')

Timestamp('1971-05-16 00:00:00')

### 2.2. to_datetime

In [31]:
# Given a single string, to_datetime returns a Timestamp just like the constructor does.
pd.to_datetime('2018-11-10')

Timestamp('2018-11-10 00:00:00')

In [32]:
# Given a Series of integers with unit='D', each value is a day count from 1970-01-01.
s = pd.Series([10, 100, 1000, 10000])
pd.to_datetime(s, unit='D')

0   1970-01-11
1   1970-04-11
2   1972-09-27
3   1997-05-19
dtype: datetime64[ns]

In [33]:
# Day-first dates that mix '-' and '/' separators; the last entry is not a valid date.
s = pd.Series(['12-5-2015','14-1-2013','20/12/2017','40/23/2017'])
# pandas >= 2.0 infers a single format from the first element, so relying on
# dayfirst=True alone turns the '/'-separated rows into NaT there. Normalising the
# separator and spelling out the day-first format gives the same result on every
# pandas version; errors='coerce' still maps the impossible date to NaT.
parsed = pd.to_datetime(s.str.replace('/', '-', regex=False), format='%d-%m-%Y', errors='coerce')
parsed

0   2015-05-12
1   2013-01-14
2   2017-12-20
3          NaT
dtype: datetime64[ns]

In [34]:
# The literal text in `format` has to match the input; only the %-directives are parsed
# (%b month abbreviation, %d day, %Y year, %I 12-hour clock, %M minute, %p AM/PM).
pd.to_datetime('Start Date: Nov 10, 2017 Start Time: 9:00 am', format='Start Date: %b %d, %Y Start Time: %I:%M %p')

Timestamp('2017-11-10 09:00:00')

In [35]:
# Sample Timestamp with fractional seconds, used by the attribute examples that follow.
ts = pd.Timestamp('2018-11-10 09:30:28.14')
ts

Timestamp('2018-11-10 09:30:28.140000')

In [36]:
# ceil('h') rounds up to the next whole hour.
ts.ceil('h')

Timestamp('2018-11-10 10:00:00')

In [37]:
# The calendar and clock fields are available as plain attributes.
ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second

(2018, 11, 10, 9, 30, 28)

In [38]:
# dayofweek counts from Monday = 0, so 5 means Saturday.
ts.dayofweek, ts.dayofyear, ts.daysinmonth

(5, 314, 30)

In [39]:
# Convert to a standard-library datetime.datetime object.
ts.to_pydatetime()

datetime.datetime(2018, 11, 10, 9, 30, 28, 140000)

In [40]:
# 123.4567 hours as a pandas Timedelta.
# NOTE: this rebinds `td`, which earlier held a datetime.timedelta.
td = pd.Timedelta(123.4567, unit='h')
td

Timedelta('5 days 03:27:24.120000')

In [41]:
# Round to the nearest minute (the 24.12 s remainder rounds down).
td.round('min')

Timedelta('5 days 03:27:00')

In [42]:
# Break the duration into days, hours, minutes, seconds and sub-second parts.
td.components

Components(days=5, hours=3, minutes=27, seconds=24, milliseconds=120, microseconds=0, nanoseconds=0)

In [43]:
# The whole duration expressed in seconds, as a float.
td.total_seconds()

444444.12

# Magic Commands

## 1. %run : 외부 코드 실행

In [44]:
# Run the script inside this session; per the %run help, the names it defines are
# loaded into the interactive namespace afterwards.
%run data/myscript.py

1 의 제곱은 1
2 의 제곱은 4
3 의 제곱은 9


- 외부 코드를 실행하면 그 안에 정의된 함수를 세션에서 사용 가능

In [55]:
# NOTE(review): `square` is not defined anywhere in this notebook; presumably it comes
# from data/myscript.py run via %run. It is called with no arguments, so the 25 must
# come from a default inside that script — verify against the script.
square()

25

## 2. 코드 실행 시간 측정

- %timeit : 한 줄짜리 코드의 실행 시간 측정

In [46]:
# Line magic: times only the statement on this line, repeating it to report mean ± std. dev.
%timeit L = [n ** 2 for n in range(1000)]

234 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


- %%timeit를 사용하면 여러 줄로 된 코드(셀 전체)의 실행 시간 측정 가능

In [47]:
%%timeit
# Cell magic: it has to stay on the first line of the cell; the lines below are timed together.
# Same computation as the list comprehension, written as an explicit append loop.
L = []
for n in range(1000):
    L.append(n**2)

269 µs ± 14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 3. 매직 커맨드 도움말 : ?, %magic, %lsmagic

- ? : 명령어 뒤에 ?를 붙이면 해당 명령어의 도움말 표시
- %magic : 매직 커맨드에 대한 일반적인 설명과 예제
- %lsmagic : 사용 가능한 매직 커맨드 목록

In [49]:
%run?

[0;31mDocstring:[0m
Run the named file inside IPython as a program.

Usage::

  %run [-n -i -e -G]
       [( -t [-N<N>] | -d [-b<N>] | -p [profile options] )]
       ( -m mod | file ) [args]

Parameters after the filename are passed as command-line arguments to
the program (put in sys.argv). Then, control returns to IPython's
prompt.

This is similar to running at a system prompt ``python file args``,
but with the advantage of giving you IPython's tracebacks, and of
loading all variables into your interactive namespace for further use
(unless -p is used, see below).

The file is executed in a namespace initially consisting only of
``__name__=='__main__'`` and sys.argv constructed as indicated. It thus
sees its environment as if it were being run as a stand-alone program
(except for sharing global objects such as previously imported
modules). But after execution, the IPython interactive namespace gets
updated with all variables defined in the program (except for __name__
and sys.argv)

In [48]:
%xmode?

[0;31mDocstring:[0m
Switch modes for the exception handlers.

Valid modes: Plain, Context, Verbose, and Minimal.

If called without arguments, acts as a toggle.
[0;31mFile:[0m      /opt/anaconda3/lib/python3.7/site-packages/IPython/core/magics/basic.py


## 4. %history : 이전에 실행된 명령

In [50]:
# -n prefixes each entry with its input number; 1-4 selects inputs 1 through 4 of this session.
%history -n 1-4

   1:
import pandas as pd
import numpy as np
   2:
fruit = pd.read_csv('data/fruit.csv', index_col='State')
fruit
   3: fruit.stack()
   4:
fruit_tidy = fruit.stack().reset_index()
fruit_tidy


## 5. automagic 함수 : %없이도 사용가능

- %cd : 폴더 이동
- %ls : 폴더 내에 모든 파일 리스트
- %mkdir : 폴더 생성
- %mv : 이동
- %cp : 복사
- %pwd : 현재 작업 폴더
- %rm : 삭제
- %rmdir : 폴더 삭제

In [51]:
pwd

'/Users/HumanRevolution/jupytercreation/class'

In [52]:
ls

1.txt
200408_class02.ipynb
20200408_과제.ipynb
Aggregation_practice(풀이).ipynb
[31mAggregation_practice_조성혁1.ipynb[m[m*
DATA_SETS.ipynb
Untitled.ipynb
[31mchapter10_시각화1.ipynb[m[m*
[31mchapter11_MachineLearning2.ipynb[m[m*
chapter12_0410.ipynb
chapter12_dataProcessing4(merge).ipynb
[31mchapter13_MachineLearning3(Ensemble).ipynb[m[m*
chapter14_dataProcessing5(rearrange).ipynb
[31mchapter1_0330.ipynb[m[m*
[31mchapter2_0330.ipynb[m[m*
[31mchapter3_0331.ipynb[m[m*
[31mchapter4_0401.ipynb[m[m*
chapter5_ML_intro.ipynb
[31mchapter6_0406.ipynb[m[m*
chapter6_dataProcessing1_0406.ipynb
[31mchapter7_0407.ipynb[m[m*
chapter7_dataProcessing2.ipynb
chapter8_MachineLearning1.ipynb
chapter9_0409.ipynb
[31mchapter9_dataProcessiing3(agg).ipynb[m[m*
[34mdata[m[m/
[31mgame.ipynb[m[m*
[34mimages[m[m/
ing.ipynb
mydata.txt
pep_target_조성혁.csv
project1.ipynb
project2.ipynb
score.csv
[30m[43mslides[m[m/
sub.csv
test.log
[31mturtle.ipynb[m[m*
과제.ipynb
[31m과제1_

- 추가적으로 알고싶으면
<a href='https://ipython.readthedocs.io/en/stable/interactive/magics.html'>IPython 공식 문서: Built-in magic commands(클릭)</a>