# DataFrame

pandas에서 제공하는 가장 중요한 데이터 구조 중 하나로, 2차원 데이터 구조이다.<br>
행과 열로 구성된 테이블 형태의 데이터를 다룬다.<br>
엑셀 시트나 SQL 테이블과 유사한 형태로 데이터를 처리한다.
- 데이터프레임의 주요 특징:<br>
  - 행과 열로 이루어진 2차원 데이터 구조
  - 각 열은 서로 다른 데이터 타입을 가질 수 있음
  - 인덱스: 각 행에는 고유한 인덱스가 있으며, 인덱스를 통해 데이터를 빠르게 검색, 접근 가능
  - 데이터 분석, 변형, 정리 등 다양한 작업을 쉽게 할 수 있음

In [1]:
import pandas as pd

In [2]:
# Column-oriented sample data: each key becomes a column label and each list
# holds that column's values, one entry per row.
data_dic = dict(
    name=["John", "Anna", "Peter", "Linda"],
    Age=[28, 39, 49, 21],
    City=["New York", "Paris", "Berlin", "London"],
)

# Build the DataFrame; with no index given, rows get the default 0..3 index.
df = pd.DataFrame(data=data_dic)
df

Unnamed: 0,name,Age,City
0,John,28,New York
1,Anna,39,Paris
2,Peter,49,Berlin
3,Linda,21,London


In [3]:
# Show only the first two rows of the frame.
df.head(n=2)

Unnamed: 0,name,Age,City
0,John,28,New York
1,Anna,39,Paris


In [4]:
# Show only the last two rows of the frame.
df.tail(n=2)

Unnamed: 0,name,Age,City
2,Peter,49,Berlin
3,Linda,21,London


In [5]:
# shape is a (number of rows, number of columns) tuple.
print(df.shape)

(4, 3)


In [6]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


### 다양한 외부파일 읽어오기

In [7]:
# pd.read_csv reads a CSV file into a DataFrame (writing one is done with to_csv).

stock_df = pd.read_csv("../00_data/stock_daily_prices.csv")
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [8]:
# Read tables from an HTML document.
# pandas.read_html works on <table> tags and returns a list of DataFrames,
# one per table it finds.
# Reading HTML requires a parser library; lxml is the one installed here:
# pip3 install lxml

import requests
from io import StringIO


# Download the page with TLS certificate verification left ON (the requests
# default). Passing verify=False would accept any certificate and expose the
# download to tampering, so it is intentionally not used.
url = "https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%98%81%ED%99%94_%ED%9D%A5%ED%96%89_%EA%B8%B0%EB%A1%9D"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
# Fail with a clear HTTPError on 4xx/5xx instead of handing an error page to
# read_html, which would otherwise report a confusing parsing failure.
response.raise_for_status()
# Despite its name, movies_df is a list of DataFrames (one per HTML table).
movies_df = pd.read_html(StringIO(response.text))


movies_df



[    순위            제목     감독      한국내 배급사         개봉일       관객수  \
 0    1          《명량》    김한민       CJ E&M  2014-07-30  17616141   
 1    2        《극한직업》    이병헌     CJ엔터테인먼트  2019-01-23  16266480   
 2    3  《신과함께: 죄와 벌》    김용화     롯데엔터테인먼트  2017-12-20  14414658   
 3    4        《국제시장》    윤제균       CJ E&M  2014-12-17  14265222   
 4    5  《어벤져스: 엔드게임》  루소 형제  월트디즈니컴퍼니코리아  2019-04-24  13977602   
 ..  ..           ...    ...          ...         ...       ...   
 62  63         《베를린》    류승완       CJ E&M  2013-01-30   7166688   
 63  64         《마스터》    조의석       CJ E&M  2016-12-21   7150586   
 64  65          《터널》    김성훈          쇼박스  2016-08-10   7120780   
 65  66        《어벤져스》  조스 휘던  월트디즈니컴퍼니코리아  2012-04-26   7087971   
 66  67      《인천상륙작전》    이재한       CJ E&M  2016-07-27   7051660   
 
                  기타  
 0   영화진흥위원회 발권통계 기준  
 1   영화진흥위원회 발권통계 기준  
 2   영화진흥위원회 발권통계 기준  
 3   영화진흥위원회 발권통계 기준  
 4   영화진흥위원회 발권통계 기준  
 ..              ...  
 62  영화진흥위원회 발권통계 기준  
 63  영화진흥위

In [9]:
# read_html returned a list of tables; keep the first one as a DataFrame.
box_office = movies_df[0]
box_office

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,기타
0,1,《명량》,김한민,CJ E&M,2014-07-30,17616141,영화진흥위원회 발권통계 기준
1,2,《극한직업》,이병헌,CJ엔터테인먼트,2019-01-23,16266480,영화진흥위원회 발권통계 기준
2,3,《신과함께: 죄와 벌》,김용화,롯데엔터테인먼트,2017-12-20,14414658,영화진흥위원회 발권통계 기준
3,4,《국제시장》,윤제균,CJ E&M,2014-12-17,14265222,영화진흥위원회 발권통계 기준
4,5,《어벤져스: 엔드게임》,루소 형제,월트디즈니컴퍼니코리아,2019-04-24,13977602,영화진흥위원회 발권통계 기준
...,...,...,...,...,...,...,...
62,63,《베를린》,류승완,CJ E&M,2013-01-30,7166688,영화진흥위원회 발권통계 기준
63,64,《마스터》,조의석,CJ E&M,2016-12-21,7150586,영화진흥위원회 발권통계 기준
64,65,《터널》,김성훈,쇼박스,2016-08-10,7120780,영화진흥위원회 발권통계 기준
65,66,《어벤져스》,조스 휘던,월트디즈니컴퍼니코리아,2012-04-26,7087971,영화진흥위원회 발권통계 기준


### 외부 파일로 내보내기

In [10]:
# Write the frame to a CSV file. No index argument is given, so the pandas
# default (index=True) also writes the row index as the first column.
box_office.to_csv("../00_out/box_office.csv")

### 인덱스 설정하기

In [11]:
# Show the frame as loaded (default integer index) before changing its index.
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [12]:
# Promote the Date column to the row index, modifying stock_df in place.
stock_df.set_index(keys="Date", inplace=True)
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [13]:
# Reset the index in place: Date goes back to being a regular column
# (drop=False) and the rows get the default integer index again.

stock_df.reset_index(drop=False, inplace=True)
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [14]:
# Choose the index column while reading the file.
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv", index_col="Date")
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


### index 참조

In [15]:
# Reload the CSV with the default integer index (Date is a regular column again).
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv")
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [16]:
# Select a single column from the DataFrame; the result is a Series.
sample = stock_df["AAPL"]
sample

0        60.198570
1        59.972858
2        60.671429
3        61.301430
4        61.107143
           ...    
2154    440.250000
2155    455.609985
2156    444.450012
2157    450.910004
2158    437.500000
Name: AAPL, Length: 2159, dtype: float64

In [17]:
# Attribute-style access returns the same column as stock_df["AAPL"].
stock_df.AAPL

0        60.198570
1        59.972858
2        60.671429
3        61.301430
4        61.107143
           ...    
2154    440.250000
2155    455.609985
2156    444.450012
2157    450.910004
2158    437.500000
Name: AAPL, Length: 2159, dtype: float64

In [18]:
# Select several columns by passing a list of column labels.
stock_df[["BA", "T"]]

Unnamed: 0,BA,T
0,75.510002,30.120001
1,74.599998,30.070000
2,75.239998,30.250000
3,75.059998,30.330000
4,75.559998,30.420000
...,...,...
2154,174.279999,29.850000
2155,172.199997,29.840000
2156,170.020004,30.020000
2157,179.410004,30.200001


In [19]:
# A slice inside [] selects rows: here the first two rows (0 and 1).
stock_df[0:2]

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.19857,75.510002,30.120001,12.13,175.929993,180.550003,28.25,313.644379,1295.5
1,2012-01-13,59.972858,74.599998,30.07,12.35,178.419998,179.160004,22.790001,311.328064,1289.089966


### 데이터프레임 열 추가/삭제

In [20]:
import numpy as np

In [21]:
# Reload the CSV with the default integer index before adding and removing columns.
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv")
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [22]:
# Draw one uniform random value in [0, 1) per row. The array is sized from the
# frame itself rather than a hard-coded 2159, so the column assignment still
# matches the row count if the CSV gains or loses rows.
add_list = np.random.rand(len(stock_df))
# Assigning to a new label appends the column at the right edge of the frame.
stock_df["random"] = add_list
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,random
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000,0.189707
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966,0.840964
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044,0.903968
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039,0.106015
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000,0.711498
...,...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020,0.563401
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912,0.844496
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029,0.709374
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971,0.651195


In [23]:
# insert() places the new column at position loc (0 = first column) and
# modifies stock_df in place.
stock_df.insert(loc=0, column="ionq", value=add_list)
stock_df

Unnamed: 0,ionq,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,random
0,0.189707,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000,0.189707
1,0.840964,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966,0.840964
2,0.903968,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044,0.903968
3,0.106015,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039,0.106015
4,0.711498,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000,0.711498
...,...,...,...,...,...,...,...,...,...,...,...,...
2154,0.563401,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020,0.563401
2155,0.844496,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912,0.844496
2156,0.709374,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029,0.709374
2157,0.651195,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971,0.651195


In [24]:
# Delete a column in place with the del statement.
del stock_df["random"]
stock_df

Unnamed: 0,ionq,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,0.189707,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,0.840964,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,0.903968,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,0.106015,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,0.711498,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...,...
2154,0.563401,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,0.844496,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,0.709374,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,0.651195,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [25]:
# Remove columns with drop(); columns=[...] is the equivalent of passing
# labels=[...] together with axis=1. inplace=True modifies stock_df itself.
stock_df.drop(columns=["Date", "ionq"], inplace=True)
stock_df

Unnamed: 0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2154,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [26]:
# pop() -> removes the column from the frame and returns it as a Series.
sp500 = stock_df.pop("sp500")
print(sp500)
stock_df

0       1295.500000
1       1289.089966
2       1293.670044
3       1308.040039
4       1314.500000
           ...     
2154    3327.770020
2155    3349.159912
2156    3351.280029
2157    3360.469971
2158    3333.689941
Name: sp500, Length: 2159, dtype: float64


Unnamed: 0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG
0,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379
1,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064
2,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364
3,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285
4,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851
...,...,...,...,...,...,...,...,...
2154,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985
2155,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976
2156,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990
2157,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976


### 영화 데이터 작업

In [27]:
# Show the box-office table before modifying its columns.
box_office

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,기타
0,1,《명량》,김한민,CJ E&M,2014-07-30,17616141,영화진흥위원회 발권통계 기준
1,2,《극한직업》,이병헌,CJ엔터테인먼트,2019-01-23,16266480,영화진흥위원회 발권통계 기준
2,3,《신과함께: 죄와 벌》,김용화,롯데엔터테인먼트,2017-12-20,14414658,영화진흥위원회 발권통계 기준
3,4,《국제시장》,윤제균,CJ E&M,2014-12-17,14265222,영화진흥위원회 발권통계 기준
4,5,《어벤져스: 엔드게임》,루소 형제,월트디즈니컴퍼니코리아,2019-04-24,13977602,영화진흥위원회 발권통계 기준
...,...,...,...,...,...,...,...
62,63,《베를린》,류승완,CJ E&M,2013-01-30,7166688,영화진흥위원회 발권통계 기준
63,64,《마스터》,조의석,CJ E&M,2016-12-21,7150586,영화진흥위원회 발권통계 기준
64,65,《터널》,김성훈,쇼박스,2016-08-10,7120780,영화진흥위원회 발권통계 기준
65,66,《어벤져스》,조스 휘던,월트디즈니컴퍼니코리아,2012-04-26,7087971,영화진흥위원회 발권통계 기준


In [28]:
# Drop the remarks column from the table in place.
box_office.drop(columns="기타", inplace=True)
# Derive a revenue column as admissions x 15000.
# NOTE(review): 15000 is presumably a flat ticket price in KRW - confirm.
box_office["매출"] = box_office["관객수"] * 15000
box_office

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,매출
0,1,《명량》,김한민,CJ E&M,2014-07-30,17616141,264242115000
1,2,《극한직업》,이병헌,CJ엔터테인먼트,2019-01-23,16266480,243997200000
2,3,《신과함께: 죄와 벌》,김용화,롯데엔터테인먼트,2017-12-20,14414658,216219870000
3,4,《국제시장》,윤제균,CJ E&M,2014-12-17,14265222,213978330000
4,5,《어벤져스: 엔드게임》,루소 형제,월트디즈니컴퍼니코리아,2019-04-24,13977602,209664030000
...,...,...,...,...,...,...,...
62,63,《베를린》,류승완,CJ E&M,2013-01-30,7166688,107500320000
63,64,《마스터》,조의석,CJ E&M,2016-12-21,7150586,107258790000
64,65,《터널》,김성훈,쇼박스,2016-08-10,7120780,106811700000
65,66,《어벤져스》,조스 휘던,월트디즈니컴퍼니코리아,2012-04-26,7087971,106319565000


### loc: 데이터프레임에서 라벨 기반 요소 선택

In [29]:
# Reload the CSV with the Date column as the index, so rows carry date labels.
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv", index_col="Date")
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [30]:
# Select a single row by its index label; the result is a Series.
stock_df.loc["2012-01-19"]

AAPL       61.107143
BA         75.559998
T          30.420000
MGM        12.800000
AMZN      194.449997
IBM       180.520004
TSLA       26.760000
GOOG      318.590851
sp500    1314.500000
Name: 2012-01-19, dtype: float64

In [31]:
# Label-based slice: unlike positional slicing, both endpoints are included.
stock_df.loc["2012-01-13":"2012-01-19"]

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-13,59.972858,74.599998,30.07,12.35,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.25,12.25,181.660004,180.0,26.6,313.116364,1293.670044
2012-01-18,61.30143,75.059998,30.33,12.73,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.42,12.8,194.449997,180.520004,26.76,318.590851,1314.5


In [32]:
# Open-ended label slice: every row from the start up to and including the label.
stock_df.loc[:"2012-01-19"]


Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.19857,75.510002,30.120001,12.13,175.929993,180.550003,28.25,313.644379,1295.5
2012-01-13,59.972858,74.599998,30.07,12.35,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.25,12.25,181.660004,180.0,26.6,313.116364,1293.670044
2012-01-18,61.30143,75.059998,30.33,12.73,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.42,12.8,194.449997,180.520004,26.76,318.590851,1314.5


In [33]:
# Select specific rows by passing a list of index labels.
stock_df.loc[["2012-01-12", "2012-01-17", "2012-01-19", "2012-01-25"]]

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.19857,75.510002,30.120001,12.13,175.929993,180.550003,28.25,313.644379,1295.5
2012-01-17,60.671429,75.239998,30.25,12.25,181.660004,180.0,26.6,313.116364,1293.670044
2012-01-19,61.107143,75.559998,30.42,12.8,194.449997,180.520004,26.76,318.590851,1314.5
2012-01-25,63.808571,75.82,30.209999,13.11,187.800003,191.729996,27.969999,283.681702,1326.060059


### sample()

In [34]:
# sample() -> draw a random subset of the data
stock_df.sample(n=5, axis=0) # n = number of items to draw, axis selects rows or columns (0=row, 1=column)

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-10-08,109.5,139.039993,33.400002,21.139999,533.159973,152.279999,226.720001,639.159973,2013.430054
2016-09-01,106.730003,129.899994,40.959999,24.65,770.619995,159.539993,200.770004,768.780029,2170.860107
2016-06-13,97.339996,129.919998,40.189999,24.120001,715.23999,151.279999,217.869995,718.359985,2079.060059
2015-09-25,114.709999,131.009995,32.330002,19.76,524.25,145.419998,256.910004,611.969971,1931.339966
2012-08-09,88.675713,74.279999,37.23,9.62,234.059998,198.419998,29.41,319.975647,1402.800049


In [35]:
# Draw a random 30% of all rows.

stock_df.sample(frac=0.3, axis=0)

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-04-26,59.599998,92.849998,37.040001,13.520000,254.809998,194.309998,51.200001,399.213654,1582.239990
2016-11-29,111.459999,151.639999,39.480000,28.870001,762.520020,163.529999,189.570007,770.840027,2204.659912
2012-02-14,72.779999,75.559998,30.070000,14.500000,191.300003,192.220001,33.169998,303.741516,1350.500000
2018-12-19,160.889999,319.549988,29.820000,24.510000,1495.079956,116.430000,332.970001,1023.010010,2506.959961
2017-10-16,159.880005,259.750000,36.169998,29.690001,1006.340027,146.830002,350.600006,992.000000,2557.639893
...,...,...,...,...,...,...,...,...,...
2016-10-14,117.629997,133.500000,39.220001,25.750000,822.960022,154.449997,196.509995,778.530029,2132.979980
2012-12-21,74.190002,76.169998,33.669998,11.810000,256.920013,193.419998,34.000000,356.478821,1430.150024
2014-03-14,74.955711,123.110001,32.490002,26.270000,373.739990,182.210007,230.970001,584.210266,1841.130005
2019-07-22,207.220001,373.420013,32.130001,28.350000,1985.630005,149.740005,255.679993,1138.069946,2985.030029


### iloc: 데이터프레임에서 정수 위치를 기반으로 요소 선택

In [36]:
# Show the Date-indexed frame before selecting rows by integer position.
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [37]:
# Select one row by integer position (position 2 is the third row).
stock_df.iloc[2]

AAPL       60.671429
BA         75.239998
T          30.250000
MGM        12.250000
AMZN      181.660004
IBM       180.000000
TSLA       26.600000
GOOG      313.116364
sp500    1293.670044
Name: 2012-01-17, dtype: float64

In [38]:
# Positional slice: rows at positions 2, 3 and 4 (the end position 5 is excluded).
stock_df.iloc[2:5]

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-17,60.671429,75.239998,30.25,12.25,181.660004,180.0,26.6,313.116364,1293.670044
2012-01-18,61.30143,75.059998,30.33,12.73,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.42,12.8,194.449997,180.520004,26.76,318.590851,1314.5


In [39]:
# Select the rows at the listed integer positions.
stock_df.iloc[[2, 3, 5, 10]]

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-17,60.671429,75.239998,30.25,12.25,181.660004,180.0,26.6,313.116364,1293.670044
2012-01-18,61.30143,75.059998,30.33,12.73,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-20,60.042858,75.519997,30.51,12.64,190.929993,188.520004,26.6,291.900879,1315.380005
2012-01-27,63.897144,74.550003,29.16,13.19,195.369995,190.460007,29.33,288.907104,1316.329956


In [40]:
# Row at position 4, restricted to the first three columns (positions 0 to 2).
stock_df.iloc[4, 0:3]

AAPL    61.107143
BA      75.559998
T       30.420000
Name: 2012-01-19, dtype: float64

### 데이터프레임 연산 수행

In [41]:
# Show the frame before adding computed columns.
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [42]:
# Add 1000 to every value of column T and store the result in a new column.
stock_df["T+1000"] = stock_df["T"] + 1000
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,T+1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000,1030.120001
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966,1030.070000
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044,1030.250000
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039,1030.330000
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000,1030.420000
...,...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020,1029.850000
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912,1029.840000
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029,1030.020000
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971,1030.200001


In [43]:
# Series.add(1000) adds 1000 to every element of "T" -- the method form of
# `stock_df["T"] + 1000`; the result is stored in a new column "T2".
stock_df["T2"] = stock_df["T"].add(1000)
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,T+1000,T2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000,1030.120001,1030.120001
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966,1030.070000,1030.070000
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044,1030.250000,1030.250000
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039,1030.330000,1030.330000
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000,1030.420000,1030.420000
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020,1029.850000,1029.850000
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912,1029.840000,1029.840000
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029,1030.020000,1030.020000
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971,1030.200001,1030.200001


In [44]:
# mul() -> multiplies every value of the column by the given factor
# NOTE: the result overwrites "sp500" itself, so re-running this cell
# multiplies the already-scaled values by 1.3 again.

stock_df["sp500"] = stock_df["sp500"].mul(1.3)
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,T+1000,T2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1684.150000,1030.120001,1030.120001
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1675.816956,1030.070000,1030.070000
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1681.771057,1030.250000,1030.250000
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1700.452051,1030.330000,1030.330000
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1708.850000,1030.420000,1030.420000
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,4326.101026,1029.850000,1029.850000
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,4353.907886,1029.840000,1029.840000
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,4356.664038,1030.020000,1030.020000
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,4368.610962,1030.200001,1030.200001


In [45]:
# iloc addresses cells by integer position: row position 4, column position 2
# is set to 0. In the frame displayed below, that cell is the "T" value of the
# 2012-01-19 row.
stock_df.iloc[4, 2] = 0
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,T+1000,T2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1684.150000,1030.120001,1030.120001
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1675.816956,1030.070000,1030.070000
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1681.771057,1030.250000,1030.250000
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1700.452051,1030.330000,1030.330000
2012-01-19,61.107143,75.559998,0.000000,12.800000,194.449997,180.520004,26.760000,318.590851,1708.850000,1030.420000,1030.420000
...,...,...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,4326.101026,1029.850000,1029.850000
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,4353.907886,1029.840000,1029.840000
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,4356.664038,1030.020000,1030.020000
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,4368.610962,1030.200001,1030.200001


### 콜백 함수 (apply)

In [46]:
# Reload the CSV from disk, using the "Date" column as the row index.
# Rebinding stock_df discards the columns and edits made to the previous frame.
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv", index_col="Date")
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [47]:
def increment(balance):
    """Return ``balance`` multiplied by 1.1 (roughly a 10% increase)."""
    growth_factor = 1.1
    return balance * growth_factor

In [48]:
# apply() calls increment once for each value in column "T" and collects the
# return values into a new Series, stored here as the "increment" column.
stock_df["increment"] = stock_df["T"].apply(increment)
stock_df

Unnamed: 0_level_0,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500,increment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000,33.132001
2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966,33.077000
2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044,33.275000
2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039,33.363000
2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000,33.462000
...,...,...,...,...,...,...,...,...,...,...
2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020,32.835000
2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912,32.824000
2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029,33.022000
2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971,33.220001


### 필터링 옵션

In [49]:
# Reload the CSV without index_col: "Date" stays an ordinary column and the
# rows get the default integer index. The filters in this section select on
# the "Date" column.
stock_df = pd.read_csv("../00_data/stock_daily_prices.csv")
stock_df

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
0,2012-01-12,60.198570,75.510002,30.120001,12.130000,175.929993,180.550003,28.250000,313.644379,1295.500000
1,2012-01-13,59.972858,74.599998,30.070000,12.350000,178.419998,179.160004,22.790001,311.328064,1289.089966
2,2012-01-17,60.671429,75.239998,30.250000,12.250000,181.660004,180.000000,26.600000,313.116364,1293.670044
3,2012-01-18,61.301430,75.059998,30.330000,12.730000,189.440002,181.070007,26.809999,315.273285,1308.040039
4,2012-01-19,61.107143,75.559998,30.420000,12.800000,194.449997,180.520004,26.760000,318.590851,1314.500000
...,...,...,...,...,...,...,...,...,...,...
2154,2020-08-05,440.250000,174.279999,29.850000,16.719999,3205.030029,125.449997,1485.020020,1473.609985,3327.770020
2155,2020-08-06,455.609985,172.199997,29.840000,18.459999,3225.000000,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.020000,19.030001,3167.459961,124.959999,1452.709961,1494.489990,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.650000,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [50]:
# Select the rows where Apple's price (AAPL) is at or above 440 dollars.

aapl_at_least_440 = stock_df["AAPL"].ge(440)
apple = stock_df[aapl_at_least_440]
apple

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
2154,2020-08-05,440.25,174.279999,29.85,16.719999,3205.030029,125.449997,1485.02002,1473.609985,3327.77002
2155,2020-08-06,455.609985,172.199997,29.84,18.459999,3225.0,126.120003,1489.579956,1500.099976,3349.159912
2156,2020-08-07,444.450012,170.020004,30.02,19.030001,3167.459961,124.959999,1452.709961,1494.48999,3351.280029
2157,2020-08-10,450.910004,179.410004,30.200001,21.65,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [51]:
# Select the row(s) whose "Date" equals 2020-08-05.
stock_df[stock_df["Date"].eq("2020-08-05")]

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
2154,2020-08-05,440.25,174.279999,29.85,16.719999,3205.030029,125.449997,1485.02002,1473.609985,3327.77002


In [52]:
# isin() -> returns a boolean Series that is True where the column's value
# is one of the listed values; used here as a row filter.

mask = stock_df["Date"].isin(["2020-08-05", "2020-08-10"])
stock_df[mask]

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
2154,2020-08-05,440.25,174.279999,29.85,16.719999,3205.030029,125.449997,1485.02002,1473.609985,3327.77002
2157,2020-08-10,450.910004,179.410004,30.200001,21.65,3148.159912,127.110001,1418.569946,1496.099976,3360.469971


In [53]:
# between() -> range test; returns True where the value lies between the two
# bounds. The first bound is included (the 2012-01-19 row appears in the
# output).
# NOTE(review): "Date" was loaded without date parsing, so this is presumably
# a string comparison; it works because yyyy-mm-dd strings sort
# chronologically -- confirm the column dtype.

stock_df[stock_df["Date"].between("2012-01-19", "2013-01-19")]

Unnamed: 0,Date,AAPL,BA,T,MGM,AMZN,IBM,TSLA,GOOG,sp500
4,2012-01-19,61.107143,75.559998,30.420000,12.80,194.449997,180.520004,26.760000,318.590851,1314.500000
5,2012-01-20,60.042858,75.519997,30.510000,12.64,190.929993,188.520004,26.600000,291.900879,1315.380005
6,2012-01-23,61.058571,75.510002,30.400000,13.14,186.089996,189.979996,26.770000,291.666748,1316.000000
7,2012-01-24,60.058571,75.360001,30.090000,13.16,187.000000,191.929993,27.420000,289.380341,1314.650024
8,2012-01-25,63.808571,75.820000,30.209999,13.11,187.800003,191.729996,27.969999,283.681702,1326.060059
...,...,...,...,...,...,...,...,...,...,...
251,2013-01-14,71.678574,76.550003,34.020000,12.75,272.730011,192.619995,33.259998,360.274597,1470.680054
252,2013-01-15,69.417145,76.940002,33.759998,13.15,271.899994,192.500000,33.900002,361.111481,1472.339966
253,2013-01-16,72.298569,74.339996,33.259998,12.98,268.929993,192.589996,34.099998,356.259644,1472.630005
254,2013-01-17,71.811432,75.260002,33.200001,12.96,270.480011,193.649994,34.380001,354.331879,1480.939941


In [54]:
# duplicated() -> flags values that also occur elsewhere in the Series.
# keep=False marks EVERY member of a duplicate group as True (not only the
# later occurrences); values that occur exactly once are False.
# AAPL is rounded first, so prices that round to the same whole number count
# as duplicates.

# Compute the duplicate flags once and reuse them for both views.
mask_true = stock_df["AAPL"].round().duplicated(keep=False)
print(stock_df[mask_true])

# The complement: rows whose rounded AAPL price is unique.
mask = ~mask_true
print(stock_df[mask])

            Date        AAPL          BA          T        MGM         AMZN  \
0     2012-01-12   60.198570   75.510002  30.120001  12.130000   175.929993   
1     2012-01-13   59.972858   74.599998  30.070000  12.350000   178.419998   
2     2012-01-17   60.671429   75.239998  30.250000  12.250000   181.660004   
3     2012-01-18   61.301430   75.059998  30.330000  12.730000   189.440002   
4     2012-01-19   61.107143   75.559998  30.420000  12.800000   194.449997   
...          ...         ...         ...        ...        ...          ...   
2138  2020-07-14  388.230011  179.960007  29.959999  16.160000  3084.000000   
2141  2020-07-17  385.309998  175.660004  30.250000  16.809999  2961.969971   
2143  2020-07-21  388.000000  178.630005  30.250000  16.690001  3138.290039   
2148  2020-07-28  373.010010  170.839996  29.690001  15.410000  3000.330078   
2150  2020-07-30  384.760010  161.949997  29.570000  16.639999  3051.879883   

             IBM         TSLA         GOOG        s

In [55]:
# Load the order data set and display it.
order_df = pd.read_csv("../00_data/order_data.csv")
order_df

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001,John Doe,Laptop,1,1000,2024-01-01,New York,Shipped
1,1002,Anna Smith,Smartphone,2,500,2024-01-02,Los Angeles,Pending
2,1003,Peter Parker,Headphones,1,200,2024-01-03,Chicago,Shipped
3,1004,Bruce Wayne,Monitor,3,300,2024-01-04,Houston,Shipped
4,1005,Clark Kent,Keyboard,4,50,2024-01-05,Phoenix,Pending
5,1006,Diana Prince,Mouse,2,30,2024-01-06,Philadelphia,Delivered
6,1007,Tony Stark,Tablet,1,250,2024-01-07,San Antonio,Delivered
7,1008,Steve Rogers,Smartwatch,1,150,2024-01-08,San Diego,Shipped
8,1009,Natasha Romanoff,Camera,1,600,2024-01-09,Dallas,Pending
9,1010,Wanda Maximoff,Printer,2,100,2024-01-10,San Jose,Delivered


In [56]:
# drop_duplicates() -> removes rows whose "Customer_Name" repeats an earlier
# row's value. inplace=True modifies order_df directly instead of returning a
# new frame. Every customer name in this data is unique, so the displayed
# frame still has all 10 rows.

order_df.drop_duplicates(subset=["Customer_Name"], inplace=True)
order_df

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001,John Doe,Laptop,1,1000,2024-01-01,New York,Shipped
1,1002,Anna Smith,Smartphone,2,500,2024-01-02,Los Angeles,Pending
2,1003,Peter Parker,Headphones,1,200,2024-01-03,Chicago,Shipped
3,1004,Bruce Wayne,Monitor,3,300,2024-01-04,Houston,Shipped
4,1005,Clark Kent,Keyboard,4,50,2024-01-05,Phoenix,Pending
5,1006,Diana Prince,Mouse,2,30,2024-01-06,Philadelphia,Delivered
6,1007,Tony Stark,Tablet,1,250,2024-01-07,San Antonio,Delivered
7,1008,Steve Rogers,Smartwatch,1,150,2024-01-08,San Diego,Shipped
8,1009,Natasha Romanoff,Camera,1,600,2024-01-09,Dallas,Pending
9,1010,Wanda Maximoff,Printer,2,100,2024-01-10,San Jose,Delivered


In [57]:
# where() -> keeps rows that satisfy the condition and replaces every value in
# the other rows with NaN (mainly used to inspect missing values against the
# full data set). The result is a new frame; order_df is not modified.
# Integer columns are shown as floats in the output because NaN was introduced.
mask = order_df["Price_per_Unit"] >= 300
order_df.where(mask)

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001.0,John Doe,Laptop,1.0,1000.0,2024-01-01,New York,Shipped
1,1002.0,Anna Smith,Smartphone,2.0,500.0,2024-01-02,Los Angeles,Pending
2,,,,,,,,
3,1004.0,Bruce Wayne,Monitor,3.0,300.0,2024-01-04,Houston,Shipped
4,,,,,,,,
5,,,,,,,,
6,,,,,,,,
7,,,,,,,,
8,1009.0,Natasha Romanoff,Camera,1.0,600.0,2024-01-09,Dallas,Pending
9,,,,,,,,


In [58]:
# isnull() -> checks for missing values: True where a cell is NaN, else False
order_df.where(mask).isnull()

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,True,True,True,True,True,True,True,True
3,False,False,False,False,False,False,False,False
4,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True
8,False,False,False,False,False,False,False,False
9,True,True,True,True,True,True,True,True


In [59]:
# Summing the boolean frame counts the NaN cells in each column.
order_df.where(mask).isnull().sum()

Order_ID           6
Customer_Name      6
Product            6
Quantity           6
Price_per_Unit     6
Order_Date         6
Shipping_City      6
Shipping_Status    6
dtype: int64

In [60]:
# dropna() -> removes the rows or columns of a DataFrame that contain missing values
# order_df itself holds no NaN here, so all 10 rows remain in the output.

result = order_df.dropna(how="any") # "any": drop when at least one value in the row/column is NaN; "all": drop only when every value is NaN
result

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001,John Doe,Laptop,1,1000,2024-01-01,New York,Shipped
1,1002,Anna Smith,Smartphone,2,500,2024-01-02,Los Angeles,Pending
2,1003,Peter Parker,Headphones,1,200,2024-01-03,Chicago,Shipped
3,1004,Bruce Wayne,Monitor,3,300,2024-01-04,Houston,Shipped
4,1005,Clark Kent,Keyboard,4,50,2024-01-05,Phoenix,Pending
5,1006,Diana Prince,Mouse,2,30,2024-01-06,Philadelphia,Delivered
6,1007,Tony Stark,Tablet,1,250,2024-01-07,San Antonio,Delivered
7,1008,Steve Rogers,Smartwatch,1,150,2024-01-08,San Diego,Shipped
8,1009,Natasha Romanoff,Camera,1,600,2024-01-09,Dallas,Pending
9,1010,Wanda Maximoff,Printer,2,100,2024-01-10,San Jose,Delivered


In [61]:
# mean() -> computes the average of the column

order_df["Quantity"].mean()

1.8

In [62]:
# Display order_df as it currently stands.
order_df

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001,John Doe,Laptop,1,1000,2024-01-01,New York,Shipped
1,1002,Anna Smith,Smartphone,2,500,2024-01-02,Los Angeles,Pending
2,1003,Peter Parker,Headphones,1,200,2024-01-03,Chicago,Shipped
3,1004,Bruce Wayne,Monitor,3,300,2024-01-04,Houston,Shipped
4,1005,Clark Kent,Keyboard,4,50,2024-01-05,Phoenix,Pending
5,1006,Diana Prince,Mouse,2,30,2024-01-06,Philadelphia,Delivered
6,1007,Tony Stark,Tablet,1,250,2024-01-07,San Antonio,Delivered
7,1008,Steve Rogers,Smartwatch,1,150,2024-01-08,San Diego,Shipped
8,1009,Natasha Romanoff,Camera,1,600,2024-01-09,Dallas,Pending
9,1010,Wanda Maximoff,Printer,2,100,2024-01-10,San Jose,Delivered


In [63]:
# fillna() with a dict replaces NaN only in the named columns: NaN in "Product"
# would become "fillnaProduct". The displayed output matches order_df, i.e.
# there was nothing to fill.
result = order_df.fillna({"Product" : "fillnaProduct"})
result

Unnamed: 0,Order_ID,Customer_Name,Product,Quantity,Price_per_Unit,Order_Date,Shipping_City,Shipping_Status
0,1001,John Doe,Laptop,1,1000,2024-01-01,New York,Shipped
1,1002,Anna Smith,Smartphone,2,500,2024-01-02,Los Angeles,Pending
2,1003,Peter Parker,Headphones,1,200,2024-01-03,Chicago,Shipped
3,1004,Bruce Wayne,Monitor,3,300,2024-01-04,Houston,Shipped
4,1005,Clark Kent,Keyboard,4,50,2024-01-05,Phoenix,Pending
5,1006,Diana Prince,Mouse,2,30,2024-01-06,Philadelphia,Delivered
6,1007,Tony Stark,Tablet,1,250,2024-01-07,San Antonio,Delivered
7,1008,Steve Rogers,Smartwatch,1,150,2024-01-08,San Diego,Shipped
8,1009,Natasha Romanoff,Camera,1,600,2024-01-09,Dallas,Pending
9,1010,Wanda Maximoff,Printer,2,100,2024-01-10,San Jose,Delivered


### 실습
MonthlyRate(월간요금)의 평균값 도출하기

In [64]:
# Load the HR data set and show the "MonthlyRate" column.
# The column contains missing values, which the following cells handle in
# different ways.
human_resources = pd.read_csv("../00_data/Human_Resources.csv")
human_resources["MonthlyRate"]

0       19479.0
1       24907.0
2        2396.0
3       23159.0
4       16632.0
         ...   
1465    12290.0
1466    21457.0
1467     5174.0
1468    13243.0
1469    10228.0
Name: MonthlyRate, Length: 1470, dtype: float64

In [65]:
# Drop the missing MonthlyRate values, then average what is left.
# Series.dropna() ignores its `how` argument (kept only for compatibility with
# DataFrame.dropna), so it is omitted here.
human_resources["MonthlyRate"].dropna().mean()

14319.813232253618

In [66]:
# mean() skips NaN by default, so this gives the same value as averaging
# after dropna().
human_resources["MonthlyRate"].mean()

14319.813232253618

In [67]:
# Keep only the rows whose MonthlyRate is strictly positive, then average.
# NaN compares as False, so rows with a missing rate are filtered out too.
filtered_df = human_resources[human_resources["MonthlyRate"].gt(0)]
filtered_df["MonthlyRate"].mean()

14319.813232253618

In [68]:
# Replace missing MonthlyRate values with the column median, then average.
human_resources["MonthlyRate"].fillna(human_resources["MonthlyRate"].median()).mean()

14319.350340136054

In [69]:
# Replace missing MonthlyRate values with 0, then average; the zeros count
# toward the mean, so the result is lower than when NaN is skipped.
human_resources["MonthlyRate"].fillna(0).mean()

14134.727210884354

### 데이터 유형 변경

In [70]:
# Reload the HR data and inspect the dtype of "HourlyRate" (int64 in the output).
human_resources = pd.read_csv("../00_data/Human_Resources.csv")
human_resources["HourlyRate"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1470 entries, 0 to 1469
Series name: HourlyRate
Non-Null Count  Dtype
--------------  -----
1470 non-null   int64
dtypes: int64(1)
memory usage: 11.6 KB


In [71]:
# astype("float64") returns the column converted to float64; assigning it back
# replaces the original int64 column. info() confirms the new dtype.
human_resources["HourlyRate"] = human_resources["HourlyRate"].astype("float64")
human_resources["HourlyRate"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1470 entries, 0 to 1469
Series name: HourlyRate
Non-Null Count  Dtype  
--------------  -----  
1470 non-null   float64
dtypes: float64(1)
memory usage: 11.6 KB


In [72]:
# Set row 2's PerformanceRating to 6.0 before converting, then turn both
# rating columns into the "category" dtype.
# NOTE(review): presumably 6.0 is written first so that it becomes one of the
# categories inferred by astype("category") -- confirm intent.
human_resources.loc[2, "PerformanceRating"] = 6.0
human_resources["PerformanceRating"] = human_resources["PerformanceRating"].astype("category")
human_resources["RelationshipSatisfaction"] = human_resources["RelationshipSatisfaction"].astype("category")

In [73]:
# Confirm that both columns now have the category dtype.
human_resources[["PerformanceRating", "RelationshipSatisfaction"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   PerformanceRating         1469 non-null   category
 1   RelationshipSatisfaction  1470 non-null   category
dtypes: category(2)
memory usage: 3.3 KB


In [74]:
# Raises an error: 8.0 is not among the column's existing categories, and a
# Categorical only accepts values from its category set (the new category
# would have to be added first).
human_resources.loc[2, "PerformanceRating"] = 8.0

TypeError: Cannot setitem on a Categorical with a new category (8.0), set the categories first