In [99]:
import pandas as pd
import numpy as np

In [100]:
dates = pd.date_range('20200625', periods=6)
dates

DatetimeIndex(['2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28',
               '2020-06-29', '2020-06-30'],
              dtype='datetime64[ns]', freq='D')

In [101]:
df = pd.DataFrame(np.random.randn(6, 4), 
                  index=dates, 
                  columns=['A','B','C','D'])

In [102]:
df

Unnamed: 0,A,B,C,D
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [103]:
df2 = df.copy()
df2
# df는 기본적으로 행, 열의 위치값을 가지고 있음
# 기본적으로 가지고 있는 위치값(index)를 이용해서 조작하는 경우
# iloc[]를 사용
# 일부 그래프나. 분석시 특정 항목을 index항목으로 수동으로 설정해야 하는 경우가 있음
# 특정항목을 index항목으로 설정 가능한 경우는 unique / not null인 값을 
#$ 가지는 항목만 설정 가능
# user_id, bbs_id


Unnamed: 0,A,B,C,D
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [104]:
df2.reset_index(inplace=True)

In [105]:
df2

Unnamed: 0,index,A,B,C,D
0,2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
1,2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2,2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
3,2020-06-28,-0.867334,0.236723,0.331853,0.492468
4,2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
5,2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [106]:
df2.set_index('index', inplace=True)
df2

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [107]:
df2['E'] = df2['A'] + 1 #브로드캐스팅!

In [108]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [109]:
df2['E']

index
2020-06-25   -0.192645
2020-06-26    2.503037
2020-06-27    0.278410
2020-06-28    0.132666
2020-06-29    0.121558
2020-06-30    1.733077
Name: E, dtype: float64

In [110]:
df2['F'] = 0

In [111]:
df2['F']

index
2020-06-25    0
2020-06-26    0
2020-06-27    0
2020-06-28    0
2020-06-29    0
2020-06-30    0
Name: F, dtype: int64

In [112]:
df2['G'] = range(0, 6)
df2['G']

index
2020-06-25    0
2020-06-26    1
2020-06-27    2
2020-06-28    3
2020-06-29    4
2020-06-30    5
Name: G, dtype: int32

In [113]:
# index를 변경하고 싶은 경우, 기존의 인덱스를 살리고 싶으면
# 먼저 reset_index()하고 나서 해야함.
# reset을 하지 않는 경우 기존 index항목은 사라짐.
df2.set_index('G', inplace=True)

In [114]:
df2

Unnamed: 0_level_0,A,B,C,D,E,F
G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-1.192645,1.307525,-0.084312,-0.290315,-0.192645,0
1,1.503037,-1.205906,-1.169684,-1.37951,2.503037,0
2,-0.72159,-0.591323,-2.285181,-1.210277,0.27841,0
3,-0.867334,0.236723,0.331853,0.492468,0.132666,0
4,-0.878442,0.090991,-1.114409,-0.23365,0.121558,0
5,0.733077,0.090834,2.283152,-1.540344,1.733077,0


In [115]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [116]:
df3 = df.copy()
df3

Unnamed: 0,A,B,C,D
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [117]:
# E 항목을 파생변수로 만드세요 1~10범위 값
# F 항목을 파생변수로 만드세요 A열과 C의 합
# H 항목을 파생변수로 만드세여 1로 설정
# Z 항목을 파생변수로 만드세요 1~9까지의 랜덤한 값으로 설정

In [118]:
import random

In [119]:
# E 항목을 파생변수로 만드세요 1~10범위 값
data = range(1,11) # 1~10
data2 = []
for i in range(6):
    data2.append(random.choice(data))
df3['E'] = data2
df3['E']

2020-06-25     1
2020-06-26     1
2020-06-27    10
2020-06-28     4
2020-06-29     1
2020-06-30     2
Freq: D, Name: E, dtype: int64

In [120]:
# F 항목을 파생변수로 만드세요 A열과 C열의 합
df3['F'] = df['A'] + df['C']
df3

Unnamed: 0,A,B,C,D,E,F
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315,1,-1.276957
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951,1,0.333354
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277,10,-3.006771
2020-06-28,-0.867334,0.236723,0.331853,0.492468,4,-0.535481
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365,1,-1.992851
2020-06-30,0.733077,0.090834,2.283152,-1.540344,2,3.016228


In [121]:
# H 항목을 파생변수로 만드세여 1로 설정
df3['H'] = 1
df3

Unnamed: 0,A,B,C,D,E,F,H
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315,1,-1.276957,1
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951,1,0.333354,1
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277,10,-3.006771,1
2020-06-28,-0.867334,0.236723,0.331853,0.492468,4,-0.535481,1
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365,1,-1.992851,1
2020-06-30,0.733077,0.090834,2.283152,-1.540344,2,3.016228,1


In [122]:
df3['Z'] = random.randint(1, 10)
df3['Z']

2020-06-25    8
2020-06-26    8
2020-06-27    8
2020-06-28    8
2020-06-29    8
2020-06-30    8
Freq: D, Name: Z, dtype: int64

In [123]:
data2 = []
for _ in range(6):
    data2.append(random.randint(1, 10))
data2

[2, 7, 5, 7, 7, 7]

In [124]:
# Z 항목을 파생변수로 만드세요 1~9까지의 랜덤한 값으로 설정

# data2 = []
# for _ in range(6):
#     data2.append(random.randint(1, 10))
# data2
# df3['Z2'] = data2

df3['Z2'] = [random.randint(1, 10) for _ in range(6)]
df3['Z2']

2020-06-25     3
2020-06-26     2
2020-06-27    10
2020-06-28     4
2020-06-29     8
2020-06-30    10
Freq: D, Name: Z2, dtype: int64

In [125]:
# 리스트 내포 : 결과가 항상 list
# list comprehension
[random.randint(1, 10) for _ in range(6)]

[3, 3, 3, 2, 5, 1]

In [126]:
# [ '반복하면서 한번에 처리할 내용' for i in range(10)]
[i + 1 for i in range(10)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [127]:
# [ '반복하면서 한번에 처리할 내용' for i in range(10) if i % 2 == 0]
[i + 1 for i in range(10) if  i % 2 == 0]

[1, 3, 5, 7, 9]

In [128]:
name = ['홍길동', '김길동', '송길동']
for x in name :
    print(x)

홍길동
김길동
송길동


In [129]:
# name에 들어있는 각각의 모든 데이터에 '님'자를 붙여주세요
[i + '님' for i in name]

['홍길동님', '김길동님', '송길동님']

In [130]:
# name에 들어있는 각각의 모든 데이터에 '신입'자를 붙여주세요

In [131]:
['신입 ' + i  for i in name]

['신입 홍길동', '신입 김길동', '신입 송길동']

In [132]:
a_list = range(1, 100, 2)

In [133]:
# a_list에 몇 개가 들어 있는지 세어보기
count = 0
result = [count + 1 for x in a_list]
sum(result)

50

In [134]:
# a_list의 각각의 모든 데이터에 0.1을 곱하기. 변경된 a_list의 합과 평균

In [135]:
a_list = [x * 0.1 for x in a_list]

In [136]:
np.sum(a_list)

250.0

In [137]:
np.mean(a_list)

5.0

In [138]:
np.std(a_list)

2.8861739379323628

In [139]:
[x * 0.1 for x in a_list][:3]

[0.010000000000000002, 0.030000000000000006, 0.05]

In [140]:
df3.drop(columns=['H'], inplace=True) #열 삭제

In [141]:
df3

Unnamed: 0,A,B,C,D,E,F,Z,Z2
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315,1,-1.276957,8,3
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951,1,0.333354,8,2
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277,10,-3.006771,8,10
2020-06-28,-0.867334,0.236723,0.331853,0.492468,4,-0.535481,8,4
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365,1,-1.992851,8,8
2020-06-30,0.733077,0.090834,2.283152,-1.540344,2,3.016228,8,10


In [142]:
df3.drop(columns=['E', 'F', 'Z', 'Z2'], inplace=True) #열 삭제
df3

Unnamed: 0,A,B,C,D
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [145]:
df3

Unnamed: 0,A,B,C,D
2020-06-25,-1.192645,1.307525,-0.084312,-0.290315
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344


In [146]:
df3.drop(index=['2020-06-25'], inplace=True) #행삭제
df3

KeyError: "['2020-06-25'] not found in axis"

In [147]:
df3.loc['2020-06-25'] = [1, 1, 1, 1] #열의 순서대로 들어감.

In [148]:
df3

Unnamed: 0,A,B,C,D
2020-06-25,1.0,1.0,1.0,1.0
2020-06-26,1.503037,-1.205906,-1.169684,-1.37951
2020-06-27,-0.72159,-0.591323,-2.285181,-1.210277
2020-06-28,-0.867334,0.236723,0.331853,0.492468
2020-06-29,-0.878442,0.090991,-1.114409,-0.23365
2020-06-30,0.733077,0.090834,2.283152,-1.540344
