In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series is the basic one-dimensional building block of pandas data structures.
# Because one entry is np.nan, the whole Series is stored as float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build a date-based index: six consecutive days starting at 2013-01-01.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# DataFrame is the most commonly used pandas data structure; here both the row
# index (the `dates` built earlier) and the column labels are given explicitly.
#
# The values come from a seeded Generator instead of the unseeded
# np.random.randn, so "Restart Kernel -> Run All" yields the same 6x4 table
# every time.
# NOTE: outputs recorded from the earlier unseeded run will not match until the
# notebook is re-executed.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128
2013-01-04,0.399689,-0.239309,0.286701,-0.690304
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367
2013-01-06,-0.447031,0.411566,-0.299924,-0.45135


In [5]:
# Preview the first rows of the DataFrame.
# head() returns five rows by default; n=5 just spells that default out.
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128
2013-01-04,0.399689,-0.239309,0.286701,-0.690304
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367


In [6]:
# Show the DataFrame's basic information: the index range, each column's
# non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2013-01-01 to 2013-01-06
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [7]:
# Show basic summary statistics per column
# (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.10276,0.029183,-0.433345,0.539047
std,0.708718,0.738363,0.766997,1.315466
min,-0.840853,-0.66999,-1.777933,-0.690304
25%,-0.373352,-0.38436,-0.692326,-0.378832
50%,0.123687,-0.239896,-0.231051,0.028222
75%,0.543837,0.248847,0.091749,1.377524
max,1.065182,1.345635,0.286701,2.555367


In [8]:
# sort_values: sort the rows by the values of a column.
# Here the rows are ordered by column "B", largest value first (descending).
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-06,-0.447031,0.411566,-0.299924,-0.45135
2013-01-04,0.399689,-0.239309,0.286701,-0.690304
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128
2013-01-02,0.591886,-0.66999,0.176391,0.217724


In [10]:
# Read a single column by its label; the result is a Series named "A".
df["A"]

2013-01-01    1.065182
2013-01-02    0.591886
2013-01-03   -0.152315
2013-01-04    0.399689
2013-01-05   -0.840853
2013-01-06   -0.447031
Freq: D, Name: A, dtype: float64

In [11]:
# [n:m] selects rows n through m-1 when n and m are integer positions.
# (When slicing by index or column *labels* instead, the end label is included.)
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128


In [12]:
# Slice rows by index labels (date strings). Unlike the positional slice,
# the end label (2013-01-04) is included in the result.
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128
2013-01-04,0.399689,-0.239309,0.286701,-0.690304


In [14]:
# loc is the general-purpose, label-based selector in pandas.
# ":" keeps every row; the list keeps only columns "A" and "B".
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,1.065182,1.345635
2013-01-02,0.591886,-0.66999
2013-01-03,-0.152315,-0.432319
2013-01-04,0.399689,-0.239309
2013-01-05,-0.840853,-0.240483
2013-01-06,-0.447031,0.411566


In [15]:
# Select one row by its date label and only columns "A" and "B".
# A single-row selection comes back as a Series named after that row label.
df.loc["20130102", ["A", "B"]]

A    0.591886
B   -0.669990
Name: 2013-01-02 00:00:00, dtype: float64

In [16]:
# loc selects by column or index *labels*; iloc selects by integer *position* only.
# Position 3 is the fourth row (2013-01-04), returned as a Series.
df.iloc[3]

A    0.399689
B   -0.239309
C    0.286701
D   -0.690304
Name: 2013-01-04 00:00:00, dtype: float64

In [17]:
# Positional slices exclude the end: rows at positions 3-4, columns at positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.399689,-0.239309
2013-01-05,-0.840853,-0.240483


In [18]:
# Lists of positions pick non-contiguous rows (1, 2, 4) and columns (0, 2).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.591886,0.176391
2013-01-03,-0.152315,-0.162177
2013-01-05,-0.840853,-1.777933


In [19]:
# ":" keeps every row; 1:3 keeps the columns at positions 1 and 2 ("B" and "C").
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,1.345635,-0.823127
2013-01-02,-0.66999,0.176391
2013-01-03,-0.432319,-0.162177
2013-01-04,-0.239309,0.286701
2013-01-05,-0.240483,-1.777933
2013-01-06,0.411566,-0.299924


In [21]:
# Boolean indexing: keep only the rows whose value in column "A" is greater than 0.
a_is_positive = df["A"] > 0
df[a_is_positive]

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-04,0.399689,-0.239309,0.286701,-0.690304


In [22]:
# Applying a whole-frame boolean mask keeps the original shape:
# entries that are not greater than 0 are shown as missing values.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,,1.764124
2013-01-02,0.591886,,0.176391,0.217724
2013-01-03,,,,
2013-01-04,0.399689,,0.286701,
2013-01-05,,,,2.555367
2013-01-06,,0.411566,,


In [23]:
# Add a new column named "E" to the DataFrame, one string label per row.
e_labels = ["one", "one", "two", "three", "four", "three"]
df["E"] = e_labels
df

Unnamed: 0,A,B,C,D,E
2013-01-01,1.065182,1.345635,-0.823127,1.764124,one
2013-01-02,0.591886,-0.66999,0.176391,0.217724,one
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128,two
2013-01-04,0.399689,-0.239309,0.286701,-0.690304,three
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367,four
2013-01-06,-0.447031,0.411566,-0.299924,-0.45135,three


In [24]:
# Check, row by row, whether the value in column "E" is one of the listed
# labels; the result is a boolean Series.
df["E"].isin(["two", "four"])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [25]:
# Keep only the rows whose "E" value is one of the listed labels.
e_in_targets = df["E"].isin(["two", "four"])
df[e_in_targets]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128,two
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367,four


In [26]:
# Remove column "E" again.
# drop(..., errors="ignore") returns a new frame and does nothing when "E" is
# already gone, so this cell can be re-run safely. A bare `del df["E"]` raises
# KeyError the second time the cell is executed.
df = df.drop(columns="E", errors="ignore")
df

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,0.591886,-0.66999,0.176391,0.217724
2013-01-03,-0.152315,-0.432319,-0.162177,-0.16128
2013-01-04,0.399689,-0.239309,0.286701,-0.690304
2013-01-05,-0.840853,-0.240483,-1.777933,2.555367
2013-01-06,-0.447031,0.411566,-0.299924,-0.45135


In [27]:
# Cumulative sum down each column.
# DataFrame.cumsum() is the built-in equivalent of df.apply(np.cumsum) and
# avoids the generic apply machinery.
df.cumsum()

Unnamed: 0,A,B,C,D
2013-01-01,1.065182,1.345635,-0.823127,1.764124
2013-01-02,1.657069,0.675645,-0.646736,1.981848
2013-01-03,1.504754,0.243326,-0.808913,1.820568
2013-01-04,1.904443,0.004017,-0.522212,1.130263
2013-01-05,1.063591,-0.236467,-2.300145,3.685631
2013-01-06,0.61656,0.175099,-2.600069,3.234281
