In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 25
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 82
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [2]:
import numpy as np
import pandas as pd

## 1. 결측치 표현 및 확인

In [3]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
print(string_data)
print(string_data.isna())
float_data = pd.Series([1, 2, None], dtype='float64')
float_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object
0    False
1     True
2     True
3    False
dtype: bool


0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## 2. 결측치 골라내기

In [11]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                    [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


### how

In [18]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


### axis

In [20]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### thresh

In [24]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.476985,,
1,-0.577087,,
2,0.523772,,1.34381
3,-0.713544,,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


In [25]:
print(df.dropna())      # 결측치 하나라도 갖고 있는 행 drop
df.dropna(thresh=2)     # 결측치가 2개 이상인 행 drop

          0         1         2
4 -1.860761 -0.860757  0.560145
5 -1.265934  0.119827 -1.063512
6  0.332883 -2.359419 -0.199543


Unnamed: 0,0,1,2
2,0.523772,,1.34381
3,-0.713544,,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


## 3. 결측치 채우기

In [26]:
df

Unnamed: 0,0,1,2
0,0.476985,,
1,-0.577087,,
2,0.523772,,1.34381
3,-0.713544,,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


In [27]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.476985,0.0,0.0
1,-0.577087,0.0,0.0
2,0.523772,0.0,1.34381
3,-0.713544,0.0,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


### 딕셔너리를 통한 행 별 다른 결측치 채우기

In [29]:
df.fillna({1: 0.5, 2: 0})   # 1번 column의 결측치는 0.5, 2번 column의 결측치는 0

Unnamed: 0,0,1,2
0,0.476985,0.5,0.0
1,-0.577087,0.5,0.0
2,0.523772,0.5,1.34381
3,-0.713544,0.5,-2.370232
4,-1.860761,-0.860757,0.560145
5,-1.265934,0.119827,-1.063512
6,0.332883,-2.359419,-0.199543


### method

In [30]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
print(df)
df.fillna(method="ffill")   # 이전의 유효한 값으로 채움

          0         1         2
0 -1.541996 -0.970736 -1.307030
1  0.286350  0.377984 -0.753887
2  0.331286       NaN  0.069877
3  0.246674       NaN  1.004812
4  1.327195       NaN       NaN
5  0.022185       NaN       NaN


  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,-1.541996,-0.970736,-1.30703
1,0.28635,0.377984,-0.753887
2,0.331286,0.377984,0.069877
3,0.246674,0.377984,1.004812
4,1.327195,0.377984,1.004812
5,0.022185,0.377984,1.004812


In [31]:
df.fillna(method="bfill")   # 이후의 유효한 값으로 채움

  df.fillna(method="bfill")


Unnamed: 0,0,1,2
0,-1.541996,-0.970736,-1.30703
1,0.28635,0.377984,-0.753887
2,0.331286,,0.069877
3,0.246674,,1.004812
4,1.327195,,
5,0.022185,,


In [32]:
df.fillna(method="ffill", limit=2)  # 연속된 결측치 2개만 method 옵션 따라 채움

  df.fillna(method="ffill", limit=2)


Unnamed: 0,0,1,2
0,-1.541996,-0.970736,-1.30703
1,0.28635,0.377984,-0.753887
2,0.331286,0.377984,0.069877
3,0.246674,0.377984,1.004812
4,1.327195,,1.004812
5,0.022185,,1.004812


In [33]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [34]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 4. 중복 제거

In [35]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                    "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [36]:
data.duplicated()   # 각 행이 중복인지 아닌지를 반환

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [37]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [38]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [41]:
data.drop_duplicates(subset=["k1"]) # k1이 중복이면 drop

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [39]:
data.drop_duplicates(["k1", "k2"], keep="last") # k1, k2가 중복이면 drop
# keep='last': 중복인 행 중 마지막 행을 남겨놓음(원래는 먼저 나온 것을 남겨 놓음)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 5. 함수나 맵핑을 이용하여 데이터 변형

In [42]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                            "pastrami", "corned beef", "bacon",
                            "pastrami", "honey ham", "nova lox"],
                    "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [43]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [45]:
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [46]:
def get_animal(x):
    return meat_to_animal[x]
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [47]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [48]:
df

Unnamed: 0,0,1,2
0,0.86258,-0.010032,0.050009
1,0.670216,0.852965,-0.955869
2,-0.023493,-2.304234,-0.652469
3,-1.218302,-1.33261,1.074623
4,0.723642,0.690002,1.001543
5,-0.503087,-0.622274,-0.921169
6,-0.726213,0.222896,0.051316


In [53]:
# DataFrame: 행/열 연산시 apply, 연소별 연산시 applymap
df.applymap(lambda x:x**2)  # 원소별로 변형

  df.applymap(lambda x:x**2)  # 원소별로 변형


Unnamed: 0,0,1,2
0,0.744044,0.000101,0.002501
1,0.449189,0.727549,0.913685
2,0.000552,5.309494,0.425716
3,1.48426,1.775849,1.154814
4,0.523657,0.476103,1.003089
5,0.253097,0.387225,0.848552
6,0.527386,0.049682,0.002633


## 6. 값 치환하기

In [54]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [55]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [56]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [57]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [58]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 7. 축 색인 이름 바꾸기

### map

In [69]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])

In [70]:
def transform(x):
    return x[:4].upper()

data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [71]:
data.index.map(transform)   # 객체의 축 색인 이름 변경

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [72]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### rename

In [73]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [76]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [77]:
data.rename(index={"OHIO": "INDIANA"},
            columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


## 8. 이산화

In [78]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [80]:
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [82]:
pd.cut(ages, bins, right=False) # False: 대괄호가 왼쪽

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [83]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [86]:
data = np.random.uniform(size=20)
data

array([0.4154, 0.2644, 0.0974, 0.4858, 0.4647, 0.0298, 0.6943, 0.7169,
       0.7298, 0.4144, 0.0151, 0.909 , 0.7894, 0.1652, 0.3128, 0.6109,
       0.3645, 0.156 , 0.1773, 0.8679])

In [88]:
pd.cut(data, 4, precision=2)    # 4개의 균등한 그룹

[(0.24, 0.46], (0.24, 0.46], (0.014, 0.24], (0.46, 0.69], (0.46, 0.69], ..., (0.46, 0.69], (0.24, 0.46], (0.014, 0.24], (0.014, 0.24], (0.69, 0.91]]
Length: 20
Categories (4, interval[float64, right]): [(0.014, 0.24] < (0.24, 0.46] < (0.46, 0.69] < (0.69, 0.91]]

In [90]:
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data,4, precision=2)   # 소수점 아래 두 자리까지 계산
print(quartiles)
pd.value_counts(quartiles)

[(-0.011, 0.68], (-0.011, 0.68], (-3.76, -0.63], (-0.63, -0.011], (-0.011, 0.68], ..., (-0.011, 0.68], (-0.63, -0.011], (-3.76, -0.63], (-0.011, 0.68], (-3.76, -0.63]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.76, -0.63] < (-0.63, -0.011] < (-0.011, 0.68] < (0.68, 3.26]]


  pd.value_counts(quartiles)


(-3.76, -0.63]     250
(-0.63, -0.011]    250
(-0.011, 0.68]     250
(0.68, 3.26]       250
Name: count, dtype: int64

In [91]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.746, -1.335]     100
(-1.335, -0.0106]    400
(-0.0106, 1.303]     400
(1.303, 3.26]        100
Name: count, dtype: int64

## 9. 이상치를 찾고 제외

In [92]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.005798,0.044207,-0.039151,-0.025269
std,1.027689,0.985665,0.966266,0.985333
min,-3.64586,-3.333767,-2.901831,-3.108915
25%,-0.688975,-0.619875,-0.711668,-0.731432
50%,0.033107,0.02804,-0.071723,-0.054752
75%,0.721104,0.695298,0.66426,0.634162
max,2.763474,3.525865,2.611678,3.366626


In [76]:
col = data[2]
col[col.abs() > 3]

709   -3.194414
739    3.023720
Name: 2, dtype: float64

In [77]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
17,-0.207434,3.525865,0.28307,0.544635
38,-3.64586,0.255475,-0.549574,-1.907459
260,0.336788,-3.333767,-1.240685,-0.650855
342,-3.018842,-0.298748,0.406954,0.183282
366,0.781753,-0.555434,-0.048478,-3.108915
497,-3.183867,1.050471,-1.042736,1.680374
544,-3.140963,-1.509976,-0.389818,-0.273253
709,1.090038,-0.848098,-3.194414,0.077839
739,0.003349,-0.011807,3.02372,-1.105312
768,0.452649,-3.481593,0.789944,1.737746


In [78]:
data[data.abs() > 3]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [79]:
np.sign(data) * 3

Unnamed: 0,0,1,2,3
0,-3.0,3.0,3.0,3.0
1,3.0,-3.0,3.0,3.0
2,3.0,-3.0,3.0,3.0
3,3.0,-3.0,3.0,3.0
4,3.0,3.0,3.0,3.0
...,...,...,...,...
995,-3.0,3.0,-3.0,3.0
996,3.0,3.0,-3.0,3.0
997,-3.0,3.0,3.0,3.0
998,3.0,-3.0,-3.0,-3.0


In [80]:
data[data.abs() > 3]=np.sign(data) * 3

In [81]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009074,-0.012517,-0.036146,-0.018395
std,1.025656,0.995959,0.953506,0.980285
min,-3.0,-3.0,-3.0,-3.0
25%,-0.719824,-0.69402,-0.701202,-0.697479
50%,0.031774,0.00228,-0.045684,0.007335
75%,0.681335,0.6712,0.625358,0.613225
max,3.0,3.0,3.0,2.859053


## 10. 더미 변수 계산하기

In [85]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [86]:
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [87]:
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)

In [88]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [92]:
df_with_dummy = pd.concat([df[["data1"]],dummies],axis=1)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0


In [93]:
np.random.seed(12345) # to make the example repeatable
values = np.random.uniform(size=10)
values

array([0.9296, 0.3164, 0.1839, 0.2046, 0.5677, 0.5955, 0.9645, 0.6532,
       0.7489, 0.6536])

In [95]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.cut(values, bins,labels=['a','b','c','d','e'])

['e', 'b', 'a', 'b', 'c', 'c', 'e', 'd', 'd', 'd']
Categories (5, object): ['a' < 'b' < 'c' < 'd' < 'e']

In [96]:
pd.get_dummies(pd.cut(values, bins,labels=['a','b','c','d','e']))

Unnamed: 0,a,b,c,d,e
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0
