In [1]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 25
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 82
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [2]:
import numpy as np
import pandas as pd

## 1. **중요!** 결측치 확인 - isna, isnull, notna, notnull

In [5]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [6]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])
print(string_data)
print(string_data.isna())
float_data = pd.Series([1, 2, None], dtype='float64')
float_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object
0    False
1     True
2     True
3    False
dtype: bool


0    1.0
1    2.0
2    NaN
dtype: float64

In [17]:
# isna = isnull
float_data.isnull()

0    False
1    False
2     True
dtype: bool

## 2. **중요!**  결측치 골라내기-dropna(how,axis,thresh)

In [23]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [24]:
# notna = notnull
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                    [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [30]:
# default: 하나라도 결측치인 행 제외(how=‘any’, axis=‘index’)
# 전부 결측치인 행만 제외 how='all'
# 결측치가 n개 이상인 경우 제외 thresh=n
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


### how

In [31]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


### axis

In [20]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### thresh

In [32]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.204708,,
1,-0.55573,,
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [33]:
print(df.dropna())      # 결측치 하나라도 갖고 있는 행 drop
df.dropna(thresh=2)     # 결측치가 2개 이상인 행 drop

          0         1         2
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741


Unnamed: 0,0,1,2
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


## 3. **중요!** 결측치 채우기-fillna(method,limit)

In [37]:
df

Unnamed: 0,0,1,2
0,-0.204708,,
1,-0.55573,,
2,0.092908,,0.769023
3,1.246435,,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [38]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


### 딕셔너리를 통한 행 별 다른 결측치 채우기

In [39]:
df.fillna({1: 0.5, 2: 0})   # 1번 column의 결측치는 0.5, 2번 column의 결측치는 0

Unnamed: 0,0,1,2
0,-0.204708,0.5,0.0
1,-0.55573,0.5,0.0
2,0.092908,0.5,0.769023
3,1.246435,0.5,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


### method

In [41]:
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
print(df)
df.fillna(method="ffill")   # 이전의 유효한 값으로 채움

          0         1         2
0  0.332883 -2.359419 -0.199543
1 -1.541996 -0.970736 -1.307030
2  0.286350       NaN -0.753887
3  0.331286       NaN  0.069877
4  0.246674       NaN       NaN
5  1.327195       NaN       NaN


  df.fillna(method="ffill")   # 이전의 유효한 값으로 채움


Unnamed: 0,0,1,2
0,0.332883,-2.359419,-0.199543
1,-1.541996,-0.970736,-1.30703
2,0.28635,-0.970736,-0.753887
3,0.331286,-0.970736,0.069877
4,0.246674,-0.970736,0.069877
5,1.327195,-0.970736,0.069877


In [42]:
df.fillna(method="bfill")   # 이후의 유효한 값으로 채움

  df.fillna(method="bfill")   # 이후의 유효한 값으로 채움


Unnamed: 0,0,1,2
0,0.332883,-2.359419,-0.199543
1,-1.541996,-0.970736,-1.30703
2,0.28635,,-0.753887
3,0.331286,,0.069877
4,0.246674,,
5,1.327195,,


In [43]:
df.fillna(method="ffill", limit=2)  # 연속된 결측치 2개만 method 옵션 따라 채움

  df.fillna(method="ffill", limit=2)  # 연속된 결측치 2개만 method 옵션 따라 채움


Unnamed: 0,0,1,2
0,0.332883,-2.359419,-0.199543
1,-1.541996,-0.970736,-1.30703
2,0.28635,-0.970736,-0.753887
3,0.331286,-0.970736,0.069877
4,0.246674,,0.069877
5,1.327195,,0.069877


In [44]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [45]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 4. 중복 제거-duplicated

In [46]:
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                    "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [48]:
data.duplicated()   # 각 행이 중복인지 아닌지를 반환

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [50]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [51]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [52]:
data.drop_duplicates(subset=["k1"]) # k1이 중복이면 drop

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [53]:
data.drop_duplicates(["k1", "k2"], keep="last") # k1, k2가 중복이면 drop
# keep='last': 중복인 행 중 마지막 행을 남겨놓음(원래는 먼저 나온 것을 남겨 놓음)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 5. **중요!** 함수나 맵핑을 이용하여 데이터 변형-map,applymap

In [5]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                                "pastrami", "corned beef", "bacon",
                                "pastrami", "honey ham", "nova lox"],
                        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [6]:
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [9]:
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [10]:
def get_animal(x):
    return meat_to_animal[x]
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [11]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [12]:
df

Unnamed: 0,0,1,2
0,-0.204708,0.478943,-0.519439
1,-0.55573,1.965781,1.393406
2,0.092908,0.281746,0.769023
3,1.246435,1.007189,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [13]:
# DataFrame: 행/열 연산시 apply, 연소별 연산시 applymap
df.applymap(lambda x:x**2)  # 원소별로 변형

  df.applymap(lambda x:x**2)  # 원소별로 변형


Unnamed: 0,0,1,2
0,0.041905,0.229387,0.269817
1,0.308836,3.864293,1.94158
2,0.008632,0.079381,0.591396
3,1.5536,1.01443,1.680189
4,0.07562,0.052401,1.830384
5,0.785757,4.006552,0.138267
6,2.785645,0.192343,0.291321


## 6. 값 치환하기-replace

In [14]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [15]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [16]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [17]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [18]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 7. 축 색인 이름 바꾸기-map, rename // index.map, columns.map

### map

In [20]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [28]:
def transform(x):
    return x[:4].upper()

data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

In [32]:
data.index.map(transform)   # 객체의 축 색인 이름 변경

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [30]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### rename

rename, index.map, column.map 모두 새로운 객체 생성하여 반환

In [27]:
data.rename(index=str.upper, columns=str.title)

Unnamed: 0,One,Two,Three,Four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [25]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [26]:
data.rename(index={"OHIO": "INDIANA"},
            columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


## 8. **중요!** 이산화-cut, qcut

pd.cut( ages , bins ,labels=,right=)  
    • ages: 이산화하기 위한 Series나 리스트   
    • bins: 이산화 그룹 경계값 나타내는 Series나 리스트  
    • labels: 각 이산화 그룹의 이름을 나타내는 Series나 리스트  
    • right: 이산화 시 대괄호와 소괄호 위치 결정 (right=False, 대괄호가 왼쪽) 
    • bins를 활용하지 않는다면 최소값과 최댓값을 기준으로 균등한 길이의 그룹을 계산   

In [34]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [42]:
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [43]:
pd.cut(ages, bins, right=False) # False: 대괄호가 왼쪽

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [44]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
# labels: 각 이산화 그룹의 이름
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [45]:
data = np.random.uniform(size=20)
data

array([0.9466, 0.9542, 0.5927, 0.1709, 0.2609, 0.0317, 0.4384, 0.1239,
       0.6483, 0.0026, 0.3184, 0.3815, 0.3028, 0.3062, 0.2683, 0.9132,
       0.3742, 0.3724, 0.5681, 0.146 ])

In [47]:
pd.cut(data, 4, precision=2)    # bins x -> min,max 기준 4개의 균등한 길이 그룹

[(0.72, 0.95], (0.72, 0.95], (0.48, 0.72], (0.0017, 0.24], (0.24, 0.48], ..., (0.72, 0.95], (0.24, 0.48], (0.24, 0.48], (0.48, 0.72], (0.0017, 0.24]]
Length: 20
Categories (4, interval[float64, right]): [(0.0017, 0.24] < (0.24, 0.48] < (0.48, 0.72] < (0.72, 0.95]]

In [51]:
data = np.random.standard_normal(1000)
# qcut
quartiles = pd.qcut(data,4, precision=2)   # 소수점 아래 두 자리까지 계산
print(quartiles)
pd.value_counts(quartiles)

[(-3.44, -0.69], (-0.69, 0.022], (0.73, 2.65], (-3.44, -0.69], (0.022, 0.73], ..., (0.022, 0.73], (0.73, 2.65], (0.73, 2.65], (0.022, 0.73], (0.73, 2.65]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.44, -0.69] < (-0.69, 0.022] < (0.022, 0.73] < (0.73, 2.65]]


  pd.value_counts(quartiles)


(-3.44, -0.69]    250
(-0.69, 0.022]    250
(0.022, 0.73]     250
(0.73, 2.65]      250
Name: count, dtype: int64

In [52]:
# 0~10%, 10~50%, 50~90%, 90~100%로 구간을 나눔.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.429, -1.25]    100
(-1.25, 0.0216]    400
(0.0216, 1.294]    400
(1.294, 2.654]     100
Name: count, dtype: int64

## 9. 이상치를 찾고 제외-outlier  
이상치: 전체적인 데이터 분포에서 크게 어긋나는 값

In [81]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.005148,-0.046396,-0.028177,-0.014959
std,1.013278,0.990271,0.985223,0.985388
min,-3.025204,-3.508448,-3.087639,-3.066981
25%,-0.724147,-0.710577,-0.726247,-0.704963
50%,-0.027822,-0.045562,-0.030306,-0.036192
75%,0.730788,0.674228,0.66271,0.65413
max,3.004068,3.961734,3.039873,3.389719


In [82]:
col = data[2]
col[col.abs() > 3]

198    3.039873
578   -3.087639
Name: 2, dtype: float64

In [83]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
1,3.004068,-0.344172,-0.310593,-1.246154
73,0.151798,-2.436584,-0.095799,3.389719
157,-0.888403,-3.08956,0.898763,-0.837477
183,1.492226,-0.655538,-0.718596,-3.066981
198,0.040141,0.360827,3.039873,-0.736824
220,-0.148032,-3.508448,-1.50583,0.0065
381,0.733272,0.623477,0.357085,3.153123
578,0.100989,-0.694585,-3.087639,1.379808
655,-0.523147,3.961734,-0.819945,1.44286
920,-1.568923,-3.172614,-0.782276,-1.058917


In [84]:
data[data.abs() > 3]    # 절댓값 3 이상인 값만 원값 반환, 나머지는 결측치

Unnamed: 0,0,1,2,3
0,,,,
1,3.004068,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [85]:
np.sign(data) * 3

Unnamed: 0,0,1,2,3
0,-3.0,3.0,-3.0,-3.0
1,3.0,-3.0,-3.0,-3.0
2,-3.0,-3.0,3.0,-3.0
3,3.0,3.0,-3.0,-3.0
4,-3.0,-3.0,3.0,-3.0
...,...,...,...,...
995,3.0,-3.0,3.0,3.0
996,-3.0,-3.0,3.0,3.0
997,-3.0,3.0,3.0,3.0
998,-3.0,3.0,3.0,3.0


In [86]:
# 절댓값이 3초과인 값들 -> 원 부호에 따라 +3, -3 할당
data[data.abs() > 3]=np.sign(data) * 3

In [87]:
data.describe()     # max, min 모두 절댓값 3 이하

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.005127,-0.046587,-0.028129,-0.015435
std,1.013191,0.984374,0.984831,0.983429
min,-3.0,-3.0,-3.0,-3.0
25%,-0.724147,-0.710577,-0.726247,-0.704963
50%,-0.027822,-0.045562,-0.030306,-0.036192
75%,0.730788,0.674228,0.66271,0.65413
max,3.0,3.0,3.0,3.0


## 10. **중요!** 더미(표시자행렬) 변수 계산-pd.get_dummies, concat

In [89]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                    "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [91]:
# 'key' 컬럼의 값에 해당하는 건 1, 아닌 건 0
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [98]:
# prefix: 컬럼값 앞에 추가할 단어
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)

In [99]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [100]:
# df[data1]과 dummies를 axis=1 방향으로 이어붙이기
df_with_dummy = pd.concat([df[["data1"]],dummies],axis=1)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0


In [101]:
np.random.seed(12345) # to make the example repeatable
values = np.random.uniform(size=10)
values

array([0.9296, 0.3164, 0.1839, 0.2046, 0.5677, 0.5955, 0.9645, 0.6532,
       0.7489, 0.6536])

In [102]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.cut(values, bins,labels=['a','b','c','d','e'])

['e', 'b', 'a', 'b', 'c', 'c', 'e', 'd', 'd', 'd']
Categories (5, object): ['a' < 'b' < 'c' < 'd' < 'e']

In [103]:
pd.get_dummies(pd.cut(values, bins,labels=['a','b','c','d','e']))

Unnamed: 0,a,b,c,d,e
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False
