# Pandas
- 학습목표
    - 구조화된 데이터의 처리를 지원하는 pandas 라이브러리
    - Pandas의 여러 기능과 사용하는 방법 학습
    - Python계의 Excel
    
    
- 핵심키워드
    - pandas
    - Series
    - DataFrame
    - Selection & Drop
    - Dataframe operation
    - Lambda
    - map
    - apply
    - Built-in functions
    
    
- Pandas의 특징
    - 고성능 Array계산 라이브러리인 Numpy와 통합
    - 강력한 '스프레드시트' 처리 기능 제공
    - 인덱싱, 연산용 함수, 전처리 함수 등을 제공함

In [37]:
import pandas as pd
import numpy as np

In [19]:
# URL of the housing data file hosted on the UCI machine-learning repository.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'

# Load the file into a DataFrame. Fields are separated by runs of whitespace,
# so the separator is a regex; it is written as a raw string because '\s' is
# not a valid escape sequence in a normal string literal. The file has no
# column-name row, hence header=None.
df_data = pd.read_csv(data_url, sep=r'\s+', header=None)

In [20]:
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [21]:
df_data.shape

(506, 14)

In [22]:
df_data.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV' 
]

In [23]:
df_data.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7


In [32]:
type(df_data.values) # pandas에 value값들은 ndarray로 이루어져 있다.

numpy.ndarray

## Pandas의 구성
- Series와 DataFrame 으로 구성되어 있다.

---
**Series**

In [33]:
# Series: DataFrame중 하나의 Column에 해당하는 데이터의 모음 object
# DataFrame: Data Table 전체를 포함하는 object

In [34]:
# Series
# Column Vector를 표현하는 object

list_data = [1,2,3,4,5]
example_obj = pd.Series(data=list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [35]:
list_data = [1,2,3,4,5]
list_name = ['a', 'b', 'c', 'd', 'e']
example_obj = pd.Series(data=list_data, index=list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [38]:
dict_data = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
example_obj = pd.Series(dict_data, dtype=np.float32, name='example_data')
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [39]:
# data index 접근하기
example_obj['a']

1.0

In [40]:
# data index에 값 할당하기
example_obj['a'] = 3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

---
**DataFrame**
- Series를 모아서 만든 Data Table = 기본 2차원

In [41]:
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miler', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']
}

df = pd.DataFrame(raw_data)
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miler,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [43]:
# 갖고오고싶은 columns만 선택가능
pd.DataFrame(raw_data, columns = ['age', 'city']) 

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [46]:
# 새로운 column 추가
pd.DataFrame(raw_data, columns=['first_name', 'last_name', 'age', 'city', 'debt'])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miler,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


## Pandas의 기능

### 특정 column 선택

In [51]:
# 특정 column 선택 - Series 추출
df.first_name
df['first_name']

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

### loc, iloc

In [54]:
# loc - label-based selection: the row whose index label is 0, column 'age'
df.loc[0, 'age']

42

In [58]:
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [63]:
print( s.loc[:2] )
print()
print( s.iloc[:2] )

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
dtype: float64

49   NaN
48   NaN
dtype: float64


In [57]:
# iloc - selection by integer position (not by index label)

print( df['age'].iloc[1:] ) # iloc can be used on a Series as well
df.iloc[0, 2]

1    52
2    36
3    24
4    73
Name: age, dtype: int64


42

### column에 새로운 데이터 할당

In [67]:
# 많이 쓰인다

df['debt'] = df.age > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miler,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


### Transpose

In [68]:
df.T

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miler,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


### 값 출력

In [69]:
df.values

array([['Jason', 'Miler', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

### csv변환

In [70]:
df.to_csv()

',first_name,last_name,age,city,debt\n0,Jason,Miler,42,San Francisco,True\n1,Molly,Jacobson,52,Baltimore,True\n2,Tina,Ali,36,Miami,False\n3,Jake,Milner,24,Douglas,False\n4,Amy,Cooze,73,Boston,True\n'

### column 삭제
- drop() 있지만, del 쓰는게 가장 쉬움

In [71]:
# df.drop(columns='debt') 
del df['debt']

In [75]:
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miler,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


# Selection & Drop

## Selection

In [76]:
df['age'].head(3) # 한 개의 column 선택 시

0    42
1    52
2    36
Name: age, dtype: int64

In [78]:
df[['age', 'city']].head(3) # 1개 이상의 columns 선택 시

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami


## Selection with index number

In [79]:
df[:3] # column 이름 없이 사용하는 index number는 row 기준 표시

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miler,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami


In [80]:
df['city'][:3]

0    San Francisco
1        Baltimore
2            Miami
Name: city, dtype: object

## Series selection

In [81]:
city_series = df['city']
city_series

0    San Francisco
1        Baltimore
2            Miami
3          Douglas
4           Boston
Name: city, dtype: object

In [84]:
city_series[[0, 1, 4]] # fancy index 접근

0    San Francisco
1        Baltimore
4           Boston
Name: city, dtype: object

In [86]:
# Filter with a boolean mask: keep only the entries equal to 'Boston'.
# If the Series held int or float values, a comparison such as `> 2000`
# could be used as the mask in the same way.

city_series[city_series == 'Boston']

4    Boston
Name: city, dtype: object

## Index 변경

In [88]:
df.index = df['city']
del df['city']
df.head()

Unnamed: 0_level_0,first_name,last_name,age
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
San Francisco,Jason,Miler,42
Baltimore,Molly,Jacobson,52
Miami,Tina,Ali,36
Douglas,Jake,Milner,24
Boston,Amy,Cooze,73


## Basic, loc, iloc 선택방법

In [90]:
# basic
df[['first_name', 'last_name']][:2]

Unnamed: 0_level_0,first_name,last_name
city,Unnamed: 1_level_1,Unnamed: 2_level_1
San Francisco,Jason,Miler
Baltimore,Molly,Jacobson


In [92]:
# loc
df.loc[['Miami', 'Boston'], ['last_name', 'age']]

Unnamed: 0_level_0,last_name,age
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Miami,Ali,36
Boston,Cooze,73


In [93]:
# iloc
df.iloc[:2, :2]

Unnamed: 0_level_0,first_name,last_name
city,Unnamed: 1_level_1,Unnamed: 2_level_1
San Francisco,Jason,Miler
Baltimore,Molly,Jacobson


## Data Drop

In [104]:
df.head()

Unnamed: 0_level_0,first_name,last_name,age
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
San Francisco,Jason,Miler,42
Baltimore,Molly,Jacobson,52
Miami,Tina,Ali,36
Douglas,Jake,Milner,24
Boston,Amy,Cooze,73


In [105]:
# drop()
# By default, passing an index label (or number) removes the matching row.
# With inplace=True the original df is modified; with inplace=False a modified copy is returned.

df.reset_index().drop(1)

Unnamed: 0,city,first_name,last_name,age
0,San Francisco,Jason,Miler,42
2,Miami,Tina,Ali,36
3,Douglas,Jake,Milner,24
4,Boston,Amy,Cooze,73


# Operation
- Series
- DataFrame
- Series + DataFrame

## Series
- index 기준으로 연산수행
- 겹치는 index가 없을경우 NaN 반환

In [110]:
s1 = pd.Series(range(1, 6), index=list('abcde'))
print(s1)

s2 = pd.Series(range(5, 11), index=list('bcedef'))
print(s2)

a    1
b    2
c    3
d    4
e    5
dtype: int64
b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64


In [111]:
s1.add(s2)
s1 + s2

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

## DataFrame
- DataFrame은 column과 index를 모두 고려
- add operation에 fill_value=0을 지정하면, 한쪽에만 존재하는 값(NaN)을 0으로 채워서 연산
- Operation types
    - add, sub, div, mul

In [113]:
df1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    columns=list('abc')
)
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [114]:
df2 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=list('abcd')
)
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [115]:
df1 + df2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [116]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


## Series + DataFrame

In [118]:
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=list('abcd')
)

s = pd.Series(
    np.arange(10, 14),
    index=list('abcd')
)

print(df)
print()
print(s)

    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15

a    10
b    11
c    12
d    13
dtype: int64


In [119]:
# The Series values [10, 11, 12, 13] are matched to columns a-d by label
# and added to every row of df.
df + s

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


# lambda, map, apply

## lambda
- 한 줄로 함수를 표현하는 익명 함수 기법
- Lisp 언어에서 시작된 기법으로 오늘날 현대언어에 많이 사용

`lambda argument: expression`

In [121]:
lambda x, y: x + y

# def f(x, y):
#     return x+y

f = lambda x, y: x+y
f(1, 4)

5

## map (***)
- 함수와 Sequence형 데이터를 인자로 받아
- 각 element마다 입력받은 함수를 적용한 결과를 iterator(map 객체)로 반환 (list()로 감싸면 list로 변환 가능)
- 일반적으로 함수를 lambda형태로 표현함

`map(function, sequence)`

In [122]:
ex = [1,2,3,4,5]
f = lambda x: x**2
list(map(f, ex))

[1, 4, 9, 16, 25]

---
- Pandas의 Series type의 데이터에도 map() 함수 사용가능
- function대신 dict, sequence형 자료등으로 대체 가능

In [123]:
s1 = pd.Series(np.arange(10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [124]:
s1.map(lambda x: x**2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [125]:
z = {1: 'A', 2: 'B', 3: 'C'}
s1.map(z)

0    NaN
1      A
2      B
3      C
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [127]:
# terminal 의 curl -O 옵션을 사용하여 파일을 다운로드 하기
!curl -O https://raw.githubusercontent.com/rstudio/Intro/master/data/wages.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 62250  100 62250    0     0   112k      0 --:--:-- --:--:-- --:--:--  111k


In [128]:
df = pd.read_csv('wages.csv')
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [131]:
df['sex'].unique()

array(['male', 'female'], dtype=object)

In [132]:
df['sex_code'] = df['sex'].map({'male': 0, 'female': 1})
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,male,white,16,49,0
1,96396.988643,66.23,female,white,16,62,1
2,48710.666947,63.77,female,white,16,33,1
3,80478.096153,63.22,female,other,16,95,1
4,82089.345498,63.08,female,white,17,43,1


In [None]:
df['sex'].replace(
    {'male': 0, 'female': 1}
)

In [136]:
# Overwrite the 'sex' column with numeric codes: 'male' -> 0, 'female' -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['sex']: that form operates on an
# intermediate object, triggers a pandas warning, and does not update df
# when Copy-on-Write is enabled.
df['sex'] = df['sex'].replace(
    ['male', 'female'],
    [0, 1],
)

In [137]:
df.head()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.299011,73.89,0,white,16,49,0
1,96396.988643,66.23,1,white,16,62,1
2,48710.666947,63.77,1,white,16,33,1
3,80478.096153,63.22,1,other,16,95,1
4,82089.345498,63.08,1,white,17,43,1


## apply
- map과 달리, series 전체(column)에 해당 함수를 적용
- 입력값이 series 데이터로 입력받아 handling 가능

In [140]:
df_info = df[['earn', 'height', 'age']]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.299011,73.89,49
1,96396.988643,66.23,62
2,48710.666947,63.77,33
3,80478.096153,63.22,95
4,82089.345498,63.08,43


In [141]:
f = lambda x: x.max() - x.min()
df_info.apply(f)

earn      318047.708444
height        19.870000
age           73.000000
dtype: float64

# Pandas Built-In Function

## describe
- Numeric Type 데이터의 요약 정보를 보여줌

In [147]:
df = pd.read_csv('wages.csv')
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [148]:
df.describe()

Unnamed: 0,earn,height,ed,age
count,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499
std,31257.070006,3.818108,2.438741,15.789715
min,-98.580489,57.34,3.0,22.0
25%,10538.790721,63.72,12.0,33.0
50%,26877.870178,66.05,13.0,42.0
75%,44506.215336,69.315,15.0,55.0
max,317949.127955,77.21,18.0,95.0


## Unique
- series data의 유일한 값들을 array(ndarray)로 반환함
- Categorical Data를 Label Encoding 하는 방법
- 즉, 범주데이터를 숫자로 바꿔준다. (one-hot-encoding 과 조금 비슷하다고 생각)

In [149]:
df['race'].unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [151]:
np.array(dict(enumerate(df['race'].unique()))) # dict mapping an integer code to each unique 'race' value, wrapped in an object array

array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)

In [172]:
# Build the code -> category table once instead of recomputing it for each
# list; the round-trip through np.array(...).tolist() only returned the same
# dict and is dropped.
label_map = dict(enumerate(df['race'].unique()))

# Category names, in the order unique() returned them.
values = list(label_map.values())
print(values)

# Integer codes assigned by enumerate (0, 1, 2, ...).
key = list(label_map.keys())
print(key)

key, values

['white', 'other', 'hispanic', 'black']
[0, 1, 2, 3]


([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

## info
- 행의 개수
- 열의 개수
- 각 열의 결측치 유무
- 각 열의 데이터 타입
- 데이터 타입
- 사용 메모리 확인가능

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   earn    1379 non-null   float64
 1   height  1379 non-null   float64
 2   sex     1379 non-null   object 
 3   race    1379 non-null   object 
 4   ed      1379 non-null   int64  
 5   age     1379 non-null   int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 64.8+ KB


In [174]:
df.shape

(1379, 6)