### Pandas

- 구조화된 데이터의 처리를 지원하는 파이썬 라이브러리입니다
- numpy를 기반으로 만들어졌으며, 대부분의 스프레드시트 기능을 제공합니다
- 인덱싱, 연산 등이 가능합니다

In [9]:
import pandas as pd

# UCI housing data file: whitespace-delimited values with no header row.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
# Raw string so that "\s" reaches the regex engine unchanged instead of being
# parsed as an (invalid) Python string escape, which emits a warning.
df_data = pd.read_csv(data_url, sep=r"\s+", header=None)
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [12]:
# Assign descriptive column names (the data was loaded with header=None).
# "B" must not carry a leading space, otherwise df_data["B"] raises KeyError.
df_data.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]
df_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [13]:
# .values exposes the DataFrame's contents as a NumPy array.
df_data.values

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 3.9690e+02, 4.9800e+00,
        2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 3.9690e+02, 9.1400e+00,
        2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 3.9283e+02, 4.0300e+00,
        3.4700e+01],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 5.6400e+00,
        2.3900e+01],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 3.9345e+02, 6.4800e+00,
        2.2000e+01],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 3.9690e+02, 7.8800e+00,
        1.1900e+01]])

실무에서는 대부분의 데이터가 DB에 존재하기 때문에, 쿼리문을 보내 필요한 데이터를 가져오는 경우가 많습니다.

#### Series

- DataFrame에서 하나의 column에 해당하는 데이터의 모음 Object

In [17]:
from pandas import Series

# A Series built from a plain list receives the default integer index 0..n-1.
ex = Series(data=[1, 2, 3, 4, 5], name="ex")
ex

0    1
1    2
2    3
3    4
4    5
Name: ex, dtype: int64

In [19]:
# With a dict, the keys become the index labels and the values become the data.
ex = Series(data={0: 1, 1: 10, 2: 11}, name="ex")
ex

0     1
1    10
2    11
Name: ex, dtype: int64

- index 값(label)을 이용해 해당 value에 접근할 수 있습니다

In [24]:
# Index labels can be chosen freely; here they are taken from the dict keys.
ex = Series(data=dict(a=2, b=3))
ex["a"]

2

#### DataFrame

- Data Table 전체를 포함하는 Object

In [45]:
# Build the table in one step: the dict keys become the columns, and the
# explicit index mixes a string label with integer labels.
raw_data = pd.DataFrame(
    data={
        "first_name": ["jam", "am", "jim"],
        "age": [10, 20, 30],
        "region": ["subway", "home", "beach"],
    },
    index=["index0", 1, 2],
)
raw_data

Unnamed: 0,first_name,age,region
index0,jam,10,subway
1,am,20,home
2,jim,30,beach


In [46]:
# Select a subset of columns by passing the existing frame together with `columns`.
pd.DataFrame(raw_data, columns=["age", "region"])

Unnamed: 0,age,region
index0,10,subway
1,20,home
2,30,beach


In [47]:
# Requesting a column that does not exist yet ("face") creates it filled with missing values.
pd.DataFrame(raw_data, columns=["first_name","age", "region", "face"])

Unnamed: 0,first_name,age,region,face
index0,jam,10,subway,
1,am,20,home,
2,jim,30,beach,


In [48]:
raw_data.index

Index(['index0', 1, 2], dtype='object')

In [49]:
raw_data.age # Extract a single column as a Series (attribute access).

index0    10
1         20
2         30
Name: age, dtype: int64

In [50]:
raw_data["age"] # Extract a single column as a Series (bracket access).

index0    10
1         20
2         30
Name: age, dtype: int64

#### loc, iloc을 이용한 데이터 접근

In [56]:
raw_data.iloc[0] # iloc selects rows by integer position.

first_name       jam
age               10
region        subway
Name: index0, dtype: object

In [57]:
# loc selects rows by index label. This frame's labels are "index0", 1, 2,
# so label 0 does not exist and the lookup raises KeyError.
raw_data.loc[0]

KeyError: 0

#### Pandas 조건

In [71]:
s = pd.Series([24,51,63], name="cm")
# Column assignment aligns on index labels: `s` is labelled 0, 1, 2 while
# raw_data is labelled "index0", 1, 2, so the "index0" row finds no match
# and is left as a missing value.
raw_data["cm"] = s > 40
raw_data

Unnamed: 0,first_name,age,region,cm
index0,jam,10,subway,
1,am,20,home,True
2,jim,30,beach,True


In [73]:
raw_data[:2] # A bare slice (no column name) selects rows by position.

Unnamed: 0,first_name,age,region,cm
index0,jam,10,subway,
1,am,20,home,True


In [75]:
raw_data["age"][:2] # Slice the selected column (a Series).

index0    10
1         20
Name: age, dtype: int64

In [82]:
raw_data["age"][raw_data["age"] < 20] # Boolean indexing: keep only entries where the mask is True.

index0    10
Name: age, dtype: int64

In [83]:
del raw_data["age"] # Remove a column from the frame in place.

In [85]:
raw_data

Unnamed: 0,first_name,region,cm
index0,jam,subway,
1,am,home,True
2,jim,beach,True


In [88]:
# Select several columns with a list of names, then take the first two rows.
df_data[["ZN", "NOX"]][:2]

Unnamed: 0,ZN,NOX
0,18.0,0.538
1,0.0,0.469


In [90]:
# Position-based selection: first two rows and first two columns.
df_data.iloc[:2,:2]

Unnamed: 0,CRIM,ZN
0,0.00632,18.0
1,0.02731,0.0


In [92]:
# Drop rows by index label; the result is returned as a new DataFrame.
df_data.drop([0,1,2,3])

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.90,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [96]:
# axis=1 targets columns instead of rows; the result is returned as a new DataFrame.
df_data.drop("CRIM", axis=1)

Unnamed: 0,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [97]:
# The original frame still holds every row and column: drop() returned new objects.
df_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


drop은 기본적으로 새로운 DataFrame을 반환하므로 원본 df_data는 바뀌지 않습니다. 원본 데이터를 직접 변경하고 싶으면 inplace=True 옵션을 지정하면 됩니다

#### Pandas Operation

In [102]:
# Series arithmetic aligns on index labels; a label present in only one
# operand yields NaN in the result.
s1 = pd.Series(data=range(1, 6), index=["a", "b", "c", "d", "e"])
s2 = pd.Series(data=range(5, 11), index=["c", "d", "e", "f", "g", "h"])
s1 + s2

a     NaN
b     NaN
c     8.0
d    10.0
e    12.0
f     NaN
g     NaN
h     NaN
dtype: float64

In [103]:
s1.add(s2, fill_value=0) # A label missing from one operand is treated as 0 instead of producing NaN.

a     1.0
b     2.0
c     8.0
d    10.0
e    12.0
f     8.0
g     9.0
h    10.0
dtype: float64

In [136]:
import numpy as np

# Adding a Series to a DataFrame matches the Series index against the
# DataFrame columns and broadcasts the values down every row.
df1 = pd.DataFrame(data=np.arange(10).reshape(5, 2), columns=["a", "b"])
df2 = pd.Series(data=[10, 20], index=["a", "b"])

df1 + df2

Unnamed: 0,a,b
0,10,21
1,12,23
2,14,25
3,16,27
4,18,29


#### map, lambda, replace

- pandas series 객체에 map 함수를 적용할 수 있습니다

In [137]:
# Integer Series 0..9, used as the input for the map example.
s1 = Series(data=np.arange(0, 10))
s1.head()

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [140]:
# map applies the function to each element of the Series.
s1.map(lambda x: x**2).head()

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [146]:
# Dict-based map: keys are the old values, dict values are the replacements.
# Any element whose value is not a key of the dict becomes NaN.
# NOTE(review): the output shown for this cell has NaN in every displayed row,
# which suggests it was run on an already-mapped column — confirm by
# re-running from a fresh load of the data.
df_data.RAD = df_data.RAD.map({1:"up", 2:"down"})
df_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,,273.0,21.0,393.45,6.48,22.0


In [148]:
# replace also takes {old: new}, but values not listed in the dict are kept as they are.
# NOTE(review): the output shown for this cell has NaN throughout RAD, so the
# column appears to have lost its 1/2 values before this ran — reload the data
# to see replace take effect.
df_data.RAD = df_data.RAD.replace({1:"up", 2:"down"})
df_data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,,273.0,21.0,393.45,6.48,22.0


#### apply

- map과 달리 series 전체(column 별로)에 해당 함수를 적용