In [None]:
## pandas 기본
#### 1.1 Series 객체 생성
#### 1.2 DataFrame 객체 생성
#### 1.3 DataFrame 인덱싱
#### 1.4 null 데이터 처리
#### 1.5 수학함수, Sorting

In [3]:
import numpy as np
import pandas as pd

In [4]:
print(np.__version__)
print(pd.__version__)

1.14.5
0.23.3


### 1.1 Series 객체 생성`

In [8]:
# Series 객체 생성
ser1 = pd.Series([4, 7, -5, 3])
print(ser1, type(ser1))

0    4
1    7
2   -5
3    3
dtype: int64 <class 'pandas.core.series.Series'>


In [10]:
print(ser1.values)
print(ser1.index)
print(ser1.dtypes)

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
int64


In [11]:
# Series 객체를 생성 : index 값을 지정
ser2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(ser2, type(ser2))

d    4
b    7
a   -5
c    3
dtype: int64 <class 'pandas.core.series.Series'>


In [13]:
# Series 객체를 생성 : dict 타입
mydict = {
    '철수':5000,
    '영희':3000,
    '둘리':1000,
    }
print(mydict)

ser3 = pd.Series(mydict)
print(ser3)

{'철수': 5000, '영희': 3000, '둘리': 1000}
철수    5000
영희    3000
둘리    1000
dtype: int64


In [16]:
# index명과 컬럼명을 지정
ser3.name = '급여'
ser3.index.name = '이름'
print(ser3)

이름
철수    5000
영희    3000
둘리    1000
Name: 급여, dtype: int64


### 1.2 DataFrame 객체 생성

In [17]:
mydict = {
    'names': ['철수', '영희', '둘리', '철수', '로그'],
    'year': [2017, 2018, 2016, 2015, 2018],
    'points': [1.5, 2.0, 3.0, 4.5, 5.8],
}
print(mydict)

{'names': ['철수', '영희', '둘리', '철수', '로그'], 'year': [2017, 2018, 2016, 2015, 2018], 'points': [1.5, 2.0, 3.0, 4.5, 5.8]}


In [24]:
# dict 를 이용한 DataFrame 객체 생성
df1 = pd.DataFrame(mydict)
df1
# print(df1)

Unnamed: 0,names,year,points
0,철수,2017,1.5
1,영희,2018,2.0
2,둘리,2016,3.0
3,철수,2015,4.5
4,로그,2018,5.8


In [28]:
# DataFrame 생성시 컴럼 순서 지정
df2 = pd.DataFrame(mydict, columns=['names', 'points', 'year'])
print(type(df2))
df2

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,names,points,year
0,철수,1.5,2017
1,영희,2.0,2018
2,둘리,3.0,2016
3,철수,4.5,2015
4,로그,5.8,2018


In [32]:
# DataFrame 크기, 컬럼명, 인덱스
print(df2.shape)
print(df2.columns)
print(df2.index)

(5, 3)
Index(['names', 'points', 'year'], dtype='object')
RangeIndex(start=0, stop=5, step=1)
[['철수' 1.5 2017]
 ['영희' 2.0 2018]
 ['둘리' 3.0 2016]
 ['철수' 4.5 2015]
 ['로그' 5.8 2018]]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
names     5 non-null object
points    5 non-null float64
year      5 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 140.0+ bytes
None
         points        year
count  5.000000     5.00000
mean   3.360000  2016.80000
std    1.781292     1.30384
min    1.500000  2015.00000
25%    2.000000  2016.00000
50%    3.000000  2017.00000
75%    4.500000  2018.00000
max    5.800000  2018.00000


In [35]:
# DataFrame 의 값만 확인
print(df2.values)

[['철수' 1.5 2017]
 ['영희' 2.0 2018]
 ['둘리' 3.0 2016]
 ['철수' 4.5 2015]
 ['로그' 5.8 2018]]


In [34]:
# info() : 컬럼 dtype, not null row 건수 등 확인
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
names     5 non-null object
points    5 non-null float64
year      5 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 140.0+ bytes
None


In [33]:
# describe() : 숫자타입인 컬럼들의 집계함수
print(df2.describe())

         points        year
count  5.000000     5.00000
mean   3.360000  2016.80000
std    1.781292     1.30384
min    1.500000  2015.00000
25%    2.000000  2016.00000
50%    3.000000  2017.00000
75%    4.500000  2018.00000
max    5.800000  2018.00000


In [39]:
print(type(df2['names']))
df2.names

<class 'pandas.core.series.Series'>


0    철수
1    영희
2    둘리
3    철수
4    로그
Name: names, dtype: object

### 1.3 DataFrame 인덱싱

In [45]:
df3 = pd.DataFrame(mydict, columns=['names', 'year', 'points', 'penalty'], index=['one', 'two', 'three', 'four', 'five'])
print(df3)
df3

      names  year  points penalty
one      철수  2017     1.5     NaN
two      영희  2018     2.0     NaN
three    둘리  2016     3.0     NaN
four     철수  2015     4.5     NaN
five     로그  2018     5.8     NaN


Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,
two,영희,2018,2.0,
three,둘리,2016,3.0,
four,철수,2015,4.5,
five,로그,2018,5.8,


In [46]:
# column 1개 선택 : 한 개의 컬럼을 선택하면 Series 객체로 반환됨
print(df3['year'])
print(df3.year)
df3.year

one      2017
two      2018
three    2016
four     2015
five     2018
Name: year, dtype: int64
one      2017
two      2018
three    2016
four     2015
five     2018
Name: year, dtype: int64


one      2017
two      2018
three    2016
four     2015
five     2018
Name: year, dtype: int64

In [44]:
# column을 2개 선택
df3[['names', 'year']]

Unnamed: 0,names,year
one,철수,2017
two,영희,2018
three,둘리,2016
four,철수,2015
five,로그,2018


In [47]:
# 특정 컬럼의 값을 변경
df3['penalty'] = np.arange(0.1, 0.6, 0.1)
df3

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3
four,철수,2015,4.5,0.4
five,로그,2018,5.8,0.5


In [48]:
# 새로운 컬럼 추가
df3['zeros'] = np.arange(5)
df3

Unnamed: 0,names,year,points,penalty,zeros
one,철수,2017,1.5,0.1,0
two,영희,2018,2.0,0.2,1
three,둘리,2016,3.0,0.3,2
four,철수,2015,4.5,0.4,3
five,로그,2018,5.8,0.5,4


In [51]:
# 새로운 컬럼 추가 : Series 객체를 이용한 컬럼 추가
df3['dept'] = pd.Series([-1.2, -1.5, -2.0], index=['two', 'four', 'five'])
df3

Unnamed: 0,names,year,points,penalty,zeros,dept
one,철수,2017,1.5,0.1,0,
two,영희,2018,2.0,0.2,1,-1.2
three,둘리,2016,3.0,0.3,2,
four,철수,2015,4.5,0.4,3,-1.5
five,로그,2018,5.8,0.5,4,-2.0


In [52]:
# 새로운 컬럼 추가 : Boolean type 컬럼
df3['net_points'] = df3['points'] >= 3.0
df3

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
two,영희,2018,2.0,0.2,1,-1.2,False
three,둘리,2016,3.0,0.3,2,,True
four,철수,2015,4.5,0.4,3,-1.5,True
five,로그,2018,5.8,0.5,4,-2.0,True


### 1.4 복사

In [54]:
# 단순 복사
import pandas as pd
s = pd.Series([1, 2], index=['a', 'b'])
print(s)
s2 = s
print(id(s))
print(id(s2))




a    1
b    2
dtype: int64
265512560
265512560


In [61]:
# 얇은 복사 : 주소는 달라지나, 값은 공유함
shallow = s.copy(deep=False)
shallow[0] = 5
print(id(shallow))
print(shallow)
print(s)

265513040
a    5
b    2
dtype: int64
a    5
b    2
dtype: int64


In [62]:
# 깊은 복사
deep = s.copy()
deep[0] = 5
print(id(deep))
print(deep)
print(s)

265514416
a    5
b    2
dtype: int64
a    5
b    2
dtype: int64


In [63]:
# deep copy
df4 = df3.copy()
df4

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
two,영희,2018,2.0,0.2,1,-1.2,False
three,둘리,2016,3.0,0.3,2,,True
four,철수,2015,4.5,0.4,3,-1.5,True
five,로그,2018,5.8,0.5,4,-2.0,True


In [65]:
# 1개의 컬럼 삭제
del df3['net_points']
df3

Unnamed: 0,names,year,points,penalty,zeros,dept
one,철수,2017,1.5,0.1,0,
two,영희,2018,2.0,0.2,1,-1.2
three,둘리,2016,3.0,0.3,2,
four,철수,2015,4.5,0.4,3,-1.5
five,로그,2018,5.8,0.5,4,-2.0


In [66]:
df4

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
two,영희,2018,2.0,0.2,1,-1.2,False
three,둘리,2016,3.0,0.3,2,,True
four,철수,2015,4.5,0.4,3,-1.5,True
five,로그,2018,5.8,0.5,4,-2.0,True


In [70]:
# 여러개의 컬럼을 삭제 : drop()
# axis=1 동일한 옵션이 columns=['dept']
# inplace=False : default 이며 컬럼을 일시적으로 삭제하고, 결과를 리턴한다.
# inplace=True : 컬럼을 실제로 삭제하고, 결과를 리턴하지 않는다.

# df3.drop(['dept', 'zeros'], axis=1, inplace=True)
df3.drop(columns=['dept', 'zeros'], inplace=False)

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3
four,철수,2015,4.5,0.4
five,로그,2018,5.8,0.5


In [71]:
df3

Unnamed: 0,names,year,points,penalty,zeros,dept
one,철수,2017,1.5,0.1,0,
two,영희,2018,2.0,0.2,1,-1.2
three,둘리,2016,3.0,0.3,2,
four,철수,2015,4.5,0.4,3,-1.5
five,로그,2018,5.8,0.5,4,-2.0


In [72]:
df3.drop(columns=['dept', 'zeros'], inplace=True)

In [107]:
df3

Unnamed: 0,names,year,points,penalty
one,철수,2017.0,1.5,0.1
two,영희,2018.0,2.0,0.2
three,둘리,2016.0,3.0,0.3
four,철수,2015.0,4.5,0.4
five,로그,2018.0,5.8,0.5
six,파이썬,2017.0,8.5,0.6


In [77]:
# 행(row) 삭제 : inplace = False 이기 때문에 원본이 실제로 변경되지는 않는다.
df3.drop(['five','three'])

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
four,철수,2015,4.5,0.4


In [76]:
df3

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3
four,철수,2015,4.5,0.4
five,로그,2018,5.8,0.5


In [78]:
# 행(row)을 선택해서 출력하기
df3[0:3]

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3


In [80]:
# 범위 인덱싱 : 컬럼을 선택하는 인덱싱과 비슷하기 때문에 권장하지 않음. 컬럼을 선택한 것인지 행을 선택한 것인지 불분명하다.
df3['one':'four']

Unnamed: 0,names,year,points,penalty
one,철수,2017,1.5,0.1
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3
four,철수,2015,4.5,0.4


In [84]:
# loc (location)을 사용
print(type(df3.loc['one']))
df3.loc['one']

<class 'pandas.core.series.Series'>


names        철수
year       2017
points      1.5
penalty     0.1
Name: one, dtype: object

In [89]:
# 
df3.iloc[1:3]

Unnamed: 0,names,year,points,penalty
two,영희,2018,2.0,0.2
three,둘리,2016,3.0,0.3


In [93]:
# 행과 열의 데이터를 같이 가져오기 (슬라이싱 사용)
df3.loc['one' : 'four', 'names' : 'points']

Unnamed: 0,names,year,points
one,철수,2017,1.5
two,영희,2018,2.0
three,둘리,2016,3.0
four,철수,2015,4.5


In [95]:
# 행과 열의 데이터를 같이 가져오기 (여러개의 행과 여러개의 열)
df3.loc[['one', 'five', 'three'], ['names', 'points']]

Unnamed: 0,names,points
one,철수,1.5
five,로그,5.8
three,둘리,3.0


In [97]:
# 행과 열의 데이터를 같이 가져오기 (슬라이싱, 여러개의 컬럼)
df3.loc['two' : 'five', ['names', 'points']]

Unnamed: 0,names,points
two,영희,2.0
three,둘리,3.0
four,철수,4.5
five,로그,5.8


In [98]:
# 새로운 행(row) 추가
df3.loc['six', :] = ['파이썬', 2017, 8.5, 0.6]
df3

Unnamed: 0,names,year,points,penalty
one,철수,2017.0,1.5,0.1
two,영희,2018.0,2.0,0.2
three,둘리,2016.0,3.0,0.3
four,철수,2015.0,4.5,0.4
five,로그,2018.0,5.8,0.5
six,파이썬,2017.0,8.5,0.6


In [99]:
# iloc 사용해서 행과 열의 데이터를 같이 가져오기
df3.iloc[1:5, 1:3]

Unnamed: 0,year,points
two,2018.0,2.0
three,2016.0,3.0
four,2015.0,4.5
five,2018.0,5.8


In [100]:
df3.iloc[[0, 3, 5], [0, 2]]

Unnamed: 0,names,points
one,철수,1.5
four,철수,4.5
six,파이썬,8.5


In [105]:
# Boolean 인덱싱 : year 컬럼 값이 >= 2017
df3.loc[df3['year'] >= 2017, : ]

Unnamed: 0,names,year,points,penalty
one,철수,2017.0,1.5,0.1
two,영희,2018.0,2.0,0.2
five,로그,2018.0,5.8,0.5
six,파이썬,2017.0,8.5,0.6


In [106]:
# Boolean 인덱싱 : names 컬럼의 값이 파이썬이고, names 와 year 컬럼만 가져오기
df3.loc[df3['names'] == '파이썬', ['names', 'year']]

Unnamed: 0,names,year
six,파이썬,2017.0


In [108]:
df4

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
two,영희,2018,2.0,0.2,1,-1.2,False
three,둘리,2016,3.0,0.3,2,,True
four,철수,2015,4.5,0.4,3,-1.5,True
five,로그,2018,5.8,0.5,4,-2.0,True


In [109]:
# net_points 컬럼의 값이 True 인 것만 선택
df4.loc[df4['net_points'], :]

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
three,둘리,2016,3.0,0.3,2,,True
four,철수,2015,4.5,0.4,3,-1.5,True
five,로그,2018,5.8,0.5,4,-2.0,True


In [110]:
# net_points 컬럼의 값이 False 인 것만 선택
df4.loc[~df4['net_points'], :]

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
two,영희,2018,2.0,0.2,1,-1.2,False


### 1.4 null 데이터 처리

In [113]:
# NaN 을 포함하고 있는 행을 선택한다.
df4.loc[df4.isnull()['dept'], :]

Unnamed: 0,names,year,points,penalty,zeros,dept,net_points
one,철수,2017,1.5,0.1,0,,False
three,둘리,2016,3.0,0.3,2,,True


### 1.5 수학함수, Sorting