# 데이터 프레임

<img src="etc/1_dataframe.png" />

### [ pandas ]
데이터프레임(DataFrame) 자료 구조를 사용하려면 pandas 모듈이 필요하다.

In [1]:
import pandas as pd

#### ○ 각 열 생성 (Series 생성)

In [2]:
# Build each column as its own Series; all three hold ten values.
age = pd.Series([26, 42, 27, 25, 20, 20, 21, 22, 23, 25])
score = pd.Series([3.8, 4.2, 2.6, 1.0, 3.0, 4.0, 4.2, 2.2, 4.1, 3.8])
salary = pd.Series(
    [2700, 4000, 3000, 2700, 3200, 1000, 3000, 7000, 3200, 3500]
)

#### ○ 데이터프레임 생성

In [3]:
# Assemble the three Series into one DataFrame; each dict key becomes a
# column name.
df = pd.DataFrame(
    {
        "age": age,
        "score": score,
        "salary": salary,
    }
)

#### ○ 데이터 확인하기

In [4]:
df   # 데이터 전체 확인

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [5]:
df.head()    # 데이터 앞 5행만 확인

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200


In [6]:
df.head(3)    # 데이터 앞 3행만 확인

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000


In [7]:
df.tail()   # 데이터 뒤 5행만 확인

Unnamed: 0,age,score,salary
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [8]:
df.tail(3)   # 데이터 뒤 3행만 확인

Unnamed: 0,age,score,salary
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [9]:
df

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [10]:
# 행 인덱스 확인
df.index

RangeIndex(start=0, stop=10, step=1)

In [11]:
# 열 이름 확인
df.columns

Index(['age', 'score', 'salary'], dtype='object')

In [12]:
# 값만 확인
df.values

array([[2.6e+01, 3.8e+00, 2.7e+03],
       [4.2e+01, 4.2e+00, 4.0e+03],
       [2.7e+01, 2.6e+00, 3.0e+03],
       [2.5e+01, 1.0e+00, 2.7e+03],
       [2.0e+01, 3.0e+00, 3.2e+03],
       [2.0e+01, 4.0e+00, 1.0e+03],
       [2.1e+01, 4.2e+00, 3.0e+03],
       [2.2e+01, 2.2e+00, 7.0e+03],
       [2.3e+01, 4.1e+00, 3.2e+03],
       [2.5e+01, 3.8e+00, 3.5e+03]])

In [13]:
# 요약 정보 확인
df.describe()

Unnamed: 0,age,score,salary
count,10.0,10.0,10.0
mean,25.1,3.29,3330.0
std,6.436873,1.071292,1506.320019
min,20.0,1.0,1000.0
25%,21.25,2.7,2775.0
50%,24.0,3.8,3100.0
75%,25.75,4.075,3425.0
max,42.0,4.2,7000.0


In [14]:
# 데이터를 전치한다.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,26.0,42.0,27.0,25.0,20.0,20.0,21.0,22.0,23.0,25.0
score,3.8,4.2,2.6,1.0,3.0,4.0,4.2,2.2,4.1,3.8
salary,2700.0,4000.0,3000.0,2700.0,3200.0,1000.0,3000.0,7000.0,3200.0,3500.0


In [15]:
df.sort_values(by='score')  # 점수 기준 오름 차순 정렬

Unnamed: 0,age,score,salary
3,25,1.0,2700
7,22,2.2,7000
2,27,2.6,3000
4,20,3.0,3200
0,26,3.8,2700
9,25,3.8,3500
5,20,4.0,1000
8,23,4.1,3200
1,42,4.2,4000
6,21,4.2,3000


In [16]:
df.sort_values(by='score', ascending=False ) # 점수 기준 내림 차순 정렬

Unnamed: 0,age,score,salary
1,42,4.2,4000
6,21,4.2,3000
8,23,4.1,3200
5,20,4.0,1000
0,26,3.8,2700
9,25,3.8,3500
4,20,3.0,3200
2,27,2.6,3000
7,22,2.2,7000
3,25,1.0,2700


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     10 non-null     int64  
 1   score   10 non-null     float64
 2   salary  10 non-null     int64  
dtypes: float64(1), int64(2)
memory usage: 372.0 bytes


#### ○ 데이터프레임 열 추가

In [18]:
# Assigning to df["class"] overwrites the column if it already exists and
# appends it otherwise.
df["class"] = pd.Series([1, 1, 2, 2, 2, 3, 3, 4, 4, 4])
df.head()

Unnamed: 0,age,score,salary,class
0,26,3.8,2700,1
1,42,4.2,4000,1
2,27,2.6,3000,2
3,25,1.0,2700,2
4,20,3.0,3200,2


#### ○ 데이터프레임 열 삭제

In [19]:
help(pd.DataFrame.drop)  # DataFrame 함수 도움말 보기

Help on function drop in module pandas.core.frame:

drop(self, labels: 'IndexLabel' = None, *, axis: 'Axis' = 0, index: 'IndexLabel' = None, columns: 'IndexLabel' = None, level: 'Level' = None, inplace: 'bool' = False, errors: 'IgnoreRaise' = 'raise') -> 'DataFrame | None'
    Drop specified labels from rows or columns.
    
    Remove rows or columns by specifying label names and corresponding
    axis, or by specifying directly index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level. See the :ref:`user guide <advanced.shown_levels>`
    for more information about the now unused levels.
    
    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop. A tuple will be used as a single
        label and not treated as a list-like.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'colum

In [20]:
df.drop (columns=['class'])
# class 컬럼이 삭제된 새로운 데이터프레임을 생성해서 반환. df 자체는 변하지 않음

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [21]:
df

Unnamed: 0,age,score,salary,class
0,26,3.8,2700,1
1,42,4.2,4000,1
2,27,2.6,3000,2
3,25,1.0,2700,2
4,20,3.0,3200,2
5,20,4.0,1000,3
6,21,4.2,3000,3
7,22,2.2,7000,4
8,23,4.1,3200,4
9,25,3.8,3500,4


In [22]:
df.drop(columns=['class'], inplace=True)
# df 자체는 변함.  df의 'class' 컬럼 삭제됨.

In [23]:
df

Unnamed: 0,age,score,salary
0,26,3.8,2700
1,42,4.2,4000
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


#### ○ 데이터프레임 행 삭제

In [26]:
df.drop(labels=[0,1] , axis=0 )
# 행 인덱스 0,1인 행을 삭제하여 새로운 데이터프레임을 생성해서 반환.
# df자체는 변하지 않음

Unnamed: 0,age,score,salary
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


In [28]:
df.drop(labels=[0,1], axis=0 , inplace=True)
# df 자체는 변함.  df의 행 인덱스 0,1인 행을 삭제

In [29]:
df

Unnamed: 0,age,score,salary
2,27,2.6,3000
3,25,1.0,2700
4,20,3.0,3200
5,20,4.0,1000
6,21,4.2,3000
7,22,2.2,7000
8,23,4.1,3200
9,25,3.8,3500


#### ○ 범주형 열 데이터

In [30]:
# Numeric columns, rebuilt as plain Series.
age = pd.Series([26, 42, 27, 25, 20, 20, 21, 22, 23, 25])
score = pd.Series([3.8, 4.2, 2.6, 1.0, 3.0, 4.0, 4.2, 2.2, 4.1, 3.8])
salary = pd.Series(
    [2700, 4000, 3000, 2700, 3200, 1000, 3000, 7000, 3200, 3500]
)

# Categorical variables: class number and gender.
stu_class = pd.Categorical([1, 1, 2, 2, 2, 3, 3, 4, 4, 4])
gender = pd.Categorical(["F", "M", "M", "M", "M", "F", "F", "F", "M", "M"])

In [31]:
# Combine the numeric Series and the categorical variables into one
# DataFrame, then display it.
df = pd.DataFrame(
    {
        "age": age,
        "score": score,
        "salary": salary,
        "class": stu_class,
        "gender": gender,
    }
)
df

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M
3,25,1.0,2700,2,M
4,20,3.0,3200,2,M
5,20,4.0,1000,3,F
6,21,4.2,3000,3,F
7,22,2.2,7000,4,F
8,23,4.1,3200,4,M
9,25,3.8,3500,4,M


#### ○ 일부 데이터를 추출

일부 열만 추출

In [32]:
# 단일 열 추출
# Series 타입으로 추출
df['score']  # 또는 df.score

0    3.8
1    4.2
2    2.6
3    1.0
4    3.0
5    4.0
6    4.2
7    2.2
8    4.1
9    3.8
Name: score, dtype: float64

In [33]:
# Select several columns at once. A list of labels is the documented
# list-like indexer for .loc; a tuple would be read as a single MultiIndex
# key if the columns were a MultiIndex.
df.loc[:, ['class', 'score']]

Unnamed: 0,class,score
0,1,3.8
1,1,4.2
2,2,2.6
3,2,1.0
4,2,3.0
5,3,4.0
6,3,4.2
7,4,2.2
8,4,4.1
9,4,3.8


일부 행만 추출

In [34]:
df[0:3]

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M


행과 열 제한 (by label)

In [36]:
# Label-based selection: rows labelled 0 through 3 (the end label is
# included) and the two listed columns; the result is a DataFrame.
# The columns are given as a list, the documented list-like form for .loc.
df.loc[0:3, ['class', 'score']]

Unnamed: 0,class,score
0,1,3.8
1,1,4.2
2,2,2.6
3,2,1.0


In [37]:
df.loc[3,'class']  # returns a single scalar value

2

행과 열 제한 (by position)

In [38]:
df.iloc[0:3]       # 0~2 행,  모든 열

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M


In [39]:
df.iloc[0:3 , : ]   # 0~2 행,  모든 열

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M


In [40]:
df.iloc[0:3 ,2:5]   # rows 0-2, columns 2-4 (by position)

Unnamed: 0,salary,class,gender
0,2700,1,F
1,4000,1,M
2,3000,2,M


In [41]:
df.iloc[: ,2:5]     # all rows, columns 2-4 (by position)

Unnamed: 0,salary,class,gender
0,2700,1,F
1,4000,1,M
2,3000,2,M
3,2700,2,M
4,3200,2,M
5,1000,3,F
6,3000,3,F
7,7000,4,F
8,3200,4,M
9,3500,4,M


In [42]:
df.iloc[[1,2,4],[0,2]] # 특정 위치.

Unnamed: 0,age,salary
1,42,4000
2,27,3000
4,20,3200


In [None]:
df.iloc[0,0]    # 하나의 값(스칼라)만 반환

조건으로 추출

In [43]:
df[df.score > 3.0]   # 또는 df[df["score"] > 3.0]   # score가 3.0 보다 큰 행만

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
5,20,4.0,1000,3,F
6,21,4.2,3000,3,F
8,23,4.1,3200,4,M
9,25,3.8,3500,4,M


In [44]:
df[df["class"] == 1]           # 1반 만 추출

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M


In [45]:
df[ (df["class"] == 1 )| (df["class"] == 2) ]   # 1반 또는 2반 추출

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M
3,25,1.0,2700,2,M
4,20,3.0,3200,2,M


In [46]:
df[df["class"].isin ([1,2] )]  # 1반 또는 2반 추출

Unnamed: 0,age,score,salary,class,gender
0,26,3.8,2700,1,F
1,42,4.2,4000,1,M
2,27,2.6,3000,2,M
3,25,1.0,2700,2,M
4,20,3.0,3200,2,M


In [47]:
df[ (df.score>=2.0)  & (df.score <= 3.0)]

Unnamed: 0,age,score,salary,class,gender
2,27,2.6,3000,2,M
4,20,3.0,3200,2,M
7,22,2.2,7000,4,F


In [48]:
# Rows whose score is between 2.0 and 3.0 (both ends included), restricted
# to the class and score columns. The columns are given as a list, the
# documented list-like form for .loc, rather than a tuple.
df.loc[(df.score >= 2.0) & (df.score <= 3.0), ["class", "score"]]

Unnamed: 0,class,score
2,2,2.6
4,2,3.0
7,4,2.2
