# 10.1 Pandas Data Structure

## 10.1.1 Series형 Pandas Data Structure (PDS)

In [24]:
import pandas as pd; import numpy as np

In [25]:
# Build a Series from a plain list; pandas supplies a default integer index.
ds1 = pd.Series(data=[4, 2, 1])
ds1

0    4
1    2
2    1
dtype: int64

In [26]:
# Underlying data as a NumPy array. to_numpy() is the accessor pandas
# recommends over the older .values attribute.
ds1.to_numpy()

array([4, 2, 1], dtype=int64)

In [27]:
# Row labels of the Series (a default RangeIndex here).
ds1.index

RangeIndex(start=0, stop=3, step=1)

In [28]:
# Positional slice: the first two elements.
ds1.iloc[0:2]

0    4
1    2
dtype: int64

In [29]:
# Positional slice with a step of 2: elements at positions 0 and 2.
ds1.iloc[0:3:2]

0    4
2    1
dtype: int64

In [30]:
# Select several elements at once by their index labels.
ds1.loc[[1, 2]]

1    2
2    1
dtype: int64

In [31]:
# With the default integer index, an integer key is looked up as a label, not a
# position; -1 is not one of the labels, which is why this raises the KeyError
# shown below. (ds1.iloc[-1] would give positional access to the last element.)
ds1[-1]

KeyError: -1

In [32]:
# Look up a single element by its index label.
ds1.loc[2]

1

In [33]:
# Overwrite the element stored under label 2, then show the Series.
ds1.loc[2] = -1
ds1

0    4
1    2
2   -1
dtype: int64

In [34]:
# Add a new element: assigning to a label that does not exist yet (4) appends it.
# The float value 3.0 upcasts the whole Series to float64.
ds1.loc[4] = 3.0
ds1

0    4.0
1    2.0
2   -1.0
4    3.0
dtype: float64

In [35]:
# Boolean-mask selection: keep only the elements greater than 2.
ds1.loc[ds1 > 2]

0    4.0
4    3.0
dtype: float64

In [36]:
# Element-wise multiplication by a scalar; the index is preserved.
ds1.mul(2)

0    8.0
1    4.0
2   -2.0
4    6.0
dtype: float64

In [37]:
# NumPy ufuncs apply element-wise and return a Series that keeps the same index.
np.exp(ds1)

0    54.598150
1     7.389056
2     0.367879
4    20.085537
dtype: float64

In [38]:
# A custom index array can be supplied instead of the default integer index.
ds2 = pd.Series([4, 2, -1, 3], index=list('bcad'))
ds2

b    4
c    2
a   -1
d    3
dtype: int64

In [39]:
# Year string -> pair of numbers; used below to build a Series from a dict.
dic1 = {
    '2015': [36, -15.5],
    '2016': [36.6, -15],
    '2017': [35.4, -14],
}

In [40]:
# Each dict key becomes an index label; the list values are stored whole,
# so the resulting dtype is object.
ds3 = pd.Series(data=dic1)
ds3

2015    [36, -15.5]
2016    [36.6, -15]
2017    [35.4, -14]
dtype: object

In [41]:
# Replace the index labels in place with shorter two-digit year strings.
ds3.index = ['15', '16', '17']
ds3

15    [36, -15.5]
16    [36.6, -15]
17    [35.4, -14]
dtype: object

## 10.1.2 DataFrame형 Pandas Data Structure (PDS)

In [123]:
# One row per person. NumPy coerces the mixed str/int rows to a single string
# dtype, so the ages end up stored as text.
npa = np.array([
    ['Tom', 'NY', 45, 'M'],
    ['Judy', 'CA', 36, 'F'],
    ['Fred', 'PA', 21, 'M'],
])

In [124]:
# Column labels for the person table, in column order.
labels = [
    'Name',
    'State',
    'Age',
    'Gender',
]

In [125]:
# Wrap the array in a DataFrame, naming the columns from `labels`.
df1 = pd.DataFrame(data=npa, columns=labels)
df1

Unnamed: 0,Name,State,Age,Gender
0,Tom,NY,45,M
1,Judy,CA,36,F
2,Fred,PA,21,M


In [45]:
# Convert the pandas DataFrame back into a NumPy ndarray.
npa1 = df1.to_numpy()
npa1

array([['Tom', 'NY', '45', 'M'],
       ['Judy', 'CA', '36', 'F'],
       ['Fred', 'PA', '21', 'M']], dtype=object)

In [46]:
# Column-oriented data: each key is a column label, each list that column's values.
dic2 = dict(
    Name=['Tom', 'Judy', 'Fred'],
    State=['NY', 'CA', 'PA'],
    Age=[45, 36, 21],
    Gender=['M', 'F', 'M'],
)

In [47]:
# Dict keys become the column labels; rows get the default integer index.
df2 = pd.DataFrame(data=dic2)
df2

Unnamed: 0,Name,State,Age,Gender
0,Tom,NY,45,M
1,Judy,CA,36,F
2,Fred,PA,21,M


In [48]:
# Same data, but with explicit row labels 1..3 instead of the default index.
df2 = pd.DataFrame(data=dic2, index=[1, 2, 3])
df2

Unnamed: 0,Name,State,Age,Gender
1,Tom,NY,45,M
2,Judy,CA,36,F
3,Fred,PA,21,M


In [49]:
# Nested dict: outer keys are column labels, inner keys (1, 2, 3) are row labels.
dic3 = dict(
    Name={1: 'Tom', 2: 'Judy', 3: 'Fred'},
    State={1: 'NY', 2: 'CA', 3: 'PA'},
    Age={1: 45, 2: 36, 3: 21},
    Gender={1: 'M', 2: 'F', 3: 'M'},
)

In [50]:
# Build the frame from the nested dict; the inner keys become the row labels.
df3 = pd.DataFrame(data=dic3)
df3

Unnamed: 0,Name,State,Age,Gender
1,Tom,NY,45,M
2,Judy,CA,36,F
3,Fred,PA,21,M


In [51]:
# Frame contents as a NumPy array. to_numpy() is the accessor pandas recommends
# over .values, and it matches the df1.to_numpy() call used earlier.
df3.to_numpy()

array([['Tom', 'NY', 45, 'M'],
       ['Judy', 'CA', 36, 'F'],
       ['Fred', 'PA', 21, 'M']], dtype=object)

In [52]:
# Column labels of df3 as a pandas Index.
df3.columns

Index(['Name', 'State', 'Age', 'Gender'], dtype='object')

In [53]:
# Column labels as a plain Python list.
df3.columns.tolist()

['Name', 'State', 'Age', 'Gender']

In [54]:
# Row labels as a plain Python list.
df3.index.tolist()

[1, 2, 3]

In [103]:
# List holding the row-axis index followed by the column-axis index.
df3.axes

[Int64Index([1, 2, 3], dtype='int64'),
 Index(['Name', 'State', 'Age', 'Gender'], dtype='object')]

### 10.1.2.1 색인(Indexing)

In [149]:
# Display df1 as the starting point for the indexing examples.
df1

Unnamed: 0,Name,State,Age,Gender
0,Tom,NY,45,M
1,Judy,CA,36,F
2,Fred,PA,21,M


In [150]:
# Row position 0, column position 0 in a single .iloc call. This avoids building
# an intermediate row Series and then doing a positional [0] lookup on its
# string labels.
df1.iloc[0, 0]

'Tom'

In [151]:
# Row label 0, column label 'Name' in a single .loc call. The chained form
# relied on an integer [0] being treated as a position on the string-labelled
# row Series.
df1.loc[0, 'Name']

'Tom'

In [152]:
# Scalar lookup by row label and column label.
df1.at[0,'Name']

'Tom'

In [153]:
# Scalar lookup by integer row position and column position.
df1.iat[0,0]

'Tom'

In [154]:
# All rows of the 'Age' column, selected with one .loc call instead of
# chaining a full-frame selection with a second [] lookup.
df1.loc[:, 'Age']

0    45
1    36
2    21
Name: Age, dtype: object

In [155]:
# Select a single column by label; the result is a Series.
df1['Age']

0    45
1    36
2    21
Name: Age, dtype: object

In [156]:
# Attribute-style access to the 'Age' column; gives the same Series as df1['Age'].
df1.Age

0    45
1    36
2    21
Name: Age, dtype: object

In [157]:
# Pick two columns, then take rows at positions 1 and 2.
df1[['Name', 'Gender']].iloc[1:3]

Unnamed: 0,Name,Gender
1,Judy,F
2,Fred,M


In [158]:
# Row at position 2 with all columns, kept as a one-row DataFrame. The leading
# full slice `[:]` only created a redundant intermediate frame.
df1.iloc[2:3]

Unnamed: 0,Name,State,Age,Gender
2,Fred,PA,21,M


In [159]:
# Select the row labelled 1; it comes back as a Series keyed by column name.
df1.loc[1]

Name      Judy
State       CA
Age         36
Gender       F
Name: 1, dtype: object

In [160]:
# Label slice 1..3 on the rows and the 'State' column in one .loc call,
# rather than chaining a row selection with a second [] lookup.
df1.loc[1:3, 'State']

1    CA
2    PA
Name: State, dtype: object

In [161]:
# Number of rows in the frame.
len(df1)

3

In [162]:
# (number of rows, number of columns)
df1.shape

(3, 4)

### 10.1.2.2 행(Row) 첨가하기

In [163]:
# Append one row by assigning to the next unused integer label, which equals the
# current row count while the index is the default 0..n-1.
df1.loc[df1.shape[0]] = ['Hellen', 'TX', 4, 'F']
df1

Unnamed: 0,Name,State,Age,Gender
0,Tom,NY,45,M
1,Judy,CA,36,F
2,Fred,PA,21,M
3,Hellen,TX,4,F


### 10.1.2.3 열(Column) 첨가하기

In [164]:
# Add a new column by assigning a list with one value per existing row.
df1['Major'] = ['Eng', 'Math', 'Physics', 'Arts']
df1

Unnamed: 0,Name,State,Age,Gender,Major
0,Tom,NY,45,M,Eng
1,Judy,CA,36,F,Math
2,Fred,PA,21,M,Physics
3,Hellen,TX,4,F,Arts


In [165]:
def adult(x):
    """Return '어른' when int(x) is greater than 18, otherwise '아이'."""
    return '어른' if int(x) > 18 else '아이'


# Age holds numeric strings alongside a plain int, so adult() converts each value
# with int(). Series.map applies it element-wise and keeps the row alignment.
df1['Adult'] = df1['Age'].map(adult)
df1

Unnamed: 0,Name,State,Age,Gender,Major,Adult
0,Tom,NY,45,M,Eng,어른
1,Judy,CA,36,F,Math,어른
2,Fred,PA,21,M,Physics,어른
3,Hellen,TX,4,F,Arts,아이


### 10.1.2.4 열(Column) 제거하기

In [166]:
# drop() returns a new frame without the 'Gender' column; df1 itself is untouched.
df1.drop(columns='Gender')

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
1,Judy,CA,36,Math,어른
2,Fred,PA,21,Physics,어른
3,Hellen,TX,4,Arts,아이


In [167]:
# df1 still has the 'Gender' column: drop() without inplace=True returned a new frame.
df1

Unnamed: 0,Name,State,Age,Gender,Major,Adult
0,Tom,NY,45,M,Eng,어른
1,Judy,CA,36,F,Math,어른
2,Fred,PA,21,M,Physics,어른
3,Hellen,TX,4,F,Arts,아이


In [168]:
# Keep the result of drop() under a new name to retain the column-less frame.
df_new = df1.drop(columns='Gender')
df_new

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
1,Judy,CA,36,Math,어른
2,Fred,PA,21,Physics,어른
3,Hellen,TX,4,Arts,아이


In [169]:
# inplace=True removes the 'Gender' column from df1 itself instead of returning a copy.
df1.drop(columns='Gender', inplace=True)
df1

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
1,Judy,CA,36,Math,어른
2,Fred,PA,21,Physics,어른
3,Hellen,TX,4,Arts,아이


### 10.1.2.5 행(Row) 제거하기

In [170]:
# Remove the row labelled 1 from df1 in place; the remaining labels are left as they were.
df1.drop(index=[1], inplace=True)
df1

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
2,Fred,PA,21,Physics,어른
3,Hellen,TX,4,Arts,아이


In [174]:
# Renumber the rows 0..n-1 in place; drop=True discards the old labels instead of
# keeping them as a column.
df1.reset_index(inplace=True, drop=True)
df1

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
1,Fred,PA,21,Physics,어른
2,Hellen,TX,4,Arts,아이


### 10.1.2.6 특정 행/열(들)만 골라내기(Reindexing)

In [191]:
# Work on an independent copy. A plain `df_ = df1` only binds a second name to the
# same object, so later in-place edits of df1 would silently show up in df_ too.
df_ = df1.copy()

In [192]:
# Build a new frame restricted to row labels 0 and 2 and the columns 'Name' and 'Major'.
df_.reindex(index=[0,2], columns=['Name','Major'])

Unnamed: 0,Name,Major
0,Tom,Eng
2,Hellen,Arts


In [193]:
# Display df1 again after the reindex() call above.
df1

Unnamed: 0,Name,State,Age,Major,Adult
0,Tom,NY,45,Eng,어른
1,Fred,PA,21,Physics,어른
2,Hellen,TX,4,Arts,아이


### 10.1.2.7 열이름(Column Label) 변경하기

In [194]:
# Old column label -> new column label; columns not listed keep their names.
newcols = dict(Age='age', Major='major')
df1.rename(columns=newcols, inplace=True)

In [195]:
# Display df1 with the renamed columns.
df1

Unnamed: 0,Name,State,age,major,Adult
0,Tom,NY,45,Eng,어른
1,Fred,PA,21,Physics,어른
2,Hellen,TX,4,Arts,아이


### 10.1.2.8 문자열 변경하기

In [196]:
# Replace matching values anywhere in the frame, in place: 'TX' -> 'Tex', 'PA' -> 'Pen'.
df1.replace({'TX': 'Tex', 'PA': 'Pen'}, inplace=True)
df1

Unnamed: 0,Name,State,age,major,Adult
0,Tom,NY,45,Eng,어른
1,Fred,Pen,21,Physics,어른
2,Hellen,Tex,4,Arts,아이



### 10.1.2.9 숫자 변경하기

In [197]:
# Convert the 'age' column to a numeric float dtype and double every value.
# Re-running this cell doubles the values again.
df1['age'] = pd.to_numeric(df1['age'], downcast='float').mul(2)
df1

Unnamed: 0,Name,State,age,major,Adult
0,Tom,NY,90.0,Eng,어른
1,Fred,Pen,42.0,Physics,어른
2,Hellen,Tex,8.0,Arts,아이


### 10.1.2.10 행/열의 배열순서 변경하기

In [198]:
# Reverse both the row order and the column order by reindexing against the
# reversed label sequences.
df1 = df1.reindex(index=df1.index[::-1], columns=df1.columns[::-1])

In [199]:
# Display df1 with rows and columns in reversed order.
df1

Unnamed: 0,Adult,major,age,State,Name
2,아이,Arts,8.0,Tex,Hellen
1,어른,Physics,42.0,Pen,Fred
0,어른,Eng,90.0,NY,Tom


### 10.1.2.11 (맨왼쪽) index 열과 (맨위) label 행에 이름 지정하기

In [200]:
# Name the row index (left-most header) and the column index (top header).
df1.index.name = 'Index'
df1.columns.name = 'Item'
df1

Item,Adult,major,age,State,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,아이,Arts,8.0,Tex,Hellen
1,어른,Physics,42.0,Pen,Fred
0,어른,Eng,90.0,NY,Tom


### 10.1.2.12 행과 열을 서로 맞바꾸기(Transpose)

In [201]:
# Swap rows and columns; the axis names travel with their axes.
df1T = df1.transpose()
df1T

Index,2,1,0
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adult,아이,어른,어른
major,Arts,Physics,Eng
age,8.0,42.0,90.0
State,Tex,Pen,NY
Name,Hellen,Fred,Tom


### 10.1.2.13 Column Label 앞/뒤에 문자열 덧붙이기(Prefix/Suffix)

In [202]:
# Returns a new frame whose column labels are prefixed with 'col_'.
df1T.add_prefix('col_')

Index,col_2,col_1,col_0
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adult,아이,어른,어른
major,Arts,Physics,Eng
age,8.0,42.0,90.0
State,Tex,Pen,NY
Name,Hellen,Fred,Tom


In [204]:
# Returns a new frame whose column labels are suffixed with '_col'.
df1T.add_suffix('_col')

Index,2_col,1_col,0_col
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adult,아이,어른,어른
major,Arts,Physics,Eng
age,8.0,42.0,90.0
State,Tex,Pen,NY
Name,Hellen,Fred,Tom


## 10.1.3 계층적(Hierarchical) Indexing

In [211]:
# Two parallel label arrays form a two-level row index: an outer letter and an
# inner number.
df2 = pd.DataFrame(
    {
        'x': [91, 33, 45, 27, 68, 54],
        'y': [15, 24, 46, 38, 57, 79],
    },
    index=pd.MultiIndex.from_arrays([
        ['A', 'A', 'A', 'B', 'B', 'C'],
        [1, 2, 3, 1, 2, 1],
    ]),
)

In [212]:
# Display df2 with its two-level row index.
df2

Unnamed: 0,Unnamed: 1,x,y
A,1,91,15
A,2,33,24
A,3,45,46
B,1,27,38
B,2,68,57
C,1,54,79


In [213]:
# Column 'y' for every row under outer label 'A'; the result is indexed by the inner level.
df2['y'].loc['A']

1    15
2    24
3    46
Name: y, dtype: int64

In [214]:
# Address one row with an explicit (outer, inner) tuple and pick column 'y' in the
# same .loc call. The bare `'A', 2` form reads like a row/column pair, and it was
# followed by a second chained lookup.
df2.loc[('A', 2), 'y']

24

In [215]:
# Columns in the order y, x for the rows under outer label 'B'.
df2[['y', 'x']].loc['B']

Unnamed: 0,y,x
1,38,27
2,57,68


In [216]:
# Column 'y' for outer labels 'B' then 'A', in that order; both index levels are kept.
df2['y'].loc[['B', 'A']]

B  1    38
   2    57
A  1    15
   2    24
   3    46
Name: y, dtype: int64

In [217]:
# 4x4 integer grid used as the values of the two-way hierarchical frame below.
data44 = np.array([
    [1, 4, 2, 7],
    [3, 5, 8, 4],
    [8, 3, 6, 1],
    [7, 9, 6, 2],
])

In [218]:
# Hierarchical labels on both axes: (store, number) for rows and
# (flower, colour) for columns.
df22 = pd.DataFrame(
    data44,
    index=pd.MultiIndex.from_arrays([
        ['Store1', 'Store1', 'Store2', 'Store2'],
        [1, 2, 1, 2],
    ]),
    columns=pd.MultiIndex.from_arrays([
        ['Rose', 'Rose', 'Lily', 'Lily'],
        ['Red', 'White', 'White', 'Pink'],
    ]),
)

In [219]:
# Display df22 with two-level labels on both rows and columns.
df22

Unnamed: 0_level_0,Unnamed: 1_level_0,Rose,Rose,Lily,Lily
Unnamed: 0_level_1,Unnamed: 1_level_1,Red,White,White,Pink
Store1,1,1,4,2,7
Store1,2,3,5,8,4
Store2,1,8,3,6,1
Store2,2,7,9,6,2


In [220]:
# Name the two row-index levels.
df22.index.set_names(['Store', 'Size'], inplace=True)

In [221]:
# Name the two column-index levels, then show the fully labelled frame.
df22.columns.set_names(['Flower', 'Color'], inplace=True)
df22

Unnamed: 0_level_0,Flower,Rose,Rose,Lily,Lily
Unnamed: 0_level_1,Color,Red,White,White,Pink
Store,Size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Store1,1,1,4,2,7
Store1,2,3,5,8,4
Store2,1,8,3,6,1
Store2,2,7,9,6,2
