# 4. Series와 DF객체를 가지고 자주 만나게 될 기본 task들


## 4.1. Reindex

- index 순서를 바꿀 때 사용
- index 순서에 따라 값이 자동으로 바뀜

In [14]:
import pandas as pd
import numpy as np

# Four integer scores labelled 'a' through 'd'.
sr1 = pd.Series(data=[91, 100, 93, 94], index=['a', 'b', 'c', 'd'])
sr1

a     91
b    100
c     93
d     94
dtype: int64

In [9]:

# Reindex with a label ('kim') that sr1 does not have: that entry becomes NaN,
# and the result is float64 instead of int64.
sr1.reindex(['kim','a','b','c','d'])

kim      NaN
a       91.0
b      100.0
c       93.0
d       94.0
dtype: float64

In [11]:
# Fill the missing entries with a given value instead of NaN
sr1.reindex(['kim','a','b','c','d'], fill_value = 0)

kim      0
a       91
b      100
c       93
d       94
dtype: int64

In [12]:
# The new index may also reorder existing labels; each value follows its label.
sr1.reindex(['kim','c','b','a','d'], fill_value = 0)

kim      0
c       93
b      100
a       91
d       94
dtype: int64

In [18]:
# 3x4 grid holding the integers 1..12, filled row by row.
mat1 = np.arange(1, 13).reshape(3, -1)

# Wrap the grid in a DataFrame: rows 'a'..'c', columns 'score1'..'score4'.
df1 = pd.DataFrame(
    mat1,
    index=list('abc'),
    columns=['score{}'.format(k) for k in range(1, 5)],
)
df1

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
b,5,6,7,8
c,9,10,11,12


In [21]:
# Select rows 'c' and 'a' (in that order) and three columns. 'score5' does not exist
# in df1; it would come back as NaN, but fill_value=0 fills it with 0 here.
df1.reindex(index = ['c','a'], columns = ['score1','score5','score4'],fill_value = 0)

Unnamed: 0,score1,score5,score4
c,9,0,12
a,1,0,4


## 4.2. Dropping 관측치 or 변수

In [22]:
# Scores whose labels are in the order a, d, c, b (not alphabetical).
sr1 = pd.Series(data=[91, 10, 93, 94], index=list('adcb'))
sr1

a    91
d    10
c    93
b    94
dtype: int64

In [23]:

df1

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
b,5,6,7,8
c,9,10,11,12


In [25]:
# Remove the entry with a specific index label
sr1.drop('c')

a    91
d    10
b    94
dtype: int64

In [27]:
sr1  # the original Series is unchanged: drop() returned a new Series

a    91
d    10
c    93
b    94
dtype: int64

In [29]:
# On a DataFrame, drop() removes rows by index label by default.
df1.drop('b')

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
c,9,10,11,12


In [32]:
# axis=1 drops columns instead of rows.
df1.drop(['score1','score3'], axis = 1)

Unnamed: 0,score2,score4
a,2,4
b,6,8
c,10,12


## 4.3. 함수 적용

In [35]:
# 3 students x 3 exams filled with standard-normal draws (values differ on every run).
df1 = pd.DataFrame(
    np.random.randn(3, 3),
    index=['stu{}'.format(n) for n in (1, 2, 3)],
    columns=['exam{}'.format(n) for n in (1, 2, 3)],
)


In [36]:
df1

Unnamed: 0,exam1,exam2,exam3
stu1,1.14311,-0.309301,-0.340844
stu2,0.00998,-1.14855,0.719829
stu3,-0.842237,0.726108,0.967067


In [37]:
# Lambda function: the spread (max minus min) of whatever it is given
f_length = lambda x: x.max() - x.min()

In [39]:
# Apply f_length to df1.
df1.apply(f_length)  # default is axis=0: f_length receives one column at a time

exam1    1.985347
exam2    1.874657
exam3    1.307911
dtype: float64

In [40]:
# To compute per student (one row at a time), pass axis=1
df1.apply(f_length, axis = 1)

stu1    1.483955
stu2    1.868379
stu3    1.809303
dtype: float64

In [41]:
# Any function that reduces a column to one value works, e.g. the column mean.
df1.apply(np.mean)

exam1    0.103618
exam2   -0.243914
exam3    0.448684
dtype: float64

In [42]:
# Python's built-in max also works: here it yields the largest value of each column.
df1.apply(max)

exam1    1.143110
exam2    0.726108
exam3    0.967067
dtype: float64

In [43]:
# Lambda that doubles its argument
f_double = lambda x: x * 2

In [44]:

# f_double returns a whole column, so the result is a DataFrame of the same shape.
df1.apply(f_double)

Unnamed: 0,exam1,exam2,exam3
stu1,2.286221,-0.618602,-0.681688
stu2,0.01996,-2.297099,1.439659
stu3,-1.684473,1.452215,1.934133


In [45]:
# applymap: every element of the DataFrame is passed to the function one at a time
# apply: the function is called once per row or per column
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favor of DataFrame.map —
# check the pandas version in use.
df1.applymap(f_double)

Unnamed: 0,exam1,exam2,exam3
stu1,2.286221,-0.618602,-0.681688
stu2,0.01996,-2.297099,1.439659
stu3,-1.684473,1.452215,1.934133


## 4.4. Sorting and ranking

In [46]:
sr1

a    91
d    10
c    93
b    94
dtype: int64

In [47]:
# Sort by index label (a, b, c, d); each value stays with its label.
sr1.sort_index()

a    91
b    94
c    93
d    10
dtype: int64

In [49]:
# Sort by value, ascending.
sr1.sort_values()

d    10
a    91
c    93
b    94
dtype: int64

In [51]:
# Rank of each value, 1 = smallest; ranks are returned as floats.
sr1.rank()

a    2.0
d    1.0
c    3.0
b    4.0
dtype: float64

In [53]:
# ascending=False ranks the largest value as 1.
sr1.rank(ascending = False)

a    3.0
d    4.0
c    2.0
b    1.0
dtype: float64

In [54]:
df1

Unnamed: 0,exam1,exam2,exam3
stu1,1.14311,-0.309301,-0.340844
stu2,0.00998,-1.14855,0.719829
stu3,-0.842237,0.726108,0.967067


In [55]:
# axis=1 sorts by column label (the columns are already in order, so nothing moves).
df1.sort_index(axis = 1)

Unnamed: 0,exam1,exam2,exam3
stu1,1.14311,-0.309301,-0.340844
stu2,0.00998,-1.14855,0.719829
stu3,-0.842237,0.726108,0.967067


In [56]:
# Same, but with the column labels in descending order.
df1.sort_index(axis = 1, ascending = False)

Unnamed: 0,exam3,exam2,exam1
stu1,-0.340844,-0.309301,1.14311
stu2,0.719829,-1.14855,0.00998
stu3,0.967067,0.726108,-0.842237


In [57]:
# Sort the rows by the values in column 'exam3' (ascending).
df1.sort_values(by='exam3')

Unnamed: 0,exam1,exam2,exam3
stu1,1.14311,-0.309301,-0.340844
stu2,0.00998,-1.14855,0.719829
stu3,-0.842237,0.726108,0.967067


In [60]:
# Sort by several columns: 'exam3' first, then 'exam1' to break ties
df1.sort_values(by=['exam3','exam1'])

Unnamed: 0,exam1,exam2,exam3
stu1,1.14311,-0.309301,-0.340844
stu2,0.00998,-1.14855,0.719829
stu3,-0.842237,0.726108,0.967067


## 4.5. Summary statistics

In [65]:
# Five rows: a major label plus two score columns.
# score1 counts down 100, 98, ..., 92 and score2 counts up 90, 92, ..., 98.
df1 = pd.DataFrame({
    'major': ['stat'] * 2 + ['CS'] * 2 + ['math'],
    'score1': np.arange(100, 90, -2),
    'score2': np.arange(90, 100, 2),
})
# NOTE(review): `index` is assigned as a separate variable and is never passed to
# pd.DataFrame, so df1 keeps the default 0..4 index. If these names were meant to be
# the row labels, pass index=index to the constructor — confirm the intent.
index = 'lee kim park cho song'.split()

In [66]:
df1

Unnamed: 0,major,score1,score2
0,stat,100,90
1,stat,98,92
2,CS,96,94
3,CS,94,96
4,math,92,98


In [67]:
# Positional selection: all rows, columns at positions 1 and 2 (score1 and score2).
df1.iloc[:, 1:3]

Unnamed: 0,score1,score2
0,100,90
1,98,92
2,96,94
3,94,96
4,92,98


In [68]:
# axis=0 averages down the rows, giving one mean per column.
df1.iloc[:, 1:3].mean(axis = 0)

score1    96.0
score2    94.0
dtype: float64

In [70]:
df1.iloc[:, 1:3].idxmax()  # index label of the largest value in each column

score1    0
score2    4
dtype: int64

In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df1.describe()

Unnamed: 0,score1,score2
count,5.0,5.0
mean,96.0,94.0
std,3.162278,3.162278
min,92.0,90.0
25%,94.0,92.0
50%,96.0,94.0
75%,98.0,96.0
max,100.0,98.0


In [74]:
# One all-NaN row labelled 'stu4' with four exam columns.
df2 = pd.DataFrame(
    np.nan,
    index=['stu4'],
    columns=['exam1', 'exam2', 'exam3', 'exam4'],
)

# Stack df2 under df1 (axis=0).
# NOTE(review): the captured output below is a pandas warning that concat will stop
# sorting by default; passing sort=True or sort=False explicitly would pin the column
# order — confirm which is intended.
df3 = pd.concat((df1, df2), axis=0)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """
