# 4. Series와 DF 객체를 가지고 자조 만나게 될 기본 task들

## 4.1. Reindex

- pd.index는 immurable하여 안정적이고 쉽게 변경하지 못하게 만들어놓았는데, 순서는 바꿀 수 있음.


In [4]:
import numpy as np 
import pandas as pd

sr1 = pd.Series([91, 100, 93, 94], index = list('abcd'))
sr1

a     91
b    100
c     93
d     94
dtype: int64

In [5]:
sr1.reindex(['Kim','a','n','c','d'])

Kim     NaN
a      91.0
n       NaN
c      93.0
d      94.0
dtype: float64

In [6]:
sr1.reindex(['kim','a','n','c','d'], fill_value = 0)

kim     0
a      91
n       0
c      93
d      94
dtype: int64

In [9]:
sr1.reindex(['kim','d','c','n','a'], fill_value = 0)

kim     0
d      94
c      93
n       0
a      91
dtype: int64

In [11]:
# DF에서 reindex 시용하기

df1 = pd.DataFrame(np.arange(1, 13).reshape(3, -1), index = ['a','b','c'], columns = ['score1','score2','score3','score4'])

df1

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
b,5,6,7,8
c,9,10,11,12


In [12]:
df1.reindex(index = ['c','a'], columns = ['score1','score5','score4'])

Unnamed: 0,score1,score5,score4
c,9.0,,12.0
a,1.0,,4.0


In [14]:
df1.reindex(index= ['a','v','b'], columns = ['score1','score5'], fill_value = 0)

Unnamed: 0,score1,score5
a,1,0
v,0,0
b,5,0


# 4. 2. Dropping 관측치 or 변수

In [20]:
sr1 = pd.Series([91, 100, 93, 94], index = ['a','b','c','d'])
sr1

a     91
b    100
c     93
d     94
dtype: int64

In [19]:
sr1.drop('c')

a     91
b    100
d     94
dtype: int64

In [21]:
df1= pd.DataFrame(np.arange(1, 13).reshape(3, 4), index = ['a','b','c'], columns = ['score1','score2','score3','score4'])
df1

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
b,5,6,7,8
c,9,10,11,12


In [22]:
df1.drop('b')

Unnamed: 0,score1,score2,score3,score4
a,1,2,3,4
c,9,10,11,12


In [24]:
df1.drop(['score1','score2'], axis = 1)

Unnamed: 0,score3,score4
a,3,4
b,7,8
c,11,12


## 4. 3. 함수 적용

In [27]:
df1 = pd.DataFrame(np.random.randn(9).reshape(3,3), index = ['stu1','stu2','str3'], columns = ['exam1','exam2','exam3'])
df1

Unnamed: 0,exam1,exam2,exam3
stu1,0.315438,-1.961823,0.393417
stu2,-1.780788,1.026751,-1.231888
str3,-0.701099,-0.603812,1.19713


### DF 에서는 observation에 대한 함수 적용, variable에 대한 함수 적용도 흔하다.

### lambda 함수
- functional programming 문법
- functional : 인수에 함수를 넣어서 값을 뱉은 함수



In [28]:
def f_length(i):
    return max(i) - min(i)

df1.apply(f_length)

exam1    2.096226
exam2    2.988574
exam3    2.429018
dtype: float64

In [29]:
df1.apply(lambda x : max(x) - min(x))

exam1    2.096226
exam2    2.988574
exam3    2.429018
dtype: float64

In [31]:
df1.apply(lambda x: f_length(x), axis = 0)

exam1    2.096226
exam2    2.988574
exam3    2.429018
dtype: float64

In [32]:
df1.apply(lambda x: f_length(x), axis = 1)

stu1    2.355240
stu2    2.807538
str3    1.898229
dtype: float64

- 각 element에 대한 함수 적용


In [33]:
f_double = lambda x: 2 * 2

In [34]:
df1.applymap(f_double)

Unnamed: 0,exam1,exam2,exam3
stu1,4,4,4
stu2,4,4,4
str3,4,4,4


In [40]:
f_format= lambda x: '{0:.2f}'.format(x)


In [41]:
df1.applymap(f_format)

Unnamed: 0,exam1,exam2,exam3
stu1,0.32,-1.96,0.39
stu2,-1.78,1.03,-1.23
str3,-0.7,-0.6,1.2


## 4.4 Sorting and ranking

In [44]:
sr1 = pd.Series([91, 100, 93, 94], index = ['b','d','c','a'])
sr1

b     91
d    100
c     93
a     94
dtype: int64

In [45]:
sr1.sort_index()

a     94
b     91
c     93
d    100
dtype: int64

In [46]:
sr1.sort_values(0)

b     91
c     93
a     94
d    100
dtype: int64

In [47]:
sr1.rank()

b    1.0
d    4.0
c    2.0
a    3.0
dtype: float64

In [50]:
sr1.rank(ascending = False)

b    4.0
d    1.0
c    3.0
a    2.0
dtype: float64

In [64]:
df1= pd.DataFrame(np.random.randn(9).reshape(3,3), index = ['stu3','stu2','stu1'], columns = ['exam3','exam2','exam1'])

In [65]:
df1.sort_index()

Unnamed: 0,exam3,exam2,exam1
stu1,-1.349892,-1.014351,2.216278
stu2,0.724429,-0.673564,-0.547721
stu3,-1.156014,0.204293,-1.19844


In [66]:

df1.sort_index(axis = 1)

Unnamed: 0,exam1,exam2,exam3
stu3,-1.19844,0.204293,-1.156014
stu2,-0.547721,-0.673564,0.724429
stu1,2.216278,-1.014351,-1.349892


In [67]:

df1.sort_index(axis = 1, ascending = False)

Unnamed: 0,exam3,exam2,exam1
stu3,-1.156014,0.204293,-1.19844
stu2,0.724429,-0.673564,-0.547721
stu1,-1.349892,-1.014351,2.216278


In [68]:
df1.sort_values(by='exam1')

Unnamed: 0,exam3,exam2,exam1
stu3,-1.156014,0.204293,-1.19844
stu2,0.724429,-0.673564,-0.547721
stu1,-1.349892,-1.014351,2.216278


In [69]:
df1.sort_values(by= ['exam1','exam2'])

Unnamed: 0,exam3,exam2,exam1
stu3,-1.156014,0.204293,-1.19844
stu2,0.724429,-0.673564,-0.547721
stu1,-1.349892,-1.014351,2.216278


In [70]:
df1.rank()

Unnamed: 0,exam3,exam2,exam1
stu3,2.0,3.0,1.0
stu2,3.0,2.0,2.0
stu1,1.0,1.0,3.0


In [71]:
df1.rank(axis = 1)

Unnamed: 0,exam3,exam2,exam1
stu3,2.0,3.0,1.0
stu2,3.0,1.0,2.0
stu1,1.0,2.0,3.0


## 4.5 Summary statistics

In [74]:
df1=pd.DataFrame({'major': ['stat','stat','cs','cs','math'],
                  'score1': np.arange(100, 90, -2),
                  'score2': np.arange(90, 100, 2)},
                  index = ['lee','kin','park','cho','song'])
df1

Unnamed: 0,major,score1,score2
lee,stat,100,90
kin,stat,98,92
park,cs,96,94
cho,cs,94,96
song,math,92,98


In [75]:
df1.iloc[:, 1:3]

Unnamed: 0,score1,score2
lee,100,90
kin,98,92
park,96,94
cho,94,96
song,92,98


In [77]:
_.mean(axis = 0)

score1    96.0
score2    94.0
dtype: float64

In [79]:
df1.iloc[:, 1:3].idxmax()

score1     lee
score2    song
dtype: object

In [80]:
df1.describe()

Unnamed: 0,score1,score2
count,5.0,5.0
mean,96.0,94.0
std,3.162278,3.162278
min,92.0,90.0
25%,94.0,92.0
50%,96.0,94.0
75%,98.0,96.0
max,100.0,98.0


In [81]:
df1.major.value_counts()

cs      2
stat    2
math    1
Name: major, dtype: int64

In [82]:
df1.corr()

Unnamed: 0,score1,score2
score1,1.0,-1.0
score2,-1.0,1.0



## 4. 6. filter/fill NA

### 4. 6. 1 filter NA

In [83]:
sr1 = pd.Series([np.nan, 2., np.nan, 4])
sr1

0    NaN
1    2.0
2    NaN
3    4.0
dtype: float64

In [84]:
sr1.dropna()

1    2.0
3    4.0
dtype: float64

In [86]:

sr1[sr1.notnull()]

1    2.0
3    4.0
dtype: float64

In [87]:
df1 = pd.DataFrame([[1, 2, 3, 4],
                   [np.nan, 2, np.nan, 4],
                   [np.nan, 3, 4, 5]])

In [89]:
df1.columns = ['exam1','exam2','exam3','exam4']
df1.index = ['stu1','stu2','stu3']

In [90]:
df1

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2,3.0,4
stu2,,2,,4
stu3,,3,4.0,5


In [91]:
df1.dropna()

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2,3.0,4


In [93]:
df1.dropna(axis = 1)

Unnamed: 0,exam2,exam4
stu1,2,4
stu2,2,4
stu3,3,5


In [97]:
df2 = pd.DataFrame([np.nan, np.nan, np.nan, np.nan]).T
df2.columns = ['exam1','exam2','exam3','exam4']
df2.index =['stu4']

In [95]:
df2

Unnamed: 0,exam1,exam2,exam3,exam4
stu4,,,,


In [100]:
df3 = pd.concat((df1, df2))
df3

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,,2.0,,4.0
stu3,,3.0,4.0,5.0
stu4,,,,


In [102]:
df3.dropna(how = 'all')

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,,2.0,,4.0
stu3,,3.0,4.0,5.0


In [103]:
df2.dropna(how = 'any')

Unnamed: 0,exam1,exam2,exam3,exam4


In [106]:
df3.dropna(how = 'all', axis = 1)

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,,2.0,,4.0
stu3,,3.0,4.0,5.0
stu4,,,,


### 4. 6. 2. fill NA

In [107]:
df3

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,,2.0,,4.0
stu3,,3.0,4.0,5.0
stu4,,,,


In [108]:
df3.fillna(0)

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,0.0,2.0,0.0,4.0
stu3,0.0,3.0,4.0,5.0
stu4,0.0,0.0,0.0,0.0


In [111]:
# 평균값으로 채우기
df3.fillna({'exam1': df3.exam1.mean(),
            'exam2': df3.exam2.mean(),
            'exam3': df3.exam3.mean(),
            'exam4': df3.exam4.mean()})

Unnamed: 0,exam1,exam2,exam3,exam4
stu1,1.0,2.0,3.0,4.0
stu2,1.0,2.0,3.5,4.0
stu3,1.0,3.0,4.0,5.0
stu4,1.0,2.333333,3.5,4.333333
