## 데이터프레임 기본 연산

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a 5x3 frame of standard-normal samples with columns a, b, c,
# then display it as the cell result.
df = pd.DataFrame(data=np.random.randn(5, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0.5768,1.016366,0.824414
1,-1.135526,0.120098,-0.121302
2,0.913974,0.646753,-0.611712
3,-0.859664,0.060096,1.556821
4,-2.174075,1.359719,0.823174


## 열 하나에 대하여 자주 사용되는 연산

In [3]:
df['a']+2

0    2.576800
1    0.864474
2    2.913974
3    1.140336
4   -0.174075
Name: a, dtype: float64

In [4]:
df['a']*2

0    1.153600
1   -2.271052
2    1.827948
3   -1.719327
4   -4.348149
Name: a, dtype: float64

In [5]:
df['a']

0    0.576800
1   -1.135526
2    0.913974
3   -0.859664
4   -2.174075
Name: a, dtype: float64

In [6]:
max(df['a'])

0.9139739854650593

In [7]:
df['a'].max()

0.9139739854650593

In [8]:
df['a'].min()

-2.174074623941147

In [9]:
np.mean(df['a'])

-0.5356980661051673

In [10]:
df['a'].mean()

-0.5356980661051673

In [11]:
df['a'].std()

1.273599878390411

In [12]:
## 열단위 연산
df['a']=df['a']*2
df

Unnamed: 0,a,b,c
0,1.1536,1.016366,0.824414
1,-2.271052,0.120098,-0.121302
2,1.827948,0.646753,-0.611712
3,-1.719327,0.060096,1.556821
4,-4.348149,1.359719,0.823174


In [13]:
df['d']=df['a']+df['b']
df

Unnamed: 0,a,b,c,d
0,1.1536,1.016366,0.824414,2.169966
1,-2.271052,0.120098,-0.121302,-2.150954
2,1.827948,0.646753,-0.611712,2.474701
3,-1.719327,0.060096,1.556821,-1.659231
4,-4.348149,1.359719,0.823174,-2.98843


In [14]:
df['e']=df['a']/df['c']
df

Unnamed: 0,a,b,c,d,e
0,1.1536,1.016366,0.824414,2.169966,1.399297
1,-2.271052,0.120098,-0.121302,-2.150954,18.722371
2,1.827948,0.646753,-0.611712,2.474701,-2.988247
3,-1.719327,0.060096,1.556821,-1.659231,-1.104384
4,-4.348149,1.359719,0.823174,-2.98843,-5.282174


## 행이나 열에 함수 적용하기 

### apply: 여러개의 값 input -> 하나의 값 output (행, 열단위)

In [15]:
df['a']

0    1.153600
1   -2.271052
2    1.827948
3   -1.719327
4   -4.348149
Name: a, dtype: float64

In [16]:
np.mean(df['a'])

-1.0713961322103347

In [17]:
# 모든 열의 평균을 구하고 싶다면 어떻게 해야할까?
# apply: 모든 열이나 행단위로 지정된 연산을 수행하는 함수

In [18]:
df.apply(np.mean)  # default axis=0: np.mean is applied to each column, giving one mean per column

a   -1.071396
b    0.640606
c    0.494279
d   -0.430790
e    2.149373
dtype: float64

In [19]:
df.apply(np.mean,axis=1)  # axis=1: np.mean is applied to each row, giving one mean per row

0    1.312729
1    2.859832
2    0.269889
3   -0.573205
4   -2.087172
dtype: float64

In [20]:
np.max(df['a'])

1.8279479709301185

In [21]:
df.apply(np.max)

a     1.827948
b     1.359719
c     1.556821
d     2.474701
e    18.722371
dtype: float64

In [22]:
df.apply(np.max,axis=1)

0     2.169966
1    18.722371
2     2.474701
3     1.556821
4     1.359719
dtype: float64

### apply 사용자 함수와 함께 사용하기

In [23]:
def diff(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs ``max()`` and ``min()`` methods (for example a pandas
    Series or a NumPy array).
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

In [24]:
df.apply(diff)

a     6.176097
b     1.299623
c     2.168533
d     5.463131
e    24.004545
dtype: float64

In [25]:
#각 열별로 0보다 큰 수의 개수를 구하여라
#각 행별로 0보다 작은 수의 개수를 구하여라

### agg (aggregation) 함수

In [26]:
# Aggregate every column with several reducers at once; the result has one
# row per reducer (mean, sum, std) and one column per original column.
# Reducers are given by name because pandas 2.1+ warns when NumPy callables
# such as np.mean/np.std are passed to agg; the string 'std' keeps the
# sample standard deviation (ddof=1) that the recorded output below shows.
# DataFrame.agg also takes an `axis` argument (default 0: one result per column).
df.agg(['mean', 'sum', 'std'])

Unnamed: 0,a,b,c,d,e
mean,-1.071396,0.640606,0.494279,-0.43079,2.149373
sum,-5.356981,3.203032,2.471395,-2.153948,10.746864
std,2.5472,0.562645,0.858533,2.560047,9.584321


### describe 함수

In [27]:
df.describe()

Unnamed: 0,a,b,c,d,e
count,5.0,5.0,5.0,5.0,5.0
mean,-1.071396,0.640606,0.494279,-0.43079,2.149373
std,2.5472,0.562645,0.858533,2.560047,9.584321
min,-4.348149,0.060096,-0.611712,-2.98843,-5.282174
25%,-2.271052,0.120098,-0.121302,-2.150954,-2.988247
50%,-1.719327,0.646753,0.823174,-1.659231,-1.104384
75%,1.1536,1.016366,0.824414,2.169966,1.399297
max,1.827948,1.359719,1.556821,2.474701,18.722371


### transform: 여러개의 값 -> 여러개의 값으로 변환 (행, 열단위)

In [28]:
np.abs([-1,-2])

array([1, 2])

In [29]:
df.transform(np.abs)

Unnamed: 0,a,b,c,d,e
0,1.1536,1.016366,0.824414,2.169966,1.399297
1,2.271052,0.120098,0.121302,2.150954,18.722371
2,1.827948,0.646753,0.611712,2.474701,2.988247
3,1.719327,0.060096,1.556821,1.659231,1.104384
4,4.348149,1.359719,0.823174,2.98843,5.282174


In [30]:
def pos_neg(x):
    """Map each element of ``x`` to its sign flag, returned as a list.

    Elements strictly greater than 0 become 1; everything else, including
    0 itself, becomes -1.
    """
    return [1 if value > 0 else -1 for value in x]

pos_neg([-3.2,-2,1,1,-5])

[-1, -1, 1, 1, -1]

In [31]:
df.transform(pos_neg)

Unnamed: 0,a,b,c,d,e
0,1,1,1,1,1
1,-1,1,-1,-1,1
2,1,1,-1,1,-1
3,-1,1,1,-1,-1
4,-1,1,1,-1,-1


In [32]:
df

Unnamed: 0,a,b,c,d,e
0,1.1536,1.016366,0.824414,2.169966,1.399297
1,-2.271052,0.120098,-0.121302,-2.150954,18.722371
2,1.827948,0.646753,-0.611712,2.474701,-2.988247
3,-1.719327,0.060096,1.556821,-1.659231,-1.104384
4,-4.348149,1.359719,0.823174,-2.98843,-5.282174


### 특수함수(유용한)

In [44]:
## 이산변수에서 값의 빈도수 세기

In [39]:
# Small frame of categorical (string) values used to demonstrate frequency counts.
df2 = pd.DataFrame({'a': list('aabca'), 'b': list('fffmm')})

In [42]:
df2['a'].value_counts()

a    3
c    1
b    1
Name: a, dtype: int64