## Pandas 기초 문법

In [4]:
import pandas as pd
# Read the tab-separated file ('\t' delimiter) into a DataFrame.
df = pd.read_csv('./data/gapminder.tsv', sep = '\t')

In [5]:
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


### Groupby

In [7]:
# Group the rows by year, pick one column, and compute a statistic per group.
# The column is selected with brackets because `pop` is also the name of a
# DataFrame method, so attribute-style access (`.pop`) is easy to misread and
# fragile for this particular column name.
df.groupby('year')['pop'].sum()

year
1952    2406957150
1957    2664404580
1962    2899782974
1967    3217478384
1972    3576977158
1977    3930045807
1982    4289436840
1987    4691477418
1992    5110710260
1997    5515204472
2002    5886977579
2007    6251013179
Name: pop, dtype: int64

In [8]:
# Two or more columns: pass a list of column names, then average each per year.
df.groupby('year')[['lifeExp','pop']].mean()

Unnamed: 0_level_0,lifeExp,pop
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,49.05762,16950400.0
1957,51.507401,18763410.0
1962,53.609249,20421010.0
1967,55.67829,22658300.0
1972,57.647386,25189980.0
1977,59.570157,27676380.0
1982,61.533197,30207300.0
1987,63.212613,33038570.0
1992,64.160338,35990920.0
1997,65.014676,38839470.0


- df.groupby('year').lifeExp.agg(내가 원하는 함수)

In [9]:
def my_Avg(values):
    """Return the arithmetic mean of *values*.

    Args:
        values: A collection of numbers that supports both ``len()`` and
            iteration (for example a list, tuple, or range).

    Returns:
        The sum of the items divided by the number of items.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    count = len(values)
    # Built-in sum replaces the hand-written accumulation loop.
    return sum(values) / count

In [10]:
# Apply the user-defined aggregation function to lifeExp within each year.
df.groupby('year')['lifeExp'].agg(my_Avg)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

- numpy 연산자 사용

In [13]:
import numpy as np
# Several aggregations at once; each entry becomes one output column.
# 'mean' and 'std' are given by name: recent pandas versions deprecate passing
# np.mean / np.std to agg, and the string names keep the same column labels
# while always using the pandas implementations.
# np.count_nonzero has no string alias, so it is still passed as a callable.
df.groupby('year').lifeExp.agg(['mean', 'std', np.count_nonzero])

Unnamed: 0_level_0,mean,std,count_nonzero
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,12.225956,142
1957,51.507401,12.231286,142
1962,53.609249,12.097245,142
1967,55.67829,11.718858,142
1972,57.647386,11.381953,142
1977,59.570157,11.227229,142
1982,61.533197,10.770618,142
1987,63.212613,10.556285,142
1992,64.160338,11.22738,142
1997,65.014676,11.559439,142


### Merge
- 데이터를 합치는 경우, 테이블을 병합시키고 정리해서 원하는 데이터셋 만드는 경우 사용
- merge, concat 

In [14]:
# Two small demo tables that share the same 'class' labels.
# '인원' is a head count per class; '벌점평균' is an average penalty score.
test1 = pd.DataFrame(
    {
        'class': ['파문기', '데분기', '데분중', '데분고'],
        '인원': [100, 200, 300, 40],
    }
)
test2 = pd.DataFrame(
    {
        'class': ['파문기', '데분기', '데분중', '데분고'],
        '벌점평균': [5, 6, 7, 3],
    }
)

In [15]:
# Print both demo tables, one after the other, to inspect their contents.
print(test1)
print(test2)

  class   인원
0   파문기  100
1   데분기  200
2   데분중  300
3   데분고   40
  class  벌점평균
0   파문기     5
1   데분기     6
2   데분중     7
3   데분고     3


In [16]:
# Merge on the shared 'class' key. With how='left' every row of test1 is kept
# and the matching test2 columns are attached to it.
test1.merge(test2, how='left', on='class')

Unnamed: 0,class,인원,벌점평균
0,파문기,100,5
1,데분기,200,6
2,데분중,300,7
3,데분고,40,3


### Concat
- 덩어리 + 덩어리
- 공통의 키를 잡는 개념이 아님, 지정 안함

In [17]:
# Row direction: stack test2 underneath test1.
sp_1 = pd.concat([test1, test2], axis='index')

In [18]:
sp_1

Unnamed: 0,class,인원,벌점평균
0,파문기,100.0,
1,데분기,200.0,
2,데분중,300.0,
3,데분고,40.0,
0,파문기,,5.0
1,데분기,,6.0
2,데분중,,7.0
3,데분고,,3.0


In [20]:
# Column direction: place test2 to the right of test1.
sp_2 = pd.concat([test1, test2], axis='columns')

In [21]:
sp_2

Unnamed: 0,class,인원,class.1,벌점평균
0,파문기,100,파문기,5
1,데분기,200,데분기,6
2,데분중,300,데분중,7
3,데분고,40,데분고,3


### 데이터를 접근하는 방법(loc, iloc)

- loc : 인덱스를 기준으로 데이터를 접근하는 방식, 컬럼의 이름 그대로 가지고 올 수 있다.
- iloc : 행의 순서에 따라 데이터를 접근하는 방식, 컬럼을 숫자로 대체해서 가지고 와야 한다.

In [23]:
# Keep only the first 100 rows as a smaller working sample.
df_sp = df[:100]

In [26]:
df_sp

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
95,Bahrain,Asia,2007,75.635,708573,29796.048340
96,Bangladesh,Asia,1952,37.484,46886859,684.244172
97,Bangladesh,Asia,1957,39.348,51365468,661.637458
98,Bangladesh,Asia,1962,41.216,56839289,686.341554


In [27]:
df_sp.iloc[1] # show the second row (position 1, counting from 0)

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object

In [28]:
# Label-based slice; .loc slices include both endpoints.
# NOTE(review): df_sp was cut with df[0:100], so presumably its labels stop at
# 99 and this returns every row of df_sp — confirm.
df_sp.loc[0:100]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
95,Bahrain,Asia,2007,75.635,708573,29796.048340
96,Bangladesh,Asia,1952,37.484,46886859,684.244172
97,Bangladesh,Asia,1957,39.348,51365468,661.637458
98,Bangladesh,Asia,1962,41.216,56839289,686.341554


In [29]:
# Single value looked up by row label 0 and column name 'year'.
df_sp.loc[0,'year']

1952

In [30]:
# Single value looked up by position: first row, last column.
df_sp.iloc[0,-1]

779.4453145

### Series 연산
- 통계치는 당연히 다 가능하다( min,max..mean,median,std)
- append( 연결도 가능 ) — pandas 2.0부터 Series.append는 제거되었으므로 pd.concat([s1, s2])를 사용
- describe()
- drop_duplicates() 중복 제거하기
- get_values() 시리즈의 값 구하기 (pandas 1.0에서 제거됨, 현재는 to_numpy() 또는 .values 사용)
- sort_values()
- to_frame()

In [33]:
# A Series built from a string and an integer together; the output shown
# for this cell reports dtype object.
pd.Series(['파문기',10])

0    파문기
1     10
dtype: object

In [35]:
# Selecting a single column returns a Series.
df_se = df['pop']

In [36]:
df_se

0        8425333
1        9240934
2       10267083
3       11537966
4       13079460
          ...   
1699     9216418
1700    10704340
1701    11404948
1702    11926563
1703    12311143
Name: pop, Length: 1704, dtype: int64

### 데이터를 깔끔하게 만드는 법 melt
- 기준을 잡는 설정 
- id_vars 위치를 그대로 유지할 열의 이름 지정
- value_vars 행으로 위치를 변경할 열의 이름
- var_name: value_vars로 지정한 열의 이름들이 들어갈 새 열의 이름 지정 (기본값 'variable')
- value_name: value_vars로 지정한 열의 값들이 들어갈 새 열의 이름 지정 (기본값 'value')

In [37]:
# Original note (translated): "statistics office data".
# NOTE(review): the file name suggests a Pew survey data set — confirm the source.
pew = pd.read_csv('./data/pew.csv')

In [38]:
pew

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116
5,Evangelical Prot,575,869,1064,982,881,1486,949,723,414,1529
6,Hindu,1,9,7,9,11,34,47,48,54,37
7,Historically Black Prot,228,244,236,238,197,223,131,81,78,339
8,Jehovah's Witness,20,27,24,24,21,30,15,11,6,37
9,Jewish,19,19,25,25,30,95,69,87,151,162


In [39]:
# Keep 'religion' as the identifier column and unpivot every other column
# into rows, using the default output column names.
pew.melt(id_vars='religion')

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [40]:
# Same unpivot, but with explicit names for the two generated columns.
pew.melt(
    id_vars='religion',
    var_name='income',
    value_name='count',
)

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8
