### 데이터 불러오기

In [3]:
import pandas as pd
# Load the tab-separated gapminder file into a DataFrame.
df = pd.read_csv('../data/gapminder.tsv', sep='\t')


### 데이터 살펴보기

In [8]:
df.head()  # first 5 rows (n defaults to 5 when no argument is given)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [9]:
df.tail()  # last 5 rows

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [10]:
type(df)  # DataFrame: a two-dimensional, table-like (rows x columns) data structure

pandas.core.frame.DataFrame

In [11]:
df.shape  # (number of rows, number of columns)

(1704, 6)

In [14]:
len(df.index), len(df.columns)  # number of rows, number of columns

(1704, 6)

In [17]:
# Basic DataFrame information: index range, columns, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [18]:
# Inspect the column labels
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [21]:
# The column Index can be indexed by position; [0] is the first column name.
df.columns[0]

'country'

In [22]:
# Inspect the data type of each column
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [23]:
# Descriptive (summary) statistics
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165876
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846988
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [24]:
# Number of values per column
df.count()

country      1704
continent    1704
year         1704
lifeExp      1704
pop          1704
gdpPercap    1704
dtype: int64

In [26]:
# Number of values in one specific column
df['country'].count()

1704

In [27]:
# Unique values
# For categorical data: number of rows in each category
df['country'].value_counts()

Afghanistan          12
Pakistan             12
New Zealand          12
Nicaragua            12
Niger                12
                     ..
Eritrea              12
Equatorial Guinea    12
El Salvador          12
Egypt                12
Zimbabwe             12
Name: country, Length: 142, dtype: int64

In [28]:
# Check what kind of object value_counts() returns.
type(df['continent'].value_counts())

pandas.core.series.Series

### 통계함수

In [35]:
# Mean of the selected columns
df[['lifeExp', 'pop']].mean()

lifeExp    5.947444e+01
pop        2.960121e+07
dtype: float64

In [33]:
# Column-wise mean of the numeric columns only.
# `country` and `continent` hold strings and cannot be averaged: without
# numeric_only=True, pandas 2.x raises TypeError (older releases dropped such
# columns with a deprecation warning).
df.mean(numeric_only=True)

  df.mean()


year         1.979500e+03
lifeExp      5.947444e+01
pop          2.960121e+07
gdpPercap    7.215327e+03
dtype: float64

In [36]:
# Median
df['lifeExp'].median()

60.7125

In [38]:
# Maximum and minimum
df['lifeExp'].max(), df['lifeExp'].min()

(82.603, 23.599)

In [39]:
# Variance and standard deviation
df['lifeExp'].var(), df['lifeExp'].std()

(166.85166397687885, 12.917107415241187)

In [40]:
# Sum
df['lifeExp'].sum()

101344.44467999999

In [41]:
# Correlation coefficient
# Describes the relationship between two variables (columns); it is not causation.
# Ranges from -1 to 1.
# Close to 1: positive correlation; close to -1: negative correlation.
# Keep only the numeric columns first: `country` and `continent` are strings,
# and pandas 2.x raises an error if they are passed to corr().
df.select_dtypes(include='number').corr()

Unnamed: 0,year,lifeExp,pop,gdpPercap
year,1.0,0.435611,0.082308,0.227318
lifeExp,0.435611,1.0,0.064955,0.583706
pop,0.082308,0.064955,1.0,-0.0256
gdpPercap,0.227318,0.583706,-0.0256,1.0


### 데이터 추출
- 열 추출 : df['열이름'] (열 이름 1개) -> 시리즈, df[['열1', '열2']] (열 이름 리스트) -> 데이터프레임
- 행 추출 : iloc, loc


In [42]:
# Assign a single column to a new variable
country = df['country']

In [44]:
# Check the type of a single extracted column.
type(country)

pandas.core.series.Series

In [45]:
# Extract the country, continent and year columns
subdf = df[['country', 'continent', 'year']]
subdf

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972
...,...,...,...
1699,Zimbabwe,Africa,1987
1700,Zimbabwe,Africa,1992
1701,Zimbabwe,Africa,1997
1702,Zimbabwe,Africa,2002


In [48]:
df.loc[0]  # select a row by index label

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [49]:
df.iloc[0]  # select a row by integer position

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [50]:
df.loc[-1]  # KeyError: .loc looks up by label and no row is labelled -1

KeyError: -1

In [51]:
df.iloc[-1]  # last row by position; the result is a Series

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [52]:
df.tail(1)  # last row again, but the result is a DataFrame

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [53]:
df.loc[[0, 10, 100]]  # selecting several rows yields a DataFrame rather than a Series

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
10,Afghanistan,Asia,2002,42.129,25268405,726.734055
100,Bangladesh,Asia,1972,45.252,70759295,630.233627


In [55]:
# .loc[rows, columns]: every row, columns chosen by label
subset = df.loc[:, ['year', 'pop']]
subset

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
...,...,...
1699,1987,9216418
1700,1992,10704340
1701,1997,11404948
1702,2002,11926563


In [56]:
# .iloc[rows, columns]: every row, columns chosen by integer position
# (positions 2 and 4 are `year` and `pop`).
df.iloc[:, [2,4]]

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
...,...,...
1699,1987,9216418
1700,1992,10704340
1701,1997,11404948
1702,2002,11926563


### 그룹핑

In [59]:
# Mean life expectancy per year
df.groupby('year')['lifeExp'].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [10]:
# Mean of every numeric column for each (year, continent) group.
# numeric_only=True leaves out the string column `country`, which cannot be
# averaged; without it pandas 2.x raises TypeError.
df.groupby(['year','continent']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,pop,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1952,Africa,39.1355,4570010.0,1252.572466
1952,Americas,53.27984,13806100.0,4079.062552
1952,Asia,46.314394,42283560.0,5195.484004
1952,Europe,64.4085,13937360.0,5661.057435
1952,Oceania,69.255,5343003.0,10298.08565
1957,Africa,41.266346,5093033.0,1385.236062
1957,Americas,55.96028,15478160.0,4616.043733
1957,Asia,49.318544,47356990.0,5787.73294
1957,Europe,66.703067,14596350.0,6963.012816
1957,Oceania,70.295,5970988.0,11598.522455


In [60]:
# Mean life expectancy and GDP per capita for each (year, continent) group
df.groupby(['year','continent'])[['lifeExp', 'gdpPercap']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


In [61]:
# Number of entries in each group
# Rows per continent: a country is counted once for every row it appears in,
# so this is not the number of distinct countries.
df.groupby('continent')['country'].count()

continent
Africa      624
Americas    300
Asia        396
Europe      360
Oceania      24
Name: country, dtype: int64

In [62]:
# Number of distinct (unique) countries per continent
df.groupby('continent')['country'].nunique()

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64