# 값 세기
- https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html

In [1]:
import pandas as pd

In [2]:
# Load the complete CSV into a DataFrame and display it.
rich_df = pd.read_csv('TopRichestInWorld.csv')
rich_df

Unnamed: 0,Name,NetWorth,Age,Country/Territory,Source,Industry
0,Elon Musk,"$219,000,000,000",50,United States,"Tesla, SpaceX",Automotive
1,Jeff Bezos,"$171,000,000,000",58,United States,Amazon,Technology
2,Bernard Arnault & family,"$158,000,000,000",73,France,LVMH,Fashion & Retail
3,Bill Gates,"$129,000,000,000",66,United States,Microsoft,Technology
4,Warren Buffett,"$118,000,000,000",91,United States,Berkshire Hathaway,Finance & Investments
...,...,...,...,...,...,...
96,Vladimir Potanin,"$17,300,000,000",61,Russia,metals,Metals & Mining
97,Harold Hamm & family,"$17,200,000,000",76,United States,oil & gas,Energy
98,Sun Piaoyang,"$17,100,000,000",63,China,pharmaceuticals,Healthcare
99,Luo Liguo & family,"$17,000,000,000",66,China,chemicals,Manufacturing


In [3]:
# Read only two specific columns from the CSV
rich_df = pd.read_csv(
    'TopRichestInWorld.csv',
    usecols=['Country/Territory', 'Industry'],
)
rich_df

Unnamed: 0,Country/Territory,Industry
0,United States,Automotive
1,United States,Technology
2,France,Fashion & Retail
3,United States,Technology
4,United States,Finance & Investments
...,...,...
96,Russia,Metals & Mining
97,United States,Energy
98,China,Healthcare
99,China,Manufacturing


## .value_counts()

In [5]:
# Load only the country column and convert the one-column DataFrame to a Series.
# squeeze("columns") collapses just the column axis, so the result stays a Series
# even for a single-row file (a bare squeeze() would return a scalar there,
# and the later .value_counts() call would fail).
rich_country = pd.read_csv(
    'TopRichestInWorld.csv', usecols=['Country/Territory']
).squeeze("columns")
rich_country

0      United States
1      United States
2             France
3      United States
4      United States
           ...      
96            Russia
97     United States
98             China
99             China
100        Hong Kong
Name: Country/Territory, Length: 101, dtype: object

In [7]:
# value_counts: counts how often each distinct value occurs, similar to collections.Counter
rich_country.value_counts()

Country/Territory
United States    37
China            18
India             7
Germany           7
France            7
Hong Kong         4
Japan             3
Australia         2
Russia            2
Indonesia         2
Italy             2
Canada            2
Mexico            2
Austria           1
Spain             1
Switzerland       1
Chile             1
Singapore         1
Sweden            1
Name: count, dtype: int64

In [9]:
# Load only the industry column and convert the one-column DataFrame to a Series.
# squeeze("columns") collapses just the column axis, so the result stays a Series
# even for a single-row file (a bare squeeze() would return a scalar there).
rich_industry = pd.read_csv(
    'TopRichestInWorld.csv', usecols=['Industry']
).squeeze("columns")
rich_industry

0                 Automotive
1                 Technology
2           Fashion & Retail
3                 Technology
4      Finance & Investments
               ...          
96           Metals & Mining
97                    Energy
98                Healthcare
99             Manufacturing
100              Real Estate
Name: Industry, Length: 101, dtype: object

In [10]:
# Industries in which the rich are most often found (most frequent first)
rich_industry.value_counts()

Industry
Fashion & Retail         18
Technology               15
Finance & Investments    13
Metals & Mining           9
Food & Beverage           8
Automotive                7
Diversified               6
Manufacturing             5
Healthcare                5
Media & Entertainment     4
Real Estate               3
Telecom                   2
Logistics                 2
Energy                    2
Gambling & Casinos        1
Service                   1
Name: count, dtype: int64

## normalize= - 정규화

In [11]:
# Compute each industry's share of the total instead of its raw count
rich_industry.value_counts(normalize=True)

Industry
Fashion & Retail         0.178218
Technology               0.148515
Finance & Investments    0.128713
Metals & Mining          0.089109
Food & Beverage          0.079208
Automotive               0.069307
Diversified              0.059406
Manufacturing            0.049505
Healthcare               0.049505
Media & Entertainment    0.039604
Real Estate              0.029703
Telecom                  0.019802
Logistics                0.019802
Energy                   0.019802
Gambling & Casinos       0.009901
Service                  0.009901
Name: proportion, dtype: float64

In [14]:
# normalize=False is the default, so this returns the raw counts again
rich_industry.value_counts(normalize=False)

Industry
Fashion & Retail         18
Technology               15
Finance & Investments    13
Metals & Mining           9
Food & Beverage           8
Automotive                7
Diversified               6
Manufacturing             5
Healthcare                5
Media & Entertainment     4
Real Estate               3
Telecom                   2
Logistics                 2
Energy                    2
Gambling & Casinos        1
Service                   1
Name: count, dtype: int64

**인자 정보**
> **normalize : bool, default False**  
If True then the object returned will contain the relative frequencies of the unique values.
---
**normalize**
- 퍼센트처럼 비중으로 계산합니다.
- 예: 나라 기준으로 계산하면 미국이 약 36.6%(37/101), 중국이 약 17.8%(18/101)입니다.

In [15]:
# The proportions therefore add up to 1.
rich_industry.value_counts(normalize=True).sum()

1.0

## sort=, ascending=
- sort: 빈도순으로 정렬할지 여부 (True면 정렬, False면 정렬하지 않음)
- ascending: True로 지정하면 sort를 따로 쓰지 않아도 오름차순으로 정렬됨

In [16]:
# Looking for the rarest industries among the rich: sort=True (the default)
# orders by frequency with the most common first, so the rarest sit at the bottom.
rich_industry.value_counts(sort=True, normalize=True)

Industry
Fashion & Retail         0.178218
Technology               0.148515
Finance & Investments    0.128713
Metals & Mining          0.089109
Food & Beverage          0.079208
Automotive               0.069307
Diversified              0.059406
Manufacturing            0.049505
Healthcare               0.049505
Media & Entertainment    0.039604
Real Estate              0.029703
Telecom                  0.019802
Logistics                0.019802
Energy                   0.019802
Gambling & Casinos       0.009901
Service                  0.009901
Name: proportion, dtype: float64

In [18]:
# Sort in ascending order: least frequent first
rich_industry.value_counts(normalize=True, ascending=True)

Industry
Gambling & Casinos       0.009901
Service                  0.009901
Telecom                  0.019802
Logistics                0.019802
Energy                   0.019802
Real Estate              0.029703
Media & Entertainment    0.039604
Manufacturing            0.049505
Healthcare               0.049505
Diversified              0.059406
Automotive               0.069307
Food & Beverage          0.079208
Metals & Mining          0.089109
Finance & Investments    0.128713
Technology               0.148515
Fashion & Retail         0.178218
Name: proportion, dtype: float64

In [19]:
# Dropping normalize keeps the ascending order but shows raw counts
rich_industry.value_counts(ascending=True)

Industry
Gambling & Casinos        1
Service                   1
Telecom                   2
Logistics                 2
Energy                    2
Real Estate               3
Media & Entertainment     4
Manufacturing             5
Healthcare                5
Diversified               6
Automotive                7
Food & Beverage           8
Metals & Mining           9
Finance & Investments    13
Technology               15
Fashion & Retail         18
Name: count, dtype: int64

>**sort : bool, default True**
Sort by frequencies when True. Preserve the order of the data when False.
    
>**ascending : bool, default False**
Sort in ascending order.

---

**sort**
- 빈도별로 정렬
- False면 정렬이 안됨.

**ascending**
- True면 오름차순 정렬
- False면 내림차순

In [21]:
# Is the extra sort_values() step really necessary? The output here matches
# value_counts(normalize=True, ascending=True).
rich_industry.value_counts(normalize=True).sort_values()

Industry
Gambling & Casinos       0.009901
Service                  0.009901
Telecom                  0.019802
Logistics                0.019802
Energy                   0.019802
Real Estate              0.029703
Media & Entertainment    0.039604
Manufacturing            0.049505
Healthcare               0.049505
Diversified              0.059406
Automotive               0.069307
Food & Beverage          0.079208
Metals & Mining          0.089109
Finance & Investments    0.128713
Technology               0.148515
Fashion & Retail         0.178218
Name: proportion, dtype: float64

## bins=
- 범위설정

In [26]:
# Load only the age column and convert the one-column DataFrame to a Series.
# squeeze("columns") collapses just the column axis, so the result stays a Series
# even for a single-row file (a bare squeeze() would return a scalar there).
rich_age = pd.read_csv(
    'TopRichestInWorld.csv', usecols=['Age']
).squeeze("columns")
rich_age


0      50
1      58
2      73
3      66
4      91
       ..
96     61
97     76
98     63
99     66
100    75
Name: Age, Length: 101, dtype: int64

In [29]:
# Split the ages into 5 ranges (bins) and count the values falling in each
rich_age.value_counts(bins=5)

(55.6, 68.4]      33
(68.4, 81.2]      27
(81.2, 94.0]      19
(42.8, 55.6]      17
(29.935, 42.8]     5
Name: count, dtype: int64

> **bins : int, optional**  
Rather than count values, group them into half-open bins, a convenience for pd.cut, only works with numeric data.
---
**bins**
- 개수를 입력하여, 그 만큼 구간을 나누어 카운트 합니다.
- bins가 지정되지 않으면, 고유 값의 빈도로 카운트합니다.
- 숫자형(numeric) 데이터에서만 사용할 수 있습니다.

## dropna= - NaN처리

In [30]:
# Sample fruit names with one missing entry (None)
data = [
    'banana',
    None,
    'apple',
    'banana',
    'apple',
]
data_set = pd.Series(data=data)
data_set

0    banana
1      None
2     apple
3    banana
4     apple
dtype: object

In [31]:
# By default, None entries are ignored when counting!
data_set.value_counts()

banana    2
apple     2
Name: count, dtype: int64

In [32]:
# Include the missing entries in the count as well
data_set.value_counts(dropna=False)

banana    2
apple     2
None      1
Name: count, dtype: int64

> **dropna : bool, default True**
Don’t include counts of NaN.

---
**dropna**
- 카운트 시에 NaN형 데이터를 처리할지를 결정합니다.
- 기본값(default)이 True라서 None이 카운트되지 않다가
- False시 None도 카운트가 된다.

In [37]:
# An example you may well run into: None and np.nan both used as missing markers
import numpy as np

data = [
    'banana',
    None,
    'apple',
    'banana',
    'apple',
    np.nan,
]
data_set = pd.Series(data=data)
data_set

0    banana
1      None
2     apple
3    banana
4     apple
5       NaN
dtype: object

In [38]:
# Both None and NaN are left out of the counts by default
data_set.value_counts()

banana    2
apple     2
Name: count, dtype: int64

In [39]:
# Proportions are computed over the non-missing entries only
data_set.value_counts(normalize=True)

banana    0.5
apple     0.5
Name: proportion, dtype: float64

In [40]:
# dropna=False keeps the missing entries; here None and NaN are listed as separate rows
data_set.value_counts(dropna=False)

banana    2
apple     2
None      1
NaN       1
Name: count, dtype: int64

In [41]:
# With missing entries included, the proportions are taken over all 6 entries
data_set.value_counts(dropna=False, normalize=True)

banana    0.333333
apple     0.333333
None      0.166667
NaN       0.166667
Name: proportion, dtype: float64