# 미국의 영어 이름 분석
### 캐글의 USA Name Data
- 1910년부터 2015년까지의 데이터 존재

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the baby-names CSV into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; adjust it to run elsewhere.
# NOTE(review): the info() output below reports 1,048,575 rows and only ten state codes (AK..FL) appear,
# so the CSV may have been truncated before loading (possibly by a spreadsheet row limit) — TODO confirm.

file = 'C:/Users/psuny/Desktop/babyNamesUS.csv'
raw = pd.read_csv(file)
raw

Unnamed: 0,StateCode,Sex,YearOfBirth,Name,Number
0,AK,F,1910,Mary,14
1,AK,F,1910,Annie,12
2,AK,F,1910,Anna,10
3,AK,F,1910,Margaret,8
4,AK,F,1910,Helen,7
...,...,...,...,...,...
1048570,FL,F,1993,Francis,8
1048571,FL,F,1993,Francisca,8
1048572,FL,F,1993,Gillian,8
1048573,FL,F,1993,Gisela,8


In [4]:
# Inspect the structure of the data: column names, non-null counts and dtypes

raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   StateCode    1048575 non-null  object
 1   Sex          1048575 non-null  object
 2   YearOfBirth  1048575 non-null  int64 
 3   Name         1048575 non-null  object
 4   Number       1048575 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 40.0+ MB


#### 집계하기(pivot_table)
* pd.pivot_table(index = '칼럼명', columns = '칼럼명', values = '칼럼명', aggfunc = 'sum')
* aggfunc 옵션: sum, count, mean,...

In [5]:
# Total registrations per name: 'Number' summed over all rows sharing a 'Name'

pd.pivot_table(raw, index='Name', values='Number', aggfunc='sum')

Unnamed: 0_level_0,Number
Name,Unnamed: 1_level_1
Aadan,18
Aaden,855
Aadhav,14
Aadhya,188
Aadi,116
...,...
Zylah,36
Zyler,38
Zyon,97
Zyra,23


In [5]:
# Total registrations per name, with one column per sex

name_df = pd.pivot_table(
    raw,
    index='Name',
    columns='Sex',
    values='Number',
    aggfunc='sum',
)
name_df

Sex,F,M
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadan,,18.0
Aaden,,855.0
Aadhav,,14.0
Aadhya,188.0,
Aadi,,116.0
...,...,...
Zylah,36.0,
Zyler,,38.0
Zyon,6.0,91.0
Zyra,23.0,


In [6]:
# Check the non-null count of each sex column (NaN = no rows for that name/sex pair)
name_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20815 entries, Aadan to Zyrah
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       14140 non-null  float64
 1   M       8658 non-null   float64
dtypes: float64(2)
memory usage: 487.9+ KB


#### 빈 데이터 채워넣기
* 공통된 값을 입력하거나(ex: 0)
* 임의의 수를 입력하거나(ex: 평균, 최대값, 최소값, 주변값 등)
* 빈 데이터를 분석에서 제외하거나

In [7]:
# Fill missing values. In this table an empty cell means the name has no
# records for that sex, so 0 is the appropriate replacement.

name_df = name_df.fillna(value=0)
name_df

Sex,F,M
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadan,0.0,18.0
Aaden,0.0,855.0
Aadhav,0.0,14.0
Aadhya,188.0,0.0
Aadi,0.0,116.0
...,...,...
Zylah,36.0,0.0
Zyler,0.0,38.0
Zyon,6.0,91.0
Zyra,23.0,0.0


In [8]:
# Confirm that no missing values remain after the fill
name_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20815 entries, Aadan to Zyrah
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       20815 non-null  float64
 1   M       20815 non-null  float64
dtypes: float64(2)
memory usage: 487.9+ KB


# Q. 남자/여자 가장 많이 사용되는 이름은? 
* => 이럴 때 정렬하기 사용 .sort_values(by = '칼럼명', ascending = False)

In [9]:
# Which male names are used the most (top 5)?

name_df.sort_values(by = 'M', ascending = False)     # ascending = False sorts in descending order; if it is omitted the default is ascending order

Sex,F,M
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Michael,4133.0,725757.0
James,3050.0,693271.0
Robert,2469.0,674934.0
John,2398.0,670893.0
David,2003.0,615943.0
...,...,...
Jemimah,5.0,0.0
Jemma,535.0,0.0
Jena,1819.0,0.0
Jenae,510.0,0.0


In [10]:
# Sort by the male column in descending order and read the names of the first rows returned by head()

name_df.sort_values(by = 'M', ascending = False).head().index

Index(['Michael', 'James', 'Robert', 'John', 'David'], dtype='object', name='Name')

* 남자 이름 사용순위 top5 = 'Michael', 'James', 'Robert', 'John', 'David'

In [11]:
# Sort by the female column in descending order and read the names of the first rows returned by head()

name_df.sort_values(by = 'F', ascending = False).head().index

Index(['Mary', 'Jennifer', 'Elizabeth', 'Patricia', 'Linda'], dtype='object', name='Name')

* 여자 이름 사용 순위 top5 = 'Mary', 'Jennifer', 'Elizabeth', 'Patricia', 'Linda'

#### 칼럼별 데이터 종류 확인
* df['칼럼명'].unique()
* df['칼럼명'].value_counts()

In [12]:
# List the distinct values present in the StateCode column

raw['StateCode'].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL'],
      dtype=object)

In [13]:
# Count how many rows carry each StateCode value

raw['StateCode'].value_counts()

CA    361128
AL    128556
AZ    108599
CO    101403
AR     97560
CT     78039
FL     61322
DC     53933
DE     30892
AK     27143
Name: StateCode, dtype: int64

In [14]:
# Count the number of rows per birth year

raw['YearOfBirth'].value_counts()

2007    17166
2008    17109
2009    16914
2014    16820
2006    16810
        ...  
1914     3997
1913     3417
1912     3148
1911     2392
1910     2358
Name: YearOfBirth, Length: 106, dtype: int64

# Q. 남자 여자 구분없이 사용되는 공통 이름은?
* 남자, 여자 비율 차이가 적을수록 성별 구분 없는 이름

In [15]:
# Rebuild the name-by-sex registration table in one chain:
#   1. pivot: total 'Number' per name, one column per sex
#   2. fillna(0): a missing name/sex combination counts as zero
#   3. astype(int): the totals come back as floats, so cast them to integers
name_df = (
    raw.pivot_table(index='Name', columns='Sex', values='Number', aggfunc='sum')
    .fillna(0)
    .astype(int)
)
name_df

Sex,F,M
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadan,0,18
Aaden,0,855
Aadhav,0,14
Aadhya,188,0
Aadi,0,116
...,...,...
Zylah,36,0
Zyler,0,38
Zyon,6,91
Zyra,23,0


In [16]:
# Combined female + male registrations for each name

name_df['Sum'] = name_df['F'].add(name_df['M'])
name_df.head()

Sex,F,M,Sum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aadan,0,18,18
Aaden,0,855,855
Aadhav,0,14,14
Aadhya,188,0,188
Aadi,0,116,116


In [17]:
# Share of each name's registrations that are female / male

name_df['F_ratio'] = name_df['F'].div(name_df['Sum'])
name_df['M_ratio'] = name_df['M'].div(name_df['Sum'])

# Absolute difference between the two shares (0 = evenly split, 1 = one sex only)
name_df['M_F_Gap'] = name_df['F_ratio'].sub(name_df['M_ratio']).abs()
name_df.head()

Sex,F,M,Sum,F_ratio,M_ratio,M_F_Gap
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aadan,0,18,18,0.0,1.0,1.0
Aaden,0,855,855,0.0,1.0,1.0
Aadhav,0,14,14,0.0,1.0,1.0
Aadhya,188,0,188,1.0,0.0,1.0
Aadi,0,116,116,0.0,1.0,1.0


In [18]:
# Sort in descending order of total usage (Sum) so the most used names come first

name_df = name_df.sort_values(by = 'Sum', ascending = False)
name_df.head()

Sex,F,M,Sum,F_ratio,M_ratio,M_F_Gap
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Michael,4133,725757,729890,0.005662,0.994338,0.988675
James,3050,693271,696321,0.00438,0.99562,0.99124
Robert,2469,674934,677403,0.003645,0.996355,0.99271
John,2398,670893,673291,0.003562,0.996438,0.992877
David,2003,615943,617946,0.003241,0.996759,0.993517


In [19]:
# Keep the names whose female/male share gap is below 0.1

cond = name_df['M_F_Gap'].lt(0.1)
name_df.loc[cond]

Sex,F,M,Sum,F_ratio,M_ratio,M_F_Gap
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Jessie,25842,21259,47101,0.548651,0.451349,0.097302
Riley,15539,14929,30468,0.510011,0.489989,0.020021
Emerson,2341,2471,4812,0.486492,0.513508,0.027016
Justice,2083,2461,4544,0.458407,0.541593,0.083187
Kris,2100,2055,4155,0.505415,0.494585,0.010830
...,...,...,...,...,...,...
Yihan,5,5,10,0.500000,0.500000,0.000000
Alika,5,5,10,0.500000,0.500000,0.000000
Rajdeep,5,5,10,0.500000,0.500000,0.000000
Ariyan,5,5,10,0.500000,0.500000,0.000000


In [20]:
# Ten most used names among those with a small female/male gap (name_df is already sorted by Sum, descending)

name_df[cond].head(10).index

Index(['Jessie', 'Riley', 'Emerson', 'Justice', 'Kris', 'Carey', 'Amari',
       'Stevie', 'Merle', 'Jaylin'],
      dtype='object', name='Name')

* 남자/여자 구분없이 가장 많이 사용되는 이름 top10 = 'Jessie', 'Riley', 'Emerson', 'Justice', 'Kris', 'Carey', 'Amari', 'Stevie', 'Merle', 'Jaylin'

# Q. 가장 대표적인 미국 이름은?

In [21]:
# Use unique() to list the values present in 'YearOfBirth' (the covered period)

raw['YearOfBirth'].unique()

array([1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
       1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
       1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
       1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
       1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
       2009, 2010, 2011, 2012, 2013, 2014, 2015], dtype=int64)

#### 세대 기준으로 그룹 만들기
* 한 세대 나누는 기준 30년: 2020년 기준 30년씩 구분
    * 1930년 이전 (1930년까지)
    * 1960년 이전 (1931~1960년)
    * 1990년 이전 (1961~1990년)
    * 2020년 이전 (1991년 이후)

In [22]:
# Bucket every row's 'YearOfBirth' into a generation label and store the
# labels in a new 'year_class' column.
# np.select checks the conditions in order and takes the first one that
# matches, which mirrors an if/elif chain without a Python-level loop over
# every row of the frame.

birth_year = raw['YearOfBirth']

year_class_list = np.select(
    [birth_year <= 1930, birth_year <= 1960, birth_year <= 1990],
    ['1930년이전', '1960년이전', '1990년이전'],
    default='2020년이전',     # every year after 1990
).tolist()     # plain list of str, one label per row

raw['year_class'] = year_class_list
raw

Unnamed: 0,StateCode,Sex,YearOfBirth,Name,Number,year_class
0,AK,F,1910,Mary,14,1930년이전
1,AK,F,1910,Annie,12,1930년이전
2,AK,F,1910,Anna,10,1930년이전
3,AK,F,1910,Margaret,8,1930년이전
4,AK,F,1910,Helen,7,1930년이전
...,...,...,...,...,...,...
1048570,FL,F,1993,Francis,8,2020년이전
1048571,FL,F,1993,Francisca,8,2020년이전
1048572,FL,F,1993,Gillian,8,2020년이전
1048573,FL,F,1993,Gisela,8,2020년이전


In [23]:
# Registration totals per (name, sex) row and generation column, in one chain:
#   1. pivot: sum 'Number' per (Name, Sex) for every year_class
#   2. fillna(0): a generation with no rows for that name counts as zero
#   3. astype(int): cast the totals to integers
name_period = (
    raw.pivot_table(
        index=['Name', 'Sex'],
        columns='year_class',
        values='Number',
        aggfunc='sum',
    )
    .fillna(0)
    .astype(int)
)
name_period

Unnamed: 0_level_0,year_class,1930년이전,1960년이전,1990년이전,2020년이전
Name,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aadan,M,0,0,0,18
Aaden,M,0,0,0,855
Aadhav,M,0,0,0,14
Aadhya,F,0,0,0,188
Aadi,M,0,0,0,116
...,...,...,...,...,...
Zyler,M,0,0,0,38
Zyon,F,0,0,0,6
Zyon,M,0,0,0,91
Zyra,F,0,0,0,23


#### 전체 칼럼 합계 계산하기
* 모든 칼럼을 하나씩 더해도 되고: df['칼럼1'] + df['칼럼2'] + ... + df['칼럼n']
* sum() 활용: df.sum(axis = 1)     # 기본값인 axis = 0은 각 칼럼을 행 방향(위→아래)으로 더한 칼럼별 합계를 반환하므로, 한 행의 여러 칼럼 값을 가로로 더한 행별 합계를 구하려면 axis = 1을 지정해야 함

In [24]:
# Add a 'sum' column holding each row's total across the columns (axis = 1 adds across columns within a row)
# NOTE(review): this totals every column present, so re-running the cell once 'sum' exists would count 'sum' as well

name_period['sum'] = name_period.sum(axis = 1)
name_period

Unnamed: 0_level_0,year_class,1930년이전,1960년이전,1990년이전,2020년이전,sum
Name,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aadan,M,0,0,0,18,18
Aaden,M,0,0,0,855,855
Aadhav,M,0,0,0,14,14
Aadhya,F,0,0,0,188,188
Aadi,M,0,0,0,116,116
...,...,...,...,...,...,...
Zyler,M,0,0,0,38,38
Zyon,F,0,0,0,6,6
Zyon,M,0,0,0,91,91
Zyra,F,0,0,0,23,23


In [25]:
# List the current column labels of name_period
name_period.columns

Index(['1930년이전', '1960년이전', '1990년이전', '2020년이전', 'sum'], dtype='object', name='year_class')

In [26]:
# Divide every existing column by the row total ('sum') to get each
# generation's share of a name's registrations. The column labels are
# snapshotted first because the loop adds new columns to the same frame.
# 'sum' is part of that snapshot, so a 'sum비율' column (sum divided by itself)
# is created as well.

for base_col in list(name_period.columns):
    name_period[f'{base_col}비율'] = name_period[base_col].div(name_period['sum'])

name_period

Unnamed: 0_level_0,year_class,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
Name,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aadan,M,0,0,0,18,18,0.0,0.0,0.0,1.0,1.0
Aaden,M,0,0,0,855,855,0.0,0.0,0.0,1.0,1.0
Aadhav,M,0,0,0,14,14,0.0,0.0,0.0,1.0,1.0
Aadhya,F,0,0,0,188,188,0.0,0.0,0.0,1.0,1.0
Aadi,M,0,0,0,116,116,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
Zyler,M,0,0,0,38,38,0.0,0.0,0.0,1.0,1.0
Zyon,F,0,0,0,6,6,0.0,0.0,0.0,1.0,1.0
Zyon,M,0,0,0,91,91,0.0,0.0,0.0,1.0,1.0
Zyra,F,0,0,0,23,23,0.0,0.0,0.0,1.0,1.0


In [27]:
# Sort in descending order by total usage first, then by the '2020년이전'
# share and the '1990년이전' share as tie-breakers

sort_keys = ['sum', '2020년이전비율', '1990년이전비율']
name_period = name_period.sort_values(by=sort_keys, ascending=False)
name_period

Unnamed: 0_level_0,year_class,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
Name,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Michael,M,4990,198074,377295,145398,725757,0.006876,0.272921,0.519864,0.200340,1.0
James,M,97838,288091,225243,82099,693271,0.141125,0.415553,0.324899,0.118423,1.0
Robert,M,87070,292338,231058,64468,674934,0.129005,0.433136,0.342342,0.095517,1.0
John,M,98536,268873,227108,76376,670893,0.146873,0.400769,0.338516,0.113842,1.0
David,M,16463,203033,278429,118018,615943,0.026728,0.329630,0.452037,0.191605,1.0
...,...,...,...,...,...,...,...,...,...,...,...
Yoshiro,M,5,0,0,0,5,1.000000,0.000000,0.000000,0.000000,1.0
Ysabel,M,5,0,0,0,5,1.000000,0.000000,0.000000,0.000000,1.0
Yvonnie,F,0,5,0,0,5,0.000000,1.000000,0.000000,0.000000,1.0
Zebedee,M,0,5,0,0,5,0.000000,1.000000,0.000000,0.000000,1.0


In [28]:
# A multi-level index is awkward to filter on, so turn the index levels (name and sex) back into ordinary columns

name_period = name_period.reset_index()
name_period

year_class,Name,Sex,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
0,Michael,M,4990,198074,377295,145398,725757,0.006876,0.272921,0.519864,0.200340,1.0
1,James,M,97838,288091,225243,82099,693271,0.141125,0.415553,0.324899,0.118423,1.0
2,Robert,M,87070,292338,231058,64468,674934,0.129005,0.433136,0.342342,0.095517,1.0
3,John,M,98536,268873,227108,76376,670893,0.146873,0.400769,0.338516,0.113842,1.0
4,David,M,16463,203033,278429,118018,615943,0.026728,0.329630,0.452037,0.191605,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
22793,Yoshiro,M,5,0,0,0,5,1.000000,0.000000,0.000000,0.000000,1.0
22794,Ysabel,M,5,0,0,0,5,1.000000,0.000000,0.000000,0.000000,1.0
22795,Yvonnie,F,0,5,0,0,5,0.000000,1.000000,0.000000,0.000000,1.0
22796,Zebedee,M,0,5,0,0,5,0.000000,1.000000,0.000000,0.000000,1.0


In [29]:
# Ten most used male names (name_period is already sorted by usage, descending)

cond = name_period['Sex'].eq('M')
name_period.loc[cond].head(10)

year_class,Name,Sex,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
0,Michael,M,4990,198074,377295,145398,725757,0.006876,0.272921,0.519864,0.20034,1.0
1,James,M,97838,288091,225243,82099,693271,0.141125,0.415553,0.324899,0.118423,1.0
2,Robert,M,87070,292338,231058,64468,674934,0.129005,0.433136,0.342342,0.095517,1.0
3,John,M,98536,268873,227108,76376,670893,0.146873,0.400769,0.338516,0.113842,1.0
4,David,M,16463,203033,278429,118018,615943,0.026728,0.32963,0.452037,0.191605,1.0
6,William,M,89173,200843,141872,85908,517796,0.172216,0.387881,0.273992,0.165911,1.0
7,Richard,M,30680,185139,131367,35293,382479,0.080214,0.48405,0.343462,0.092274,1.0
8,Christopher,M,335,20961,233318,123408,378022,0.000886,0.055449,0.617207,0.326457,1.0
9,Daniel,M,7133,59581,166941,139894,373549,0.019095,0.1595,0.446905,0.3745,1.0
10,Joseph,M,34908,75603,130341,100905,341757,0.102143,0.221219,0.381385,0.295254,1.0


* 남자 이름 중 가장 많이 사용된 이름 top10 = Michael, James, Robert, John, David, William, Richard, Christopher, Daniel, Joseph
* 이 중 James, Robert, John, Richard의 경우 1960년이전 사용비율이 40%이상으로 요즘 트렌드와는 맞지 않음

In [32]:
# Ten most used female names (name_period is already sorted by usage, descending)

cond = name_period['Sex'].eq('F')
name_period.loc[cond].head(10)

year_class,Name,Sex,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
5,Mary,F,143702,260110,92833,22798,519443,0.276646,0.500748,0.178716,0.043889,1.0
11,Jennifer,F,0,12279,253345,59759,325383,0.0,0.037737,0.778606,0.183657,1.0
16,Elizabeth,F,27705,60658,112556,72495,273414,0.10133,0.221854,0.411669,0.265147,1.0
17,Patricia,F,12951,165330,69739,8988,257008,0.050391,0.643287,0.27135,0.034972,1.0
18,Linda,F,813,191589,51390,6207,249999,0.003252,0.766359,0.205561,0.024828,1.0
19,Jessica,F,0,1380,153315,92631,247326,0.0,0.00558,0.61989,0.37453,1.0
23,Barbara,F,24206,154488,35465,3030,217189,0.111451,0.711307,0.163291,0.013951,1.0
28,Sarah,F,11765,20330,93470,68456,194021,0.060638,0.104782,0.481752,0.352828,1.0
30,Michelle,F,0,12895,136478,39668,189041,0.0,0.068213,0.721949,0.209838,1.0
31,Ashley,F,0,0,89243,97123,186366,0.0,0.0,0.478859,0.521141,1.0


* 여자 이름 중 가장 많이 사용된 이름 top10 = Mary, Jennifer, Elizabeth, Patricia, Linda, Jessica, Barbara, Sarah, Michelle, Ashley
* Mary, Patricia, Linda, Barbara는 1960년 이전에 50%이상 사용됨.
* Jennifer, Jessica, Michelle의 경우 1990년이전에 가장 많이 사용됨
* Ashley의 경우 2020년이전비율이 가장 높음

In [35]:
# Top 5 male names whose '2020년이전' share is greater than 30%

cond_sex = name_period['Sex'].eq('M')
cond_age = name_period['2020년이전비율'].gt(0.3)
cond = cond_age & cond_sex
name_period.loc[cond].head(5)

year_class,Name,Sex,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
8,Christopher,M,335,20961,233318,123408,378022,0.000886,0.055449,0.617207,0.326457,1.0
9,Daniel,M,7133,59581,166941,139894,373549,0.019095,0.1595,0.446905,0.3745,1.0
14,Matthew,M,1160,8822,148707,121522,280211,0.00414,0.031483,0.530697,0.43368,1.0
15,Anthony,M,7132,36965,114441,121379,279917,0.025479,0.132057,0.408839,0.433625,1.0
20,Andrew,M,7369,18639,94219,117022,237249,0.03106,0.078563,0.397131,0.493245,1.0


In [36]:
# Top 5 female names whose '2020년이전' share is greater than 30%

cond_sex = name_period['Sex'].eq('F')
cond_age = name_period['2020년이전비율'].gt(0.3)
cond = cond_age & cond_sex
name_period.loc[cond].head(5)

year_class,Name,Sex,1930년이전,1960년이전,1990년이전,2020년이전,sum,1930년이전비율,1960년이전비율,1990년이전비율,2020년이전비율,sum비율
19,Jessica,F,0,1380,153315,92631,247326,0.0,0.00558,0.61989,0.37453,1.0
28,Sarah,F,11765,20330,93470,68456,194021,0.060638,0.104782,0.481752,0.352828,1.0
31,Ashley,F,0,0,89243,97123,186366,0.0,0.0,0.478859,0.521141,1.0
37,Stephanie,F,252,11271,111214,55909,178646,0.001411,0.063091,0.622538,0.31296,1.0
51,Emily,F,3816,6191,38195,105767,153969,0.024784,0.040209,0.248069,0.686937,1.0
