# Baby Names: Data Exploration

In [18]:
import pandas as pd
import altair as alt
from altair import datum
import seaborn as sns
import os

In [3]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
def open_files(path:str='namesbystate',cols:list=['state','sex','year','name','frequency'])->pd.DataFrame:
    '''Reads all of the files in a folder and returns a df.'''
    output = None
    for f in os.listdir(path):
        # check if csv or txt
        if ('.csv' not in f.lower()) and ('.txt' not in f.lower()):
            continue
        csv_path = f'{path}/{f}'
        df = pd.read_csv(csv_path)
        df.columns=cols
        # update the output
        if output is None:
            output = df.copy()
        else:
            output = pd.concat([output,df],axis=0)
    return output

In [5]:
# read file
df=open_files()

# save file, combined
df.to_csv('combined.csv')


Now that we have read in the data, we will start describing it.

In [6]:
df.describe(include='all')

Unnamed: 0,state,sex,year,name,frequency
count,6504110,6504110,6504110.0,6504110,6504110.0
unique,51,2,,33058,
top,CA,F,,James,
freq,413893,3611983,,7519,
mean,,,1978.523,,50.02046
std,,,31.7534,,170.995
min,,,1910.0,,5.0
25%,,,1954.0,,7.0
50%,,,1984.0,,12.0
75%,,,2006.0,,32.0


There are 51 states (DC), 2 sexes, years from 1910 to 2023. The mean frequency of a name per year is 50 and the median is 12.

In [13]:
name_freq = df.groupby(['name','sex'])['frequency'].sum().reset_index().sort_values('frequency',ascending=False)
name_freq.head(10)

Unnamed: 0,name,sex,frequency
15045,James,M,5059617
16563,John,M,4911037
28396,Robert,M,4755851
23984,Michael,M,4396657
34753,William,M,3951735
23172,Mary,F,3730282
8276,David,M,3638586
16819,Joseph,M,2564135
28223,Richard,M,2548914
6375,Charles,M,2303298


In [16]:
male = alt.Chart(name_freq[name_freq.sex=='M'].head(10)).mark_bar().encode(
    alt.X('name:N').sort('-y'),
    alt.Y('frequency:Q')
)

female = alt.Chart(name_freq[name_freq.sex=='F'].head(10)).mark_bar().encode(
    alt.X('name:N').sort('-y'),
    alt.Y('frequency:Q')
)

alt.hconcat(male,female)

In [17]:
name_freq = df[df.year==2023].groupby(['name','sex'])['frequency'].sum().reset_index().sort_values('frequency',ascending=False)
name_freq.head(10)

male = alt.Chart(name_freq[(name_freq.sex=='M')].head(10)).mark_bar().encode(
    alt.X('name:N').sort('-y'),
    alt.Y('frequency:Q')
)

female = alt.Chart(name_freq[name_freq.sex=='F'].head(10)).mark_bar().encode(
    alt.X('name:N').sort('-y'),
    alt.Y('frequency:Q')
)

alt.hconcat(male,female)