# Baby Names
---

**Purpose:**

* Techniques for handling raw data
* Creating your own measures

**Data Source:**

https://www.ssa.gov/OACT/babynames/

## Featured Libraries

In [None]:
# system libraries
import os
import sys

# report the interpreter version and the current working directory
print(f'Python Information {sys.version}')
print(f'This is your current directory {os.getcwd()}')

In [None]:
import zipfile
import glob
import csv

import re
import random

from collections import Counter
from pprint import pprint

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from pylab import rcParams
# Default every figure to 12x6 inches unless a plot passes its own figsize.
rcParams['figure.figsize'] = 12, 6

# Apply matplotlib's built-in 'ggplot' style sheet to the plots that follow.
plt.style.use('ggplot')

In [None]:
!pip install wget

In [None]:
import wget

## Data Extraction

In [None]:
# Zip archive of state-level baby-name files hosted on ssa.gov.
URL = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'

# wget never overwrites an existing file: re-running the download would leave
# numbered duplicates such as "namesbystate (1).zip". Reuse the archive when
# it is already present and download it only when it is missing.
filename = os.path.basename(URL)
if not os.path.exists(filename):
    filename = wget.download(URL, out=filename)

In [None]:
# Create the OUTPUT folder; on a repeat run just report that it is already there.
try:
    os.mkdir('OUTPUT')
except FileExistsError:
    print('The directory already exists')

In [None]:
# Unpack every member of the downloaded archive into the BABY_DATA folder.
with zipfile.ZipFile(filename) as z:
    z.extractall('BABY_DATA')

In [None]:
# Collect the extracted .TXT files, show their paths compactly, then count them.
files = list(glob.iglob('BABY_DATA/*.TXT'))
pprint(files, width=80, compact=True)
print(len(files))

In [None]:
# Peek at the first 10 lines of one randomly chosen data file.
from itertools import islice

with open(np.random.choice(files), 'r') as f:
    # islice reads lazily and stops after 10 lines (or earlier for a shorter
    # file), instead of loading the whole file with readlines().
    for line in islice(f, 10):
        # Each line already ends with a newline; don't let print add a second one.
        print(line, end='')

In [None]:
# Read each file without a header row and stack the pieces into a single frame
# with a fresh running index, then summarise the result.
df = pd.concat(
    (pd.read_csv(path, header=None) for path in files),
    ignore_index=True,
)
df.info()

In [None]:
# Preview the first rows; the files were read with header=None, so the columns are still unnamed.
df.head()

In [None]:
# Give the five positional columns descriptive names, then preview the result.
# NOTE(review): the meanings are inferred from the names (state abbreviation, sex,
# birth year, given name, count) — confirm against the SSA file description.
df.columns = ['stabbr', 'sex', 'birth_year', 'baby_name', 'name_count']
df.head()

In [None]:
# Sum of the name_count column across every row.
df.name_count.sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Inspect 10 randomly chosen rows (no random_state is passed, so the rows differ between runs).
df.sample(10)

In [None]:
# names to look up
name_list = ['Greg', 'Peter', 'Bobby', 'Mike']

# keep only the rows whose baby_name is one of those names
df.loc[df['baby_name'].isin(name_list)]

## Plotting

In [None]:
def plot_names(name_list=None):
    """Draw a stacked area chart of yearly counts for the given baby names.

    Reads the notebook-level ``df`` frame, keeps the rows whose ``baby_name``
    is in ``name_list``, sums ``name_count`` per ``birth_year`` and name, and
    plots one stacked area per name.

    Parameters
    ----------
    name_list : list of str, optional
        Names to chart. ``None`` is treated as an empty list.
    """
    # A None sentinel replaces the mutable ``[]`` default argument.
    names = [] if name_list is None else name_list

    selected = df[df['baby_name'].isin(names)]
    yearly_counts = selected.pivot_table(
        index='birth_year',
        columns='baby_name',
        values='name_count',
        # The string alias avoids the pandas FutureWarning raised for np.sum.
        aggfunc='sum',
    )
    yearly_counts.plot.area(colormap='coolwarm', alpha=.4, figsize=(14, 4), stacked=True)
    
# Chart the names held in name_list.
plot_names(name_list)

In [None]:
# Same chart for a different set of names.
plot_names(['Marcia', 'Jan', 'Cindy', 'Carol'])

In [None]:
def plot_sex(name: str):
    """Plot yearly counts by sex for every name that starts with ``name``.

    Reads the notebook-level ``df`` frame. Matching is by prefix, so a short
    string such as ``'Jes'`` pools every name beginning with those letters.
    One area subplot is drawn per value of ``sex``, titled with ``name``.

    Parameters
    ----------
    name : str
        Prefix (or full name) to match against ``baby_name``.
    """
    # The original signature ``name=str`` made the *type* ``str`` the default
    # value; an annotation was intended, so ``name`` is now a required string.
    matches = df[df['baby_name'].str.startswith(name)]
    counts_by_sex = matches.pivot_table(
        index='birth_year',
        columns='sex',
        values='name_count',
        # The string alias avoids the pandas FutureWarning raised for np.sum.
        aggfunc='sum',
    )
    counts_by_sex.plot.area(
        colormap='Spectral', alpha=.4, figsize=(14, 6), subplots=True, title=name
    )
    
# Compare counts by sex for names beginning with 'Leslie'.
plot_sex('Leslie')

In [None]:
# A short prefix pools every name that begins with 'Jes'.
plot_sex('Jes')

## Creating New Measures

In [None]:
# the first character of every name becomes its own column

df['initial'] = df['baby_name'].str.get(0)
df.head()

In [None]:
# store the number of characters in each name as a new name_len column

df['name_len'] = df['baby_name'].str.len()
df.head()

In [None]:
# turn the year into a datetime pinned to December 31 of that year;
# the explicit format states the YYYYMMDD layout instead of leaving pandas to infer it

df['year_end'] = pd.to_datetime(df['birth_year'].astype(str) + '1231', format='%Y%m%d')
df.head()

In [None]:
# map the one-letter sex codes to labels: M -> Male, F -> Female
# (Series.map leaves NaN for any code that is not a key of the dict)

df['gender'] = df['sex'].map({'M':'Male', 'F':'Female'})
df.head()

## Plot Using New Measures

In [None]:
# Line plot of the summed name_count for each year_end value.
df.groupby('year_end')['name_count'].sum().plot();

In [None]:
# Yearly totals with one line per sex code. The string alias 'sum' replaces
# np.sum, which triggers a FutureWarning in recent pandas releases.
# NOTE(review): colours are matched to the pivoted columns by position — pink
# for the first column, slateblue for the second; confirm that is F then M.
df.pivot_table(index='year_end', columns='sex', values='name_count', aggfunc='sum')\
.plot(color=['pink', 'slateblue']);

In [None]:
# For birth years from 2008 on, total the counts per first initial and draw one
# small line chart per initial in a 6x6 grid. Combinations with no rows are
# filled with 0. The string alias 'sum' replaces np.sum, which triggers a
# FutureWarning in recent pandas releases.
df.query('birth_year >= 2008')\
.pivot_table(index='birth_year', columns='initial', values='name_count', fill_value=0, aggfunc='sum')\
.plot(subplots=True, layout=(6,6), figsize=(20,20), fontsize=7, grid=True, marker='o', linewidth=.7);

In [None]:
# Horizontal box plot of name_len; every row counts once (name_count is not used as a weight).
df.boxplot(column='name_len', vert=False); 

In [None]:
# One name_len box per value of initial; rot=0 keeps the tick labels horizontal and grid=False hides the grid.
df.boxplot(column='name_len', by='initial', rot=0, grid=False);