# Introduction to Python  

## Pandas - Exploratory Data Analysis (EDA) part 2

In [1]:
import os
import zipfile
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display, Image
from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

## Pandas Data Manipulation: Baby names in the USA

In [2]:
# Read the 1880 file; `names=` supplies the column labels for the CSV.
names1880 = pd.read_csv(
    os.path.join('../Data', 'names', 'yob1880.txt'),
    names=['name', 'sex', 'births'],
)

FileNotFoundError: [Errno 2] No such file or directory: '../Data/names/yob1880.txt'

In [None]:
# Show the first twenty rows.
names1880.head(20)

In [None]:
# Show the last five rows.
names1880.tail(5)

In [None]:
# Total births per sex in the 1880 file.
names1880.groupby('sex')['births'].sum()

In [None]:
years = range(1880, 2020)
columns = ['name', 'sex', 'births']

# Read one file per year and tag every row with the year it came from.
pieces = [
    pd.read_csv(
        os.path.join('../Data', 'names', 'yob{}.txt'.format(year)),
        names=columns,
    ).assign(year=year)
    for year in years
]

# Stack the yearly frames into one DataFrame with a fresh 0..n-1 index.
names = pd.concat(pieces, ignore_index=True)

In [None]:
# Preview the first ten rows of `names`.
names.head(10)

In [None]:
# Print a concise summary of `names` (columns, dtypes, non-null counts, memory usage).
names.info()

In [None]:
# Total births per sex across every loaded year.
names.groupby('sex')['births'].sum()

In [None]:
# Sum births into a year (rows) x sex (columns) table.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas, and the string form matches the pivot at the end of the notebook.
total_births = names.pivot_table(
    values='births', index='year', columns='sex', aggfunc='sum'
)

In [None]:
# Display the `total_births` table (pandas truncates long frames when rendering).
total_births

In [None]:
# Show the five most recent rows.
total_births.tail(5)

In [None]:
# One line per column; the trailing semicolon hides the Axes repr.
total_births.plot.line(
    title='Total births by sex and year',
    figsize=(12, 6),
);

In [None]:
def add_prop(group):
    """Return a copy of ``group`` with a ``percent`` column added.

    ``percent`` is each row's ``births`` divided by the sum of ``births``
    over the whole group, i.e. a fraction of the group total (it is not
    scaled to 100).

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched. pandas documents that
        functions passed to ``groupby.apply`` must not mutate their argument.
    """
    births = group['births'].astype(float)
    return group.assign(percent=births / births.sum())

# Add each row's share of the births recorded for its (year, sex) group.
# transform('sum') broadcasts the group total back onto every row, so the
# original flat row index is kept. groupby.apply() on pandas >= 2.0 would
# prepend 'year'/'sex' index levels, making later groupby(['year', 'sex'])
# calls ambiguous between index level and column.
names = names.assign(
    percent=names['births']
    / names.groupby(['year', 'sex'])['births'].transform('sum')
)

In [None]:
# Preview the first five rows.
names.head(5)

In [None]:
# Rows whose share of their group exceeds 8.5%.
names.loc[names['percent'] > 0.085]

In [None]:
# Rows whose name begins with 'Stephany'.
names.loc[names['name'].str.startswith('Stephany')]

In [None]:
# Sanity check: the shares within every (year, sex) group should add up to 1.
np.allclose(names.groupby(['year', 'sex'])['percent'].sum(), 1.0)

In [None]:
def get_top1000(group):
    """Return the (at most) 1000 rows of ``group`` with the largest ``births``.

    Rows come back ordered by ``births``, largest first.
    """
    ranked = group.sort_values('births', ascending=False)
    return ranked.head(1000)

grouped = names.groupby(['year', 'sex'])

# Take the top rows of every (year, sex) group and stack them under a
# ('year', 'sex', <original row label>) index. Iterating the groups and
# concatenating gives the same layout as grouped.apply(get_top1000), but keeps
# the 'year' and 'sex' columns on every pandas version: apply() operating on
# the grouping columns is deprecated since pandas 2.2.
top1000 = pd.concat(
    {key: get_top1000(group) for key, group in grouped},
    names=['year', 'sex'],
)

In [None]:
# Render floats with thousands separators and three decimals from here on.
pd.set_option('display.float_format', '{:,.3f}'.format)
top1000.head(15)

In [None]:
# Split the top-1000 table by the value of the 'sex' column.
boys = top1000.loc[top1000['sex'] == 'M']
girls = top1000.loc[top1000['sex'] == 'F']

In [None]:
# Every top-1000 boys' row for the name 'Walter'; show the first ten.
Walter_names = boys.loc[boys['name'] == 'Walter']
Walter_names.head(10)

In [None]:
# Remove the 'sex' and 'year' columns by rebinding rather than mutating in
# place. errors='ignore' makes the cell safe to re-run: the original in-place
# drop raised KeyError the second time because the columns were already gone.
# NOTE(review): the pivot below still groups on 'year', so it presumably
# resolves to an index level of `top1000` - confirm.
top1000 = top1000.drop(columns=['sex', 'year'], errors='ignore')

# Births per name and year; aggregation named by string because passing the
# builtin `sum` is deprecated in recent pandas.
total_births_top1000 = top1000.pivot_table(
    values='births', index='year', columns='name', aggfunc='sum'
)

In [None]:
# Preview the first five years.
total_births_top1000.head(5)

In [None]:
# One panel per selected name.
subset = total_births_top1000.loc[:, ['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(
    subplots=True,
    figsize=(12, 12),
    grid=False,
    title="Number of births per year",
)

In [None]:
# Sum of the 'percent' values in top1000 per year and sex.
# Aggregation named by string: passing the builtin `sum` is deprecated in
# recent pandas.
table = top1000.pivot_table(
    values='percent', index='year', columns='sex', aggfunc='sum'
)

# Title corrected to name the frame actually plotted (`top1000`; there is no
# `table1000` in this notebook).
table.plot(
    title='Sum of top1000.percent by year and sex',
    yticks=np.linspace(0, 1.2, 13),
    xticks=range(1880, 2020, 10),
    figsize=(12, 8),
)

In [None]:
# Boys' rows for 2019, then the running total of 'percent' from the largest
# share downwards.
df = boys.loc[boys['year'] == 2019]
prop_cumsum = df.sort_values('percent', ascending=False)['percent'].cumsum()

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``percent`` values are needed to reach ``q``.

    The ``percent`` column is accumulated from largest to smallest; the
    result is the 1-based position of the first running total that is at
    least ``q``.
    """
    running_share = group['percent'].sort_values(ascending=False).cumsum()
    # searchsorted yields a 0-based insertion point, hence the + 1.
    return running_share.to_numpy().searchsorted(q) + 1

# 0-based position at which the running total first reaches 0.5.
np.searchsorted(prop_cumsum.values, 0.5)

In [None]:
# Quantile count per (year, sex) group, reshaped so each sex is a column.
diversity = (
    top1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
diversity.head()

In [None]:
# Line chart of the quantile counts, one line per column.
diversity.plot.line(
    title="Number of popular names in top 50%",
    figsize=(12, 8),
)

In [None]:
# Extract the last letter from the name column.

def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


last_letters = names['name'].map(get_last_letter).rename('last_letter')

# Births per last letter (rows) for every (sex, year) pair (columns).
# Aggregation named by string: passing the builtin `sum` is deprecated in
# recent pandas.
table = names.pivot_table(
    values='births', index=last_letters, columns=['sex', 'year'], aggfunc='sum'
)

# Keep only three years from the 'year' column level.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

In [None]:
# Total of each `subtable` column (DataFrame.sum adds down the rows by default).
subtable.sum()

In [None]:
# Divide every column by its own total.
letter_prop = subtable.div(subtable.sum().astype(float))

In [None]:
# Two stacked panels: 'M' columns on top, 'F' columns below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
letter_prop['M'].plot.bar(rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot.bar(rot=0, ax=axes[1], title='Female', legend=False)

In [None]:
# Same normalisation as before, now over the full table.
letter_prop = table.div(table.sum().astype(float))

# Rows 'd', 'n', 'y' of the 'M' columns, transposed so the letters become columns.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].transpose()
dny_ts.head()

In [None]:
# Display `letter_prop` (pandas truncates large frames when rendering).
letter_prop

In [None]:
# One line per column of `dny_ts`.
dny_ts.plot.line(figsize=(12, 8))

In [None]:
# Distinct names in top1000, then those containing 'lesl' (case-insensitive).
all_names = top1000['name'].unique()
mask = np.array([candidate.lower().find('lesl') != -1 for candidate in all_names])
lesley_like = all_names[mask]
lesley_like

In [None]:
# Keep only the Lesley-like names and total their births per spelling.
filtered = top1000.loc[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()

In [None]:
# Births per year and sex for the filtered names...
table = filtered.pivot_table(
    values='births', index='year', columns='sex', aggfunc='sum'
)
# ...then divide each row by its total so the entries become shares of that year.
table = table.div(table.sum(axis=1), axis=0)
table.tail()

In [None]:
# Solid black line for 'M', dashed black line for 'F'.
table.plot.line(
    style={'M': 'k-', 'F': 'k--'},
    figsize=(12, 8),
)