# Laboratorio - Joins con Pandas

Las operaciones de tipo merge aparecen frecuentemente cuando debemos combinar datos de diversas fuentes. 

En este ejemplo vamos a ver datos de población y área por estado en EE. UU. y vamos a crear un ranking de los estados por su densidad de población total en el año 2010.


In [1]:
# Importar las librerías necesarias

import pandas as pd
from IPython.display import display

In [2]:
# Load the three source tables: population by state/year, state areas,
# and the state-name <-> abbreviation lookup.
pop, areas, abb = (
    pd.read_csv(csv_name)
    for csv_name in ('state-population.csv', 'state-areas.csv', 'state-abbrevs.csv')
)

In [3]:
# Preview the first rows of the population table, then report its dimensions.
display(pop.head(n=5))
print('pop.shape = {}'.format(pop.shape))

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


pop.shape = (2544, 4)


In [4]:
# Preview the first rows of the areas table, then report its dimensions.
display(areas.head(n=5))
print('areas.shape = {}'.format(areas.shape))

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


areas.shape = (52, 2)


In [5]:
# Preview the first rows of the abbreviations table, then report its dimensions.
display(abb.head(n=5))
print('abb.shape = {}'.format(abb.shape))

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


abb.shape = (51, 2)


In [6]:
# Attach the full state name to every population row by matching the
# population table's code against the abbreviation lookup. A left join keeps
# population rows whose code has no match (their 'state' is left null).
pop_name = pop.merge(
    abb,
    how='left',
    left_on='state/region',
    right_on='abbreviation',
)

In [7]:
# Remove the duplicated key column: after the merge, 'abbreviation' carries
# the same information as 'state/region'.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is
# already gone.
pop_name = pop_name.drop(columns='abbreviation', errors='ignore')

In [8]:
# Show the first rows of the merged population/name table.
pop_name.head(n=5)

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [9]:
# Count missing values per column.
pop_name.isna().sum()

state/region     0
ages             0
year             0
population      20
state           96
dtype: int64

In [10]:
# Investigate the missing values: do the rows without a state name follow a pattern?
# A fixed random_state makes the sample (and therefore the displayed output)
# reproducible across Restart & Run All.
pop_name[pop_name['state'].isnull()].sample(50, random_state=42)

Unnamed: 0,state/region,ages,year,population,state
2469,PR,under18,2000,1089063.0,
2486,PR,under18,2013,814068.0,
2500,USA,under18,1992,66509177.0,
2485,PR,under18,2008,945705.0,
2463,PR,total,1998,,
2511,USA,under18,1997,70920738.0,
2529,USA,under18,2006,73757714.0,
2461,PR,total,1996,,
2513,USA,total,1998,275854116.0,
2524,USA,total,2004,292805298.0,


In [11]:
# List the distinct region codes that did not get a state name from the merge.
state_is_missing = pop_name['state'].isna()
pop_name.loc[state_is_missing, 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [12]:
# What about Puerto Rico? Fill in the state name by hand for the region codes
# that the abbreviation lookup could not resolve.
for region_code, full_name in (('PR', 'Puerto Rico'), ('USA', 'United States')):
    pop_name.loc[pop_name['state/region'] == region_code, 'state'] = full_name

In [13]:
# Re-check missing values per column after filling in the state names.
pop_name.isna().sum()

state/region     0
ages             0
year             0
population      20
state            0
dtype: int64

In [14]:
# Join the population/name table with the areas table.
# First, look at both inputs side by side to pick the join key.
display(pop_name.head(), areas.head())

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [15]:
# List the distinct state names present in the areas table.
areas.loc[:, 'state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
       'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
       'District of Columbia', 'Puerto Rico'], dtype=object)

In [16]:
# Left-join areas onto the population rows by state name; population rows
# with no matching area keep a null 'area (sq. mi)'.
pop_full = pop_name.merge(areas, how='left', on='state')

In [17]:
# Identify missing data: count nulls per column of the fully merged table.
pop_full.isna().sum()

state/region      0
ages              0
year              0
population       20
state             0
area (sq. mi)    48
dtype: int64

In [18]:
# Discard rows with missing data: any row holding a null in any column is removed.
pop_full = pop_full.dropna(axis='index', how='any')

In [19]:
# Confirm that no missing values remain after dropping incomplete rows.
pop_full.isna().sum()

state/region     0
ages             0
year             0
population       0
state            0
area (sq. mi)    0
dtype: int64

In [20]:
# Keep only the rows needed for total density: year 2010 and the 'total' age group.
is_2010 = pop_full['year'] == 2010
is_total_population = pop_full['ages'] == 'total'
data_2010 = pop_full[is_2010 & is_total_population]

In [21]:
# Show the first rows of the 2010 total-population subset.
data_2010.head(n=5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0


In [22]:
# Compute the 2010 population density (people per square mile) as a new
# Series labelled by state, ready to be ranked.
# Re-index by state only if that has not been done yet: an unconditional
# set_index('state', inplace=True) raises KeyError when this cell is re-run,
# because 'state' is no longer a column. Rebinding also avoids mutating a
# filtered slice of pop_full in place.
if data_2010.index.name != 'state':
    data_2010 = data_2010.set_index('state')
density = data_2010['population'] / data_2010['area (sq. mi)']


In [24]:
# Rank the states from highest to lowest density.
density = density.sort_values(ascending=False)

In [25]:
# Show the first five entries of the density ranking.
density.head(n=5)

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64