# Laboratorio - Joins con Pandas

Las operaciones de tipo merge aparecen frecuentemente cuando debemos combinar datos de diversas fuentes. 

En este ejemplo vamos a ver datos de población y área por estado en EEUU y vamos a crear un ranking de los estados por su densidad de población total en el año 2010.


In [195]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np


In [196]:
# Crear dataframes con los archivos state-population.csv, state-areas.csv y state-abbrevs.csv
dfabb = pd.read_csv('state-abbrevs.csv')
dfarea = pd.read_csv('state-areas.csv')
dfpopu = pd.read_csv('state-population.csv')

dfabb.head()



Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [197]:
dfarea.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [198]:
dfpopu.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [199]:
# Unir la tabla de población con las abreviaturas para obtener el nombre completo del Estado

df_estate_full = pd.merge(dfabb, dfpopu, left_on="abbreviation", right_on="state/region", how='right')
df_estate_full.head()

Unnamed: 0,state,abbreviation,state/region,ages,year,population
0,Alabama,AL,AL,under18,2012,1117489.0
1,Alabama,AL,AL,total,2012,4817528.0
2,Alabama,AL,AL,under18,2010,1130966.0
3,Alabama,AL,AL,total,2010,4785570.0
4,Alabama,AL,AL,under18,2011,1125763.0


In [200]:
# Eliminar las columnas duplicadas
df_estate_full = df_estate_full.drop(['abbreviation'], axis=1)


In [201]:
df_estate_full.head()

Unnamed: 0,state,state/region,ages,year,population
0,Alabama,AL,under18,2012,1117489.0
1,Alabama,AL,total,2012,4817528.0
2,Alabama,AL,under18,2010,1130966.0
3,Alabama,AL,total,2010,4785570.0
4,Alabama,AL,under18,2011,1125763.0


In [202]:
# Verificar la existencia de valores nulos
df_estate_full.isnull().sum()

state           96
state/region     0
ages             0
year             0
population      20
dtype: int64

In [203]:
# Inverstigar los valores nulos ¿Siguen algún patrón los datos faltantes?
df_estate_full[df_estate_full['state'].isnull()]


Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
2453,,PR,under18,1993,
2454,,PR,under18,1992,
2455,,PR,total,1992,
2456,,PR,under18,1994,
2457,,PR,total,1994,


In [204]:
df_estate_full.loc[df_estate_full['state/region'] == 'PR']

Unnamed: 0,state,state/region,ages,year,population
2448,,PR,under18,1990,
2449,,PR,total,1990,
2450,,PR,total,1991,
2451,,PR,under18,1991,
2452,,PR,total,1993,
2453,,PR,under18,1993,
2454,,PR,under18,1992,
2455,,PR,total,1992,
2456,,PR,under18,1994,
2457,,PR,total,1994,


In [205]:
# ¿Qué pasa con el estado de Puerto Rico? Corregir la tabla para incluir el Estado cuando la abreviatura no funciona
df_estate_full.loc[df_estate_full['state/region'] == 'PR', 'state'] = 'Puerto Rico'

In [206]:
df_estate_full[df_estate_full['state'].isnull()]

Unnamed: 0,state,state/region,ages,year,population
2496,,USA,under18,1990,64218512.0
2497,,USA,total,1990,249622814.0
2498,,USA,total,1991,252980942.0
2499,,USA,under18,1991,65313018.0
2500,,USA,under18,1992,66509177.0
2501,,USA,total,1992,256514231.0
2502,,USA,total,1993,259918595.0
2503,,USA,under18,1993,67594938.0
2504,,USA,under18,1994,68640936.0
2505,,USA,total,1994,263125826.0


In [207]:
df_estate_full.loc[df_estate_full['state/region'] == 'USA', 'state'] = 'Estados Unidos'

In [208]:
df_estate_full.isnull().sum()

state            0
state/region     0
ages             0
year             0
population      20
dtype: int64

In [209]:
# Unir la tabla de población y abreviaturas con la de áreas
df_estate_full2 = pd.merge(df_estate_full, dfarea, how = 'left')

In [210]:
df_estate_full2.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012,1117489.0,52423.0
1,Alabama,AL,total,2012,4817528.0,52423.0
2,Alabama,AL,under18,2010,1130966.0,52423.0
3,Alabama,AL,total,2010,4785570.0,52423.0
4,Alabama,AL,under18,2011,1125763.0,52423.0


In [211]:
# Identificar los datos faltantes
df_estate_full2.isnull().sum()


state             0
state/region      0
ages              0
year              0
population       20
area (sq. mi)    48
dtype: int64

In [212]:
# Descartar los datos faltantes, si fuera conveniente con dropna
dffinal = df_estate_full2.dropna()


In [213]:
dffinal.isnull().sum()

state            0
state/region     0
ages             0
year             0
population       0
area (sq. mi)    0
dtype: int64

In [214]:
dffinal.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
0,Alabama,AL,under18,2012,1117489.0,52423.0
1,Alabama,AL,total,2012,4817528.0,52423.0
2,Alabama,AL,under18,2010,1130966.0,52423.0
3,Alabama,AL,total,2010,4785570.0,52423.0
4,Alabama,AL,under18,2011,1125763.0,52423.0


In [216]:
# Filtrar la tabla para el año 2010 y el tipo de población relevante para calcular la densidad total
df2010 = dffinal.loc[(dffinal['ages'] == 'total') & (dffinal['year'] == 2010)]

In [217]:
df2010.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
3,Alabama,AL,total,2010,4785570.0,52423.0
91,Alaska,AK,total,2010,713868.0,656425.0
101,Arizona,AZ,total,2010,6408790.0,114006.0
189,Arkansas,AR,total,2010,2922280.0,53182.0
197,California,CA,total,2010,37333601.0,163707.0


In [218]:

dfdensidad = df2010['population']/df2010['area (sq. mi)']
dfdensidad.head()

3       91.287603
91       1.087509
101     56.214497
189     54.948667
197    228.051342
dtype: float64

In [221]:
df2010.append(dfdensidad, ignore_index=True)
df2010.head()

Unnamed: 0,state,state/region,ages,year,population,area (sq. mi)
3,Alabama,AL,total,2010,4785570.0,52423.0
91,Alaska,AK,total,2010,713868.0,656425.0
101,Arizona,AZ,total,2010,6408790.0,114006.0
189,Arkansas,AR,total,2010,2922280.0,53182.0
197,California,CA,total,2010,37333601.0,163707.0


In [None]:
# Calcular en una nueva serie la densidad de población para 2010 y generar un ranking de estados
densidad.sort_values(ascending=True)
