In [17]:
import pandas as pd
import numpy as np
import requests
import ast

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option("display.max_columns", None)

# Pair Programming ETL Transformación I

Tendréis que usar el csv attacks_limpieza_completa que tenéis adjunto abajo.

En la lección de hoy aprendimos cómo transformar nuestros datos para que estén preparados para almacenarlos en una BBDD. En este momento tenemos dos fuentes de datos:

    1. El csv con los ataques de tiburones que hemos estado limpiando hasta ahora, el que os hemos adjuntado (attacks_limpieza_completa). Sentíos libres de usar vuestros propios csv en caso de que queráis.

    2. El csv con los datos climáticos de los principales países que tienen ataques de tiburones, el que creamos en el pair programming de ayer.

El objetivo de la sesión de hoy será juntar en un único csv la información de ambas fuentes. Para ello:

- Cargaremos los dos ficheros de datos
- Del dataframe de los ataques nos quedaremos solo con las filas de los países que seleccionamos en la lección de ayer:

    - USA
    - Australia
    - New Zealand
    - South Africa
    - Papua New Guinea

- Del dataframe de los datos climáticos seleccionaremos todas las columnas.

- Cuando ya tengamos todos los datos deseados juntaremos los dos csv.
Para hacer esta unión tendremos que hacer un groupby en la tabla de clima para sacar una media de las medidas climáticas por país.

- Antes de hacer el groupby, si nos fijamos, tenemos dos columnas, rh_profile y wind_profile, cuya información es una lista de diccionarios. Si intentamos hacer la media de eso no nos dará un valor real. A este problema ya nos enfrentamos en la clase invertida de ETL-2, donde teníais un Bonus para desempaquetar esta información. En caso de que en aquel ejercicio no lo consiguierais, os dejamos por aquí una posible solución que nos permite separar esa información en distintas columnas. Os dejamos el código documentado. ⚠️ Os recomendamos que vayáis desgranando el código y viendo lo que nos devuelve cada línea de código para entenderlo mejor.

In [18]:
# Load the shark-attack dataset; the first CSV column becomes the index.
df = pd.read_csv(
    filepath_or_buffer="../data/05-tiburon_4.csv",
    index_col=0,
)

In [19]:
# Preview the first five attack records.
df.head(n=5)

Unnamed: 0,type,country,activity,species,month,fatal,gender,year,age
0,Boating,usa,Paddling,White,Jun,N,F,2018.0,57.0
1,Unprovoked,brazil,Swimming,Tiger,Jun,Y,M,2018.0,18.0
2,Unprovoked,usa,Walking,Bull,May,N,M,2018.0,15.0
3,Provoked,australia,Feeding sharks,Grey,May,N,M,2018.0,32.0
4,Invalid,england,Fishing,Unspecified,May,N,M,2018.0,21.0


In [20]:
# Load the climate dataset; the first CSV column becomes the index.
df_clima = pd.read_csv(
    filepath_or_buffer="../data/00-datos_clima.csv",
    index_col=0,
)

In [21]:
# Preview the first two climate records.
df_clima.head(n=2)

Unnamed: 0,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country,latitud,longitud
0,3,7,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 150, 'speed':...",16,6,4,1024,none,0,0,110,2,usa,39.78373,-100.445882
1,6,8,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 265, 'speed':...",15,6,6,1025,none,0,0,220,2,usa,39.78373,-100.445882


In [22]:
# Distinct countries present in the climate data, in order of first appearance.
lista_paises = df_clima["country"].drop_duplicates().tolist()
lista_paises

['usa', 'australia', 'south africa', 'new zealand', 'papua new guinea']

In [23]:
# Keep only the attacks whose country also appears in the climate data.
tiene_clima = df["country"].isin(lista_paises)
df_pais = df.loc[tiene_clima]
df_pais.head(n=5)

Unnamed: 0,type,country,activity,species,month,fatal,gender,year,age
0,Boating,usa,Paddling,White,Jun,N,F,2018.0,57.0
2,Unprovoked,usa,Walking,Bull,May,N,M,2018.0,15.0
3,Provoked,australia,Feeding sharks,Grey,May,N,M,2018.0,32.0
6,Unprovoked,australia,Surfing,Unspecified,Apr,N,M,2018.0,60.0
8,Unprovoked,south africa,Paddle-skiing,White,Apr,N,M,2018.0,33.0


In [24]:
# Turn the text stored in the profile columns into real Python objects
# (lists of dicts) so they can later be split into separate columns.
def _parse_profile(valor):
    """Parse a string cell with ast.literal_eval; return any other value unchanged.

    Passing non-strings through keeps the cell safe to re-run: on a second
    run the cells already hold lists, which ast.literal_eval would reject.
    """
    return ast.literal_eval(valor) if isinstance(valor, str) else valor


for columna in ("rh_profile", "wind_profile"):
    df_clima[columna] = df_clima[columna].apply(_parse_profile)

In [25]:
# Spread each row's rh_profile list across columns: one column per list
# position, each cell holding the dict found at that position.
perfil_rh = df_clima["rh_profile"]
df_rh = perfil_rh.apply(pd.Series)
df_rh.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'rh': 5}","{'layer': '900mb', 'rh': 7}","{'layer': '850mb', 'rh': 8}","{'layer': '800mb', 'rh': 13}","{'layer': '750mb', 'rh': 11}","{'layer': '700mb', 'rh': 7}","{'layer': '650mb', 'rh': 3}","{'layer': '600mb', 'rh': 1}","{'layer': '550mb', 'rh': 3}","{'layer': '500mb', 'rh': 10}","{'layer': '450mb', 'rh': 11}","{'layer': '400mb', 'rh': 11}","{'layer': '350mb', 'rh': 13}","{'layer': '300mb', 'rh': 15}","{'layer': '250mb', 'rh': 15}","{'layer': '200mb', 'rh': 14}"
1,"{'layer': '950mb', 'rh': 6}","{'layer': '900mb', 'rh': 7}","{'layer': '850mb', 'rh': 6}","{'layer': '800mb', 'rh': 8}","{'layer': '750mb', 'rh': 11}","{'layer': '700mb', 'rh': 6}","{'layer': '650mb', 'rh': 3}","{'layer': '600mb', 'rh': 8}","{'layer': '550mb', 'rh': 15}","{'layer': '500mb', 'rh': 16}","{'layer': '450mb', 'rh': 16}","{'layer': '400mb', 'rh': 16}","{'layer': '350mb', 'rh': 16}","{'layer': '300mb', 'rh': 12}","{'layer': '250mb', 'rh': 14}","{'layer': '200mb', 'rh': 6}"


In [26]:
# Number of columns currently in the climate frame.
len(df_clima.columns)

19

In [27]:
# Unpack rh_profile: for every list position, add a column "rh_<layer>" to the
# climate frame holding the "rh" value of that position's dict.
for i in range(len(df_rh.columns)):
    capa = df_rh[i]
    # Name the column after the layer found in the first row. Positional
    # lookup (.iloc) avoids depending on the index containing the label 0.
    # Assumes the first row holds a dict at every position.
    nombre = "rh_" + str(capa.iloc[0]["layer"])
    # Skip columns that already exist so re-running the cell does not make
    # DataFrame.insert raise ValueError.
    if nombre in df_clima.columns:
        continue
    # Read the "rh" value straight from each dict instead of expanding every
    # dict into a Series twice per iteration; non-dict cells become NaN.
    valores = [d.get("rh", np.nan) if isinstance(d, dict) else np.nan for d in capa]
    # Insert at position i so the new columns keep the layer order at the front.
    df_clima.insert(i, nombre, valores)

In [28]:
# Check the first row to confirm the rh_* columns were added.
df_clima.head(n=1)

Unnamed: 0,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country,latitud,longitud
0,5,7,8,13,11,7,3,1,3,10,11,11,13,15,15,14,3,7,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 150, 'speed':...",16,6,4,1024,none,0,0,110,2,usa,39.78373,-100.445882


In [29]:
# Spread each row's wind_profile list across columns: one column per list
# position, each cell holding the dict found at that position.
perfil_viento = df_clima["wind_profile"]
df_wind = perfil_viento.apply(pd.Series)
df_wind.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,"{'layer': '950mb', 'direction': 150, 'speed': 2}","{'layer': '900mb', 'direction': 240, 'speed': 2}","{'layer': '850mb', 'direction': 260, 'speed': 3}","{'layer': '800mb', 'direction': 275, 'speed': 3}","{'layer': '750mb', 'direction': 280, 'speed': 4}","{'layer': '700mb', 'direction': 265, 'speed': 5}","{'layer': '650mb', 'direction': 270, 'speed': 5}","{'layer': '600mb', 'direction': 265, 'speed': 6}","{'layer': '550mb', 'direction': 255, 'speed': 6}","{'layer': '500mb', 'direction': 250, 'speed': 6}","{'layer': '450mb', 'direction': 245, 'speed': 6}","{'layer': '400mb', 'direction': 250, 'speed': 6}","{'layer': '350mb', 'direction': 250, 'speed': 6}","{'layer': '300mb', 'direction': 250, 'speed': 7}","{'layer': '250mb', 'direction': 245, 'speed': 7}","{'layer': '200mb', 'direction': 250, 'speed': 7}"
1,"{'layer': '950mb', 'direction': 265, 'speed': 2}","{'layer': '900mb', 'direction': 330, 'speed': 3}","{'layer': '850mb', 'direction': 340, 'speed': 4}","{'layer': '800mb', 'direction': 340, 'speed': 3}","{'layer': '750mb', 'direction': 325, 'speed': 3}","{'layer': '700mb', 'direction': 305, 'speed': 3}","{'layer': '650mb', 'direction': 295, 'speed': 4}","{'layer': '600mb', 'direction': 275, 'speed': 6}","{'layer': '550mb', 'direction': 265, 'speed': 6}","{'layer': '500mb', 'direction': 260, 'speed': 6}","{'layer': '450mb', 'direction': 250, 'speed': 7}","{'layer': '400mb', 'direction': 255, 'speed': 7}","{'layer': '350mb', 'direction': 255, 'speed': 7}","{'layer': '300mb', 'direction': 245, 'speed': 7}","{'layer': '250mb', 'direction': 250, 'speed': 7}","{'layer': '200mb', 'direction': 255, 'speed': 7}"


In [30]:
# Unpack wind_profile directions: for every list position, add a column
# "wind_direction<layer>" holding the "direction" value of that position's dict.
for i in range(len(df_wind.columns)):
    capa = df_wind[i]
    # Name the column after the layer found in the first row. Positional
    # lookup (.iloc) avoids depending on the index containing the label 0.
    # Assumes the first row holds a dict at every position.
    nombre = "wind_direction" + str(capa.iloc[0]["layer"])
    # Skip columns that already exist so re-running the cell does not make
    # DataFrame.insert raise ValueError.
    if nombre in df_clima.columns:
        continue
    # Read the value straight from each dict instead of expanding every dict
    # into a Series twice per iteration; non-dict cells become NaN.
    valores = [d.get("direction", np.nan) if isinstance(d, dict) else np.nan for d in capa]
    # Insert at position i so the new columns keep the layer order at the front.
    df_clima.insert(i, nombre, valores)

In [31]:
# Unpack wind_profile speeds: for every list position, add a column
# "wind_speed<layer>" holding the "speed" value of that position's dict.
for i in range(len(df_wind.columns)):
    capa = df_wind[i]
    # Name the column after the layer found in the first row. Positional
    # lookup (.iloc) avoids depending on the index containing the label 0.
    # Assumes the first row holds a dict at every position.
    nombre2 = "wind_speed" + str(capa.iloc[0]["layer"])
    # Skip columns that already exist so re-running the cell does not make
    # DataFrame.insert raise ValueError.
    if nombre2 in df_clima.columns:
        continue
    # Read the value straight from each dict instead of expanding every dict
    # into a Series twice per iteration; non-dict cells become NaN.
    valores2 = [d.get("speed", np.nan) if isinstance(d, dict) else np.nan for d in capa]
    # Insert at position i so the new columns keep the layer order at the front.
    df_clima.insert(i, nombre2, valores2)

In [32]:
# Check the first two rows to confirm the wind_* columns were added.
df_clima.head(n=2)

Unnamed: 0,wind_speed950mb,wind_speed900mb,wind_speed850mb,wind_speed800mb,wind_speed750mb,wind_speed700mb,wind_speed650mb,wind_speed600mb,wind_speed550mb,wind_speed500mb,wind_speed450mb,wind_speed400mb,wind_speed350mb,wind_speed300mb,wind_speed250mb,wind_speed200mb,wind_direction950mb,wind_direction900mb,wind_direction850mb,wind_direction800mb,wind_direction750mb,wind_direction700mb,wind_direction650mb,wind_direction600mb,wind_direction550mb,wind_direction500mb,wind_direction450mb,wind_direction400mb,wind_direction350mb,wind_direction300mb,wind_direction250mb,wind_direction200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,rh_profile,wind_profile,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country,latitud,longitud
0,2,2,3,3,4,5,5,6,6,6,6,6,6,7,7,7,150,240,260,275,280,265,270,265,255,250,245,250,250,250,245,250,5,7,8,13,11,7,3,1,3,10,11,11,13,15,15,14,3,7,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 5}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 150, 'speed':...",16,6,4,1024,none,0,0,110,2,usa,39.78373,-100.445882
1,2,3,4,3,3,3,4,6,6,6,7,7,7,7,7,7,265,330,340,340,325,305,295,275,265,260,250,255,255,245,250,255,6,7,6,8,11,6,3,8,15,16,16,16,16,12,14,6,6,8,-9999,-9999,-9999,"[{'layer': '950mb', 'rh': 6}, {'layer': '900mb...","[{'layer': '950mb', 'direction': 265, 'speed':...",15,6,6,1025,none,0,0,220,2,usa,39.78373,-100.445882


In [33]:
# Remove the raw rh_profile / wind_profile columns, which are no longer needed.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# the cell safe to re-run: a second run finds nothing to drop and does not
# raise KeyError.
df_clima = df_clima.drop(columns=["rh_profile", "wind_profile"], errors="ignore")

In [34]:
# Check the first row to confirm the raw profile columns are gone.
df_clima.head(n=1)

Unnamed: 0,wind_speed950mb,wind_speed900mb,wind_speed850mb,wind_speed800mb,wind_speed750mb,wind_speed700mb,wind_speed650mb,wind_speed600mb,wind_speed550mb,wind_speed500mb,wind_speed450mb,wind_speed400mb,wind_speed350mb,wind_speed300mb,wind_speed250mb,wind_speed200mb,wind_direction950mb,wind_direction900mb,wind_direction850mb,wind_direction800mb,wind_direction750mb,wind_direction700mb,wind_direction650mb,wind_direction600mb,wind_direction550mb,wind_direction500mb,wind_direction450mb,wind_direction400mb,wind_direction350mb,wind_direction300mb,wind_direction250mb,wind_direction200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_type,prec_amount,snow_depth,wind10m.direction,wind10m.speed,country,latitud,longitud
0,2,2,3,3,4,5,5,6,6,6,6,6,6,7,7,7,150,240,260,275,280,265,270,265,255,250,245,250,250,250,245,250,5,7,8,13,11,7,3,1,3,10,11,11,13,15,15,14,3,7,-9999,-9999,-9999,16,6,4,1024,none,0,0,110,2,usa,39.78373,-100.445882


In [35]:
# Average every numeric climate measurement per country.
# numeric_only=True is required because the frame also holds text columns
# (e.g. prec_type): older pandas dropped them silently with a FutureWarning,
# while pandas >= 2.0 raises TypeError when asked to average them.
# NOTE(review): highcloud / midcloud / lowcloud show -9999 in the previews,
# which looks like a missing-value sentinel; it is averaged as-is here.
# Confirm whether it should be converted to NaN first.
df_media_c = (
    df_clima
    .groupby("country")
    .mean(numeric_only=True)
    .reset_index()
)

  df_media_c = df_clima.groupby("country").mean().reset_index()


In [36]:
# Preview the per-country climate averages.
df_media_c.head(n=2)

Unnamed: 0,country,wind_speed950mb,wind_speed900mb,wind_speed850mb,wind_speed800mb,wind_speed750mb,wind_speed700mb,wind_speed650mb,wind_speed600mb,wind_speed550mb,wind_speed500mb,wind_speed450mb,wind_speed400mb,wind_speed350mb,wind_speed300mb,wind_speed250mb,wind_speed200mb,wind_direction950mb,wind_direction900mb,wind_direction850mb,wind_direction800mb,wind_direction750mb,wind_direction700mb,wind_direction650mb,wind_direction600mb,wind_direction550mb,wind_direction500mb,wind_direction450mb,wind_direction400mb,wind_direction350mb,wind_direction300mb,wind_direction250mb,wind_direction200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed,latitud,longitud
0,australia,3.5625,3.578125,3.546875,3.515625,3.421875,3.5,3.515625,3.546875,3.65625,3.59375,3.59375,3.546875,3.5625,3.921875,4.734375,5.203125,79.921875,80.546875,101.09375,102.109375,103.828125,99.609375,106.40625,96.796875,88.90625,80.46875,82.96875,89.921875,121.953125,129.921875,139.296875,142.109375,13.890625,11.4375,8.484375,6.375,5.296875,4.46875,3.796875,3.40625,3.078125,3.375,3.375,3.203125,2.453125,1.390625,3.5625,5.84375,97.5,2.640625,-9999.0,-9999.0,-9999.0,25.875,-3.828125,10.828125,1015.9375,2.796875,0.0,80.859375,3.296875,-24.776109,134.755
1,new zealand,3.34375,3.234375,3.171875,3.15625,3.1875,3.1875,3.28125,3.421875,3.546875,3.796875,4.03125,4.234375,4.65625,5.015625,6.265625,7.21875,141.875,143.4375,147.421875,155.390625,179.609375,190.703125,192.890625,212.421875,230.625,241.796875,255.546875,269.21875,264.921875,255.15625,251.953125,260.3125,13.171875,12.546875,4.625,2.578125,2.484375,2.8125,3.0625,3.0,2.71875,2.828125,2.734375,3.875,5.65625,7.09375,4.984375,1.640625,97.5,6.09375,-9999.0,-9999.0,-9999.0,14.953125,10.265625,10.296875,1020.90625,2.671875,0.0,133.828125,3.015625,-41.500083,172.834408


In [37]:
# Preview the filtered attacks frame.
df_pais.head(n=2)

Unnamed: 0,type,country,activity,species,month,fatal,gender,year,age
0,Boating,usa,Paddling,White,Jun,N,F,2018.0,57.0
2,Unprovoked,usa,Walking,Bull,May,N,M,2018.0,15.0


In [38]:
# Attach the per-country climate averages to every attack row. A left join
# on "country" keeps all attack rows.
df_pais_clima = df_pais.merge(
    right=df_media_c,
    how="left",
    on="country",
)

In [39]:
# Preview the first five rows of the merged attacks + climate frame.
df_pais_clima.head(n=5)

Unnamed: 0,type,country,activity,species,month,fatal,gender,year,age,wind_speed950mb,wind_speed900mb,wind_speed850mb,wind_speed800mb,wind_speed750mb,wind_speed700mb,wind_speed650mb,wind_speed600mb,wind_speed550mb,wind_speed500mb,wind_speed450mb,wind_speed400mb,wind_speed350mb,wind_speed300mb,wind_speed250mb,wind_speed200mb,wind_direction950mb,wind_direction900mb,wind_direction850mb,wind_direction800mb,wind_direction750mb,wind_direction700mb,wind_direction650mb,wind_direction600mb,wind_direction550mb,wind_direction500mb,wind_direction450mb,wind_direction400mb,wind_direction350mb,wind_direction300mb,wind_direction250mb,wind_direction200mb,rh_950mb,rh_900mb,rh_850mb,rh_800mb,rh_750mb,rh_700mb,rh_650mb,rh_600mb,rh_550mb,rh_500mb,rh_450mb,rh_400mb,rh_350mb,rh_300mb,rh_250mb,rh_200mb,timepoint,cloudcover,highcloud,midcloud,lowcloud,temp2m,lifted_index,rh2m,msl_pressure,prec_amount,snow_depth,wind10m.direction,wind10m.speed,latitud,longitud
0,Boating,usa,Paddling,White,Jun,N,F,2018.0,57.0,2.875,3.265625,3.75,4.15625,4.328125,4.421875,4.609375,5.0,5.28125,5.625,6.015625,6.375,6.890625,7.6875,8.25,8.40625,234.6875,265.546875,301.328125,313.203125,287.265625,305.625,291.328125,294.453125,293.125,293.4375,292.96875,288.4375,292.96875,284.375,274.140625,304.6875,4.375,5.140625,5.65625,4.71875,3.1875,2.59375,2.296875,2.1875,2.9375,4.0,4.578125,4.921875,5.3125,6.140625,7.296875,5.984375,97.5,4.59375,-9999.0,-9999.0,-9999.0,12.8125,12.328125,4.28125,1019.5625,0.484375,0.0,244.0625,2.671875,39.78373,-100.445882
1,Unprovoked,usa,Walking,Bull,May,N,M,2018.0,15.0,2.875,3.265625,3.75,4.15625,4.328125,4.421875,4.609375,5.0,5.28125,5.625,6.015625,6.375,6.890625,7.6875,8.25,8.40625,234.6875,265.546875,301.328125,313.203125,287.265625,305.625,291.328125,294.453125,293.125,293.4375,292.96875,288.4375,292.96875,284.375,274.140625,304.6875,4.375,5.140625,5.65625,4.71875,3.1875,2.59375,2.296875,2.1875,2.9375,4.0,4.578125,4.921875,5.3125,6.140625,7.296875,5.984375,97.5,4.59375,-9999.0,-9999.0,-9999.0,12.8125,12.328125,4.28125,1019.5625,0.484375,0.0,244.0625,2.671875,39.78373,-100.445882
2,Provoked,australia,Feeding sharks,Grey,May,N,M,2018.0,32.0,3.5625,3.578125,3.546875,3.515625,3.421875,3.5,3.515625,3.546875,3.65625,3.59375,3.59375,3.546875,3.5625,3.921875,4.734375,5.203125,79.921875,80.546875,101.09375,102.109375,103.828125,99.609375,106.40625,96.796875,88.90625,80.46875,82.96875,89.921875,121.953125,129.921875,139.296875,142.109375,13.890625,11.4375,8.484375,6.375,5.296875,4.46875,3.796875,3.40625,3.078125,3.375,3.375,3.203125,2.453125,1.390625,3.5625,5.84375,97.5,2.640625,-9999.0,-9999.0,-9999.0,25.875,-3.828125,10.828125,1015.9375,2.796875,0.0,80.859375,3.296875,-24.776109,134.755
3,Unprovoked,australia,Surfing,Unspecified,Apr,N,M,2018.0,60.0,3.5625,3.578125,3.546875,3.515625,3.421875,3.5,3.515625,3.546875,3.65625,3.59375,3.59375,3.546875,3.5625,3.921875,4.734375,5.203125,79.921875,80.546875,101.09375,102.109375,103.828125,99.609375,106.40625,96.796875,88.90625,80.46875,82.96875,89.921875,121.953125,129.921875,139.296875,142.109375,13.890625,11.4375,8.484375,6.375,5.296875,4.46875,3.796875,3.40625,3.078125,3.375,3.375,3.203125,2.453125,1.390625,3.5625,5.84375,97.5,2.640625,-9999.0,-9999.0,-9999.0,25.875,-3.828125,10.828125,1015.9375,2.796875,0.0,80.859375,3.296875,-24.776109,134.755
4,Unprovoked,south africa,Paddle-skiing,White,Apr,N,M,2018.0,33.0,2.5,2.359375,2.234375,2.15625,2.1875,2.140625,2.375,2.484375,2.734375,2.984375,3.546875,3.703125,4.015625,4.140625,4.53125,4.84375,150.078125,191.25,182.65625,179.296875,204.296875,222.03125,224.609375,220.703125,228.125,230.78125,232.8125,237.1875,233.828125,230.078125,223.4375,216.25,12.921875,9.71875,8.015625,8.609375,9.4375,9.8125,10.203125,8.1875,5.796875,4.78125,3.3125,2.125,2.984375,3.796875,5.21875,7.765625,97.5,6.015625,-9999.0,-9999.0,-9999.0,23.875,2.015625,10.21875,1018.859375,1.078125,0.0,145.703125,2.515625,-28.816624,24.991639


In [40]:
# Shape (rows, columns) of the merged attacks + climate frame.
df_pais_clima.shape

(1352, 72)

- Guardar los resultados obtenidos en un csv que usaremos en próximos ejercicios de pair programming.

In [41]:
# Persist the merged frame; the index is written as the first CSV column.
df_pais_clima.to_csv(
    path_or_buf="../data/00-ataques-clima.csv",
    index=True,
)