# Analyzing

In this jupyter notebook the dataframe obtained from 'code2_cleaning' will be analized. The objectives are:

- Mosquito per day obtention.

- API (Application Programming Interface) use to get climate data. API is a server that you can use to retrieve and send data to using code. APIs are most commonly used to retrieve data.

- Explain some basic results.

In [489]:
# Data treatment
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from datetime import date, datetime
import holidays
import requests #To check if an API link works
from IPython.core.interactiveshell import InteractiveShell #Show more than one output per cell
InteractiveShell.ast_node_interactivity = "all"
import ast

# API for accessing open weather and climate data
# ------------------------------------------------------------------------------
from meteostat import Point, Daily

# Graphs
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings configuration
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

# Library to create pickle files.
# ------------------------------------------------------------------------------
import pickle
import os

# Progress bar
# ------------------------------------------------------------------------------
from tqdm import tqdm

# Put a pause between API calls
# ------------------------------------------------------------------------------
import time

In [7]:
# To show all the columns of our dataframe.
pd.options.display.max_columns=None

In [639]:
# Create the first dataframe containing the data from the csv obtained from Gbif.
df_0 = pd.read_csv('../data/mosquito1_clean.csv', index_col=0)

# Check the first three rows to see how this dataframe looks like.
df_0.head(3)

Unnamed: 0,event_date,year,month,day,country_code,latitude,longitude,witness,issue
0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES
3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES


In [555]:
df_test1=df_0.head(10)
df_test1

Unnamed: 0,event_date,year,month,day,country_code,latitude,longitude,witness,issue
0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES
3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
4,2022-06-27,2022,6,27,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES
5,2021-10-20,2021,10,20,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES
7,2022-08-26,2022,8,26,IT,43.76617,11.27971,UNIROMA1;Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
8,2022-06-22,2022,6,22,IT,40.30343,17.72688,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES
9,2022-08-16,2022,8,16,ES,37.39345,-6.08236,Isis Sanpera-Calbet;Mikel Alexander González;D...,CONTINENT_DERIVED_FROM_COORDINATES
10,2022-08-13,2022,8,13,IT,45.57203,12.10044,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
12,2022-06-22,2022,6,22,IT,41.83784,12.69665,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES


In [659]:

dataz

Unnamed: 0,event_date,latitude,longitude


In [676]:
dataz_dict={'event_date':['1111-11-11'],'latitude':[11.11111],'longitude':[11.11111],
                'time':['1111-11-11'],'tavg':[11.1],'tmin':[11.1],'tmax':[11.1],'prcp':[1.1],
                'snow':[1.1],'wdir':[1.1],'wspd':[1.1],'wpgt':[1.1],'pres':[1.1],'tsun':[1.1]}
dataz=pd.DataFrame.from_dict(dataz_dict)
dataz

Unnamed: 0,event_date,latitude,longitude,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,1111-11-11,11.11111,11.11111,1111-11-11,11.1,11.1,11.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1


In [690]:
datay_dict={'event_date':['1111-11-11'],'latitude':[11.11111],'longitude':[11.11111],
                'time':['1111-11-11'],'tavg':[11.1],'tmin':[11.1],'tmax':[11.1],'prcp':[1.1],
                'snow':[1.1],'wdir':[1.1],'wspd':[1.1],'wpgt':[1.1],'pres':[1.1],'tsun':[1.1]}
datay=pd.DataFrame.from_dict(datay_dict)
datay.loc[1]={'event_date':'1111-11-11','latitude':11.11111,'longitude':11.11111,
                'time':'1111-11-11','tavg':11.1,'tmin':11.1,'tmax':11.1,'prcp':1.1,
                'snow':1.1,'wdir':1.1,'wspd':1.1,'wpgt':1.1,'pres':1.1,'tsun':1.1}
datay

Unnamed: 0,event_date,latitude,longitude,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,1111-11-11,11.11111,11.11111,1111-11-11,11.1,11.1,11.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1
1,1111-11-11,11.11111,11.11111,1111-11-11,11.1,11.1,11.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1


In [695]:
for i in range(2):
    print(i)

0
1


In [696]:
for i in tqdm(range(len(datay))):
        date=datay.iloc[i]['event_date']
        latitude=datay.iloc[i]['latitude']
        longitude=datay.iloc[i]['longitude']
        datay.loc[i]={'event_date':date,'latitude':latitude,'longitude':longitude,
                'time':'1111-11-11','tavg':11.1,'tmin':11.1,'tmax':11.1,'prcp':1.1,
                'snow':1.1,'wdir':1.1,'wspd':1.1,'wpgt':1.1,'pres':1.1,'tsun':1.1}
        #len(dataz)
        #dataz.iloc[0]['event_date']# = date
        #dataz.iloc[len(dataz)]['latitude'] = latitude
        #dataz.iloc[len(dataz)]['longitude'] = longitude
        #dataz.iloc[len(dataz)]['time'] = data.iloc[0]['time']
        #dataz.iloc[len(dataz)]['tavg'] = data.iloc[0]['tavg']
        #dataz.iloc[len(dataz)]['tmin'] = data.iloc[0]['tmin']
        #dataz.iloc[len(dataz)]['tmax'] = data.iloc[0]['tmax']
        #dataz.iloc[len(dataz)]['prcp'] = data.iloc[0]['prcp']
        #dataz.iloc[len(dataz)]['snow'] = data.iloc[0]['snow']
        #dataz.iloc[len(dataz)]['wdir'] = data.iloc[0]['wdir']
        #dataz.iloc[len(dataz)]['wspd'] = data.iloc[0]['wspd']
        #dataz.iloc[len(dataz)]['wpgt'] = data.iloc[0]['wpgt']
        #dataz.iloc[len(dataz)]['pres'] = data.iloc[0]['pres']
        #dataz.iloc[len(dataz)]['tsun'] = data.iloc[0]['tsun']
datay

100%|██████████| 2/2 [00:00<00:00, 347.76it/s]


Unnamed: 0,event_date,latitude,longitude,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,1111-11-11,11.11111,11.11111,1111-11-11,11.1,11.1,11.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1
1,1111-11-11,11.11111,11.11111,1111-11-11,11.1,11.1,11.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1


In [661]:
test_abc_dict={'event_date':[],'latitude':[],'longitude':[],'time':[],'tavg':[],'tmin':[],'tmax':[],'prcp':[],'snow':[],'wdir':[],'wspd':[],'wpgt':[],'pres':[],'tsun':[]}
test_abc=pd.DataFrame.from_dict(test_abc_dict)
test_abc

Unnamed: 0,event_date,latitude,longitude,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun


In [701]:
def get_weather(df):
    # Function to obtain past weather and climate data

    dataz_dict={'event_date':['1111-11-11'],'latitude':[11.11111],'longitude':[11.11111],
                'time':['1111-11-11'],'tavg':[11.1],'tmin':[11.1],'tmax':[11.1],'prcp':[1.1],
                'snow':[1.1],'wdir':[1.1],'wspd':[1.1],'wpgt':[1.1],'pres':[1.1],'tsun':[1.1]}
    dataz=pd.DataFrame.from_dict(dataz_dict)
    #dataz=pd.Dataframe{'event_date':[],'latitude'[],'longitude'[]}
    #date0=df.iloc[0]['event_date']
    #latitude0=df.iloc[0]['latitude']
    #longitude0=df.iloc[0]['longitude']
    #
    ##API use for just one row and get the columns that this API gives.
    ## Set time period
    #start0 = pd.to_datetime(date0, format='%Y-%m-%d')
    #end0 = pd.to_datetime(date0, format='%Y-%m-%d')
    #
    ## Create Point (Automatically select weather stations by geographic location)
    #location0 = Point(latitude0, longitude0)
    #
    ## Get daily data
    #data0 = Daily(location0, start0, end0)
    #data0 = data0.fetch().reset_index()
    #data0.drop([0],inplace=True)

    # Create a for loop to go row by row and use the API for the whole Dataframe.
    # tqdm() is used to have a progress bar.
    for i in tqdm(range(len(df))):
        date=df.iloc[i]['event_date']
        latitude=df.iloc[i]['latitude']
        longitude=df.iloc[i]['longitude']
    
        # Set time period
        start = pd.to_datetime(date, format='%Y-%m-%d')
        end = pd.to_datetime(date, format='%Y-%m-%d')
    
        # Create Point (Automatically select weather stations by geographic location)
        location = Point(latitude, longitude)
           
        # Get daily data
        data = Daily(location, start, end)
        data = data.fetch().reset_index()
        # try & except is used because some rows have problems to get the weather data from the API.
        # Those rows will have null values.
        try:
            dataz.loc[i]={'event_date':date,'latitude':latitude,'longitude':longitude,
                'time':data.iloc[0]['time'],'tavg':data.iloc[0]['tavg'],'tmin':data.iloc[0]['tmin'],
                'tmax':data.iloc[0]['tmax'],'prcp':data.iloc[0]['prcp'],'snow':data.iloc[0]['snow'],
                'wdir':data.iloc[0]['wdir'],'wspd':data.iloc[0]['wspd'],'wpgt':data.iloc[0]['wpgt'],
                'pres':data.iloc[0]['pres'],'tsun':data.iloc[0]['tsun']}
            
            time.sleep(0.01)    #time.sleep() is used to stop the for lood for 0.01seconds, so
                                # the API does not "get overwhelmed".
        except:
            dataz.loc[i]={'event_date':date,'latitude':latitude,'longitude':longitude,
                'time':np.nan,'tavg':np.nan,'tmin':np.nan,
                'tmax':np.nan,'prcp':np.nan,'snow':np.nan,
                'wdir':np.nan,'wspd':np.nan,'wpgt':np.nan,
                'pres':np.nan,'tsun':np.nan}

            time.sleep(0.01)

    return dataz

In [663]:
df_test1

Unnamed: 0,event_date,year,month,day,country_code,latitude,longitude,witness,issue
0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES
3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
4,2022-06-27,2022,6,27,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES
5,2021-10-20,2021,10,20,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES
7,2022-08-26,2022,8,26,IT,43.76617,11.27971,UNIROMA1;Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
8,2022-06-22,2022,6,22,IT,40.30343,17.72688,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES
9,2022-08-16,2022,8,16,ES,37.39345,-6.08236,Isis Sanpera-Calbet;Mikel Alexander González;D...,CONTINENT_DERIVED_FROM_COORDINATES
10,2022-08-13,2022,8,13,IT,45.57203,12.10044,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
12,2022-06-22,2022,6,22,IT,41.83784,12.69665,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES


In [702]:
df_test1_meteo=get_weather(df_test1)
df_test1_meteo

100%|██████████| 10/10 [00:01<00:00,  5.15it/s]


Unnamed: 0,event_date,latitude,longitude,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2022-11-04,41.51019,2.24589,2022-11-04 00:00:00,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,
1,2021-08-27,44.40289,8.98775,2021-08-27 00:00:00,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,
2,2022-08-11,41.70922,12.78512,2022-08-11 00:00:00,27.7,22.0,32.0,1.1,,173.0,9.0,,1013.9,
3,2022-06-27,47.46323,19.17698,2022-06-27 00:00:00,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,
4,2021-10-20,39.43645,2.75412,2021-10-20 00:00:00,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,
5,2022-08-26,43.76617,11.27971,2022-08-26 00:00:00,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,
6,2022-06-22,40.30343,17.72688,,,,,,,,,,,
7,2022-08-16,37.39345,-6.08236,2022-08-16 00:00:00,24.9,18.7,30.9,0.0,,256.0,14.2,40.8,1011.3,
8,2022-08-13,45.57203,12.10044,2022-08-13 00:00:00,23.1,16.0,28.3,0.0,,31.0,7.2,,1013.0,
9,2022-06-22,41.83784,12.69665,2022-06-22 00:00:00,27.5,20.1,33.8,0.0,,176.0,10.1,,1014.0,


In [601]:
df_0_meteo=get_weather(df_0)
df_0_meteo.head(10)

100%|██████████| 14512/14512 [11:58<00:00, 20.21it/s]


Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2022-11-04,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,
1,2021-08-27,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,
2,2022-08-11,27.7,22.0,32.0,1.1,,173.0,9.0,,1013.9,
3,2022-06-27,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,
4,2021-10-20,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,
5,2022-08-26,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,
6,NaT,,,,,,,,,,
7,2022-08-16,24.9,18.7,30.9,0.0,,256.0,14.2,40.8,1011.3,
8,2022-08-13,23.1,16.0,28.3,0.0,,31.0,7.2,,1013.0,
9,2022-06-22,27.5,20.1,33.8,0.0,,176.0,10.1,,1014.0,


In [638]:
df_0_meteo_index=df_0_meteo.reset_index()
df_0_meteo_index.head(10)

Unnamed: 0,index,event_date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,0,2022-11-04,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,
1,1,2021-08-27,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,
2,2,2022-08-11,27.7,22.0,32.0,1.1,,173.0,9.0,,1013.9,
3,3,2022-06-27,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,
4,4,2021-10-20,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,
5,5,2022-08-26,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,
6,6,NaT,,,,,,,,,,
7,7,2022-08-16,24.9,18.7,30.9,0.0,,256.0,14.2,40.8,1011.3,
8,8,2022-08-13,23.1,16.0,28.3,0.0,,31.0,7.2,,1013.0,
9,9,2022-06-22,27.5,20.1,33.8,0.0,,176.0,10.1,,1014.0,


In [640]:
df_0_index=df_0.reset_index()
df_0_index.head(10)

Unnamed: 0,index,event_date,year,month,day,country_code,latitude,longitude,witness,issue
0,0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
1,1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES
2,3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
3,4,2022-06-27,2022,6,27,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES
4,5,2021-10-20,2021,10,20,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES
5,7,2022-08-26,2022,8,26,IT,43.76617,11.27971,UNIROMA1;Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES
6,8,2022-06-22,2022,6,22,IT,40.30343,17.72688,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES
7,9,2022-08-16,2022,8,16,ES,37.39345,-6.08236,Isis Sanpera-Calbet;Mikel Alexander González;D...,CONTINENT_DERIVED_FROM_COORDINATES
8,10,2022-08-13,2022,8,13,IT,45.57203,12.10044,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES
9,12,2022-06-22,2022,6,22,IT,41.83784,12.69665,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES


In [645]:
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14512 entries, 0 to 14511
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   index         14512 non-null  int64         
 1   event_date    14512 non-null  datetime64[ns]
 2   year          14512 non-null  int64         
 3   month         14512 non-null  int64         
 4   day           14512 non-null  int64         
 5   country_code  14512 non-null  object        
 6   latitude      14512 non-null  float64       
 7   longitude     14512 non-null  float64       
 8   witness       14512 non-null  object        
 9   issue         14512 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 1.1+ MB


In [642]:
# Change the column 'event_date' to datetime format.
df_0['event_date'] = pd.to_datetime(df_0['event_date'], format='%Y-%m-%d')
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14512 entries, 0 to 14511
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   index         14512 non-null  int64         
 1   event_date    14512 non-null  datetime64[ns]
 2   year          14512 non-null  int64         
 3   month         14512 non-null  int64         
 4   day           14512 non-null  int64         
 5   country_code  14512 non-null  object        
 6   latitude      14512 non-null  float64       
 7   longitude     14512 non-null  float64       
 8   witness       14512 non-null  object        
 9   issue         14512 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(3)
memory usage: 1.1+ MB


In [643]:
df_0_meteo.columns

Index(['index', 'event_date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir',
       'wspd', 'wpgt', 'pres', 'tsun'],
      dtype='object')

In [644]:
df_0_meteo.columns=['event_date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun']

ValueError: Length mismatch: Expected axis has 12 elements, new values have 11 elements

In [612]:
df_0_meteo.columns

Index(['event_date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd',
       'wpgt', 'pres', 'tsun'],
      dtype='object')

In [613]:
df_0_meteo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14512 entries, 0 to 14511
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   event_date  12485 non-null  datetime64[ns]
 1   tavg        12480 non-null  object        
 2   tmin        12454 non-null  object        
 3   tmax        12453 non-null  object        
 4   prcp        11072 non-null  object        
 5   snow        76 non-null     object        
 6   wdir        10404 non-null  object        
 7   wspd        11992 non-null  object        
 8   wpgt        4882 non-null   object        
 9   pres        11384 non-null  object        
 10  tsun        1 non-null      object        
dtypes: datetime64[ns](1), object(10)
memory usage: 1.3+ MB


| Column  | Description                                                                         | Type    |
|---------|-------------------------------------------------------------------------------------|---------|
| tavg    | The mean air temperature in °C                                                      | Float64 |
| tmin    | The mean minimum air temperature in °C                                              | Float64 |
| tmax    | The mean maximum air temperature in °C                                              | Float64 |
| prcp    | The mean monthly precipitation total in mm                                          | Float64 |
| wspd    | The mean wind speed in km/h                                                         | Float64 |
| pres    | The mean sea-level air pressure in hPa                                              | Float64 |
| tsun    | The mean sunshine total in minutes (m)                                              | Float64 |

In [646]:
df_1=df_0.merge(df_0_meteo, how='left', on='index')
df_1.head(10)

Unnamed: 0,index,event_date_x,year,month,day,country_code,latitude,longitude,witness,issue,event_date_y,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES,2022-11-04,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,
1,1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES,2021-08-27,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,
2,3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-27,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,
3,4,2022-06-27,2022,6,27,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES,2021-10-20,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,
4,5,2021-10-20,2021,10,20,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-26,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,
5,7,2022-08-26,2022,8,26,IT,43.76617,11.27971,UNIROMA1;Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-16,24.9,18.7,30.9,0.0,,256.0,14.2,40.8,1011.3,
6,8,2022-06-22,2022,6,22,IT,40.30343,17.72688,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-13,23.1,16.0,28.3,0.0,,31.0,7.2,,1013.0,
7,9,2022-08-16,2022,8,16,ES,37.39345,-6.08236,Isis Sanpera-Calbet;Mikel Alexander González;D...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-22,27.5,20.1,33.8,0.0,,176.0,10.1,,1014.0,
8,10,2022-08-13,2022,8,13,IT,45.57203,12.10044,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-26,25.2,18.7,31.5,0.0,,277.0,7.2,20.4,1014.8,
9,12,2022-06-22,2022,6,22,IT,41.83784,12.69665,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES,2022-07-12,24.1,19.7,29.5,0.0,0.0,12.0,10.8,22.2,1018.9,


In [627]:
df_1 = pd.concat([df_0, df_0_meteo], axis=1,join='inner')
df_1.head(10)

Unnamed: 0,event_date,year,month,day,country_code,latitude,longitude,witness,issue,event_date.1,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2022-11-04,2022,11,4,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES,2022-11-04,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,
1,2021-08-27,2021,8,27,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES,2021-08-27,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,
3,2022-08-11,2022,8,11,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-27,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,
4,2022-06-27,2022,6,27,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES,2021-10-20,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,
5,2021-10-20,2021,10,20,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-26,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,
7,2022-08-26,2022,8,26,IT,43.76617,11.27971,UNIROMA1;Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-16,24.9,18.7,30.9,0.0,,256.0,14.2,40.8,1011.3,
8,2022-06-22,2022,6,22,IT,40.30343,17.72688,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-13,23.1,16.0,28.3,0.0,,31.0,7.2,,1013.0,
9,2022-08-16,2022,8,16,ES,37.39345,-6.08236,Isis Sanpera-Calbet;Mikel Alexander González;D...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-22,27.5,20.1,33.8,0.0,,176.0,10.1,,1014.0,
10,2022-08-13,2022,8,13,IT,45.57203,12.10044,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-26,25.2,18.7,31.5,0.0,,277.0,7.2,20.4,1014.8,
12,2022-06-22,2022,6,22,IT,41.83784,12.69665,Roger Eritja;UNIROMA1,CONTINENT_DERIVED_FROM_COORDINATES,2022-07-12,24.1,19.7,29.5,0.0,0.0,12.0,10.8,22.2,1018.9,


In [614]:
df_1=df_0.set_index('event_date').combine_first(df_0_meteo.set_index('event_date')).reset_index()
df_1.head(10)

Unnamed: 0,event_date,country_code,day,issue,latitude,longitude,month,prcp,pres,snow,tavg,tmax,tmin,tsun,wdir,witness,wpgt,wspd,year
0,NaT,,,,,,,,,,,,,,,,,,
1,NaT,,,,,,,,,,,,,,,,,,
2,NaT,,,,,,,,,,,,,,,,,,
3,NaT,,,,,,,,,,,,,,,,,,
4,NaT,,,,,,,,,,,,,,,,,,
5,NaT,,,,,,,,,,,,,,,,,,
6,NaT,,,,,,,,,,,,,,,,,,
7,NaT,,,,,,,,,,,,,,,,,,
8,NaT,,,,,,,,,,,,,,,,,,
9,NaT,,,,,,,,,,,,,,,,,,


In [507]:
# Check if column 'event_date' has the same information as 'time'.
df_1['same_date'] = np.where(df_1['event_date']==df_1['time'], True, False)
df_1.head(5)

Unnamed: 0,event_date,year,month,day,country_code,latitude,longitude,witness,issue,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,same_date
0,2022-11-04,2022.0,11.0,4.0,ES,41.51019,2.24589,Roger Eritja,CONTINENT_DERIVED_FROM_COORDINATES,2022-11-04,15.4,11.0,20.0,0.0,,271.0,19.5,42.6,1013.8,,True
1,2021-08-27,2021.0,8.0,27.0,IT,44.40289,8.98775,Karin Bakran-Lebl;Ana Klobucar;UNIROMA1;Roger ...,CONTINENT_DERIVED_FROM_COORDINATES,2021-08-27,25.0,23.0,28.1,0.3,,68.0,13.8,27.8,1007.5,,True
3,2022-08-11,2022.0,8.0,11.0,IT,41.70922,12.78512,UNIROMA1;Eleonora Longo;Francesco Severini;Rog...,CONTINENT_DERIVED_FROM_COORDINATES,2022-06-27,26.6,17.7,33.1,0.0,,89.0,6.2,38.0,1016.6,,False
4,2022-06-27,2022.0,6.0,27.0,HU,47.46323,19.17698,Anonymous expert;Kornélia Kurucz;Gábor Kemenes...,CONTINENT_DERIVED_FROM_COORDINATES,2021-10-20,18.3,17.7,24.8,0.0,,83.0,7.1,,1023.5,,False
5,2021-10-20,2021.0,10.0,20.0,ES,39.43645,2.75412,Maria Angeles Puig;Mikel Bengoa Paulis;Ignacio...,CONTINENT_DERIVED_FROM_COORDINATES,2022-08-26,26.1,19.9,33.3,0.3,,34.0,5.4,18.5,1010.9,,False


In [None]:
Nuestro df contiene las siguientes columnas:

registro = (instant), es el índice
fecha = (dteday)
estacion = (season) Hay que cambiarla, algunas estaciones no coinciden con la fecha.
año = (year) También hay que cambiarla, está en 0 y 1 (2018,2019)
mes = (month)
festivo = (holiday) 0: laborales, 1:festivos, creemos que tomará festivos.
dia_semana = (weekday)
no_laboral = (workingday) Cuenta como 0 el laboral y el 1 el no laboral
clima = ('weathersit')
'temperatura' = (temp)
'sens_termica'= ('atemp')
humedad = (hum)
viento = (windspeed)
ocasionales = (casual)
registrados = (registred)
total = (cnt)

In [None]:
# Check for outliers in month using another method.
mean_month=df_1['month'].mean()
std_month=df_1['month'].std()
upper=mean_month + std_month
lower=mean_month - std_month
ucb=mean_month + std_month * 3
lcb=mean_month - std_month * 3

In [None]:
# Start the graph.
month_graph=sns.histplot(x=df_1['month'],kde=True)
graph.axvline(x=mean_month, c='red',label='mean')

# Plot its standard deviation.
graph.axvline(x=upper,c='green',label='std')
graph.axvline(x=lower, c='green')

# Plot its confidence interval to 99.7%
graph.axvline(x=lcb, c='orange',label='99 lower')
graph.axvline(x=ucb, c='orange',label='99 upper')

plt.legend()