In [1]:
import requests
import zipfile
import io
import time
import os
import pandas as pd

In [2]:
def downloadData(years=('2019', '2020', '2021', '2022', '2023'),
                 download_folder='./download/', timeout=30):
    """Download the monthly wind ("Viento") zip archives for station 360011.

    For every year in ``years`` and every month 01-12, the file
    ``360011_<YYYYMM>_Viento.csv.zip`` is requested from
    climatologia.meteochile.gob.cl and written into ``download_folder``.
    A month that cannot be fetched (non-200 status or a network error) is
    reported on stdout and skipped; the loop continues with the next month.

    Args:
        years: Iterable of four-digit years (strings or ints) to fetch.
        download_folder: Directory that receives the zip files; created if
            it does not exist.
        timeout: Seconds to wait on the server before giving up on one file.

    Returns:
        None. Results are the files on disk and the progress messages.
    """
    base_url = "https://climatologia.meteochile.gob.cl/application/datos/getDatosEma/360011"
    months = [str(x).zfill(2) for x in range(1, 13)]

    # The target folder does not change between iterations, so create it once.
    os.makedirs(download_folder, exist_ok=True)

    for y in years:
        for m in months:
            # Brief pause between requests (presumably to go easy on the server).
            time.sleep(0.3)

            file_name = f"360011_{y}{m}_Viento.csv.zip"
            file_url = f"{base_url}/{file_name}"
            file_path = os.path.join(download_folder, file_name)

            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(file_url, timeout=timeout)
            except requests.RequestException as exc:
                # Treat network errors like a failed status: report and move on.
                print(f"Failed to download the file. Error: {exc}")
                continue

            if response.status_code == 200:
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"File downloaded and saved as {file_path}")
            else:
                print(f"Failed to download the file. Status code: {response.status_code}")
# The call is left commented out; uncomment to (re)download the archives.
#downloadData()

In [3]:
def unzipData(download_folder="./download/", data_folder="./data/"):
    """Extract every ``.zip`` archive found in ``download_folder``.

    Archives are unpacked into ``data_folder`` (created if missing). Entries
    that do not end in ``.zip`` and archives that cannot be opened as zip
    files are reported on stdout and skipped.

    Args:
        download_folder: Directory containing the downloaded archives.
        data_folder: Directory that receives the extracted files.

    Returns:
        None. Results are the extracted files and the progress messages.
    """
    # Nothing has been downloaded yet: report it instead of letting
    # os.listdir() fail with a bare FileNotFoundError.
    if not os.path.isdir(download_folder):
        print(f"Download folder {download_folder} does not exist; nothing to extract.")
        return

    os.makedirs(data_folder, exist_ok=True)

    # os.listdir() order is arbitrary; sort so runs are reproducible.
    for file_name in sorted(os.listdir(download_folder)):
        if not file_name.endswith(".zip"):
            print(f"{file_name} is not a zip file.")
            continue

        input_file_path = os.path.join(download_folder, file_name)
        try:
            with zipfile.ZipFile(input_file_path, 'r') as zip_ref:
                zip_ref.extractall(data_folder)
            print(f"Extracted {file_name} to {data_folder}")
        except zipfile.BadZipFile:
            # E.g. a saved response body that is not actually a zip archive.
            print(f"{file_name} is not a valid zip file.")

    print("Extraction process completed.")
# The call is left commented out; uncomment to (re)extract the archives.
#unzipData()

In [4]:
# Load every extracted CSV in ./data/ and stack them into one DataFrame.
csv_folder = "./data/"

# os.listdir() returns names in arbitrary order; sorting by file name makes
# the row order of the combined frame reproducible across machines and runs.
csv_files = sorted(file for file in os.listdir(csv_folder) if file.endswith('.csv'))

# Fail early with an actionable message instead of pd.concat's generic
# "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError(
        f"No .csv files found in {csv_folder}; run downloadData() and unzipData() first."
    )

dataframes = []

for csv_file in csv_files:
    csv_path = os.path.join(csv_folder, csv_file)
    # The files are read as semicolon-separated.
    df = pd.read_csv(csv_path, delimiter=';')
    dataframes.append(df)

# ignore_index=True discards the per-file row numbers in favour of one
# continuous RangeIndex.
combined_df = pd.concat(dataframes, ignore_index=True)

In [5]:
# Print a summary of the combined frame: index range, column names and dtypes.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2150613 entries, 0 to 2150612
Data columns (total 34 columns):
 #   Column           Dtype  
---  ------           -----  
 0   codigoNacional   int64  
 1   idEquipo         int64  
 2   idPista          float64
 3   momento          object 
 4   ddInst           float64
 5   ffInst           float64
 6   dd02Minutos      float64
 7   ff02Minutos      float64
 8   dd10Minutos      float64
 9   ff10Minutos      float64
 10  dd02MinutosMin   float64
 11  ff02MinutosMin   float64
 12  dd02MinutosMax   float64
 13  ff02MinutosMax   float64
 14  dd10MinutosMin   float64
 15  ff10MinutosMin   float64
 16  dd10MinutosMax   float64
 17  ff10MinutosMax   float64
 18  dd01Minutos      float64
 19  ff01Minutos      float64
 20  dd01MinutosMin   float64
 21  ff01MinutosMin   float64
 22  dd01MinutosMax   float64
 23  ff01MinutosMax   float64
 24  momentoRegistro  object 
 25  dd15m            float64
 26  ffMax15m         float64
 27  ffMed15m    

In [6]:
# Count missing values per column to see which columns carry usable data.
combined_df.isnull().sum()

codigoNacional           0
idEquipo                 0
idPista            2150613
momento                  0
ddInst                   0
ffInst                   0
dd02Minutos              0
ff02Minutos              0
dd10Minutos              0
ff10Minutos              0
dd02MinutosMin         431
ff02MinutosMin         431
dd02MinutosMax         431
ff02MinutosMax         431
dd10MinutosMin         350
ff10MinutosMin         350
dd10MinutosMax         350
ff10MinutosMax         350
dd01Minutos        2150613
ff01Minutos        2150613
dd01MinutosMin     2150613
ff01MinutosMin     2150613
dd01MinutosMax     2150613
ff01MinutosMax     2150613
momentoRegistro          0
dd15m              2150613
ffMax15m           2150613
ffMed15m           2150613
dn02Min            2150613
dx02Min            2150613
dn10Min            2150613
dx10Min            2150613
dd02Mts            2150613
ff02Mts            2150613
dtype: int64

In [7]:
# Keep only the columns that have no missing values at all, then remove
# momentoRegistro explicitly (it is fully populated, so the null filter
# alone would keep it).
result_df = (
    combined_df
    .dropna(axis="columns", how="any")
    .drop(columns=["momentoRegistro"])
)
result_df

Unnamed: 0,codigoNacional,idEquipo,momento,ddInst,ffInst,dd02Minutos,ff02Minutos,dd10Minutos,ff10Minutos
0,360011,0,2019-02-05 18:27:00,218.0,9.7,218.0,4.2,212.0,7.8
1,360011,0,2019-02-05 18:28:00,208.0,8.0,212.0,4.5,213.0,7.7
2,360011,0,2019-02-05 18:29:00,230.0,6.2,204.0,3.8,214.0,7.2
3,360011,0,2019-02-05 18:30:00,213.0,6.8,218.0,3.2,219.0,7.2
4,360011,0,2019-02-05 18:31:00,232.0,9.3,223.0,4.0,221.0,7.5
...,...,...,...,...,...,...,...,...,...
2150608,360011,0,2023-07-31 23:56:00,28.0,4.7,34.0,2.3,44.0,3.2
2150609,360011,0,2023-07-31 23:57:00,28.0,4.9,28.0,2.5,41.0,3.4
2150610,360011,0,2023-07-31 23:58:00,34.0,5.2,28.0,2.5,39.0,3.6
2150611,360011,0,2023-07-31 23:59:00,32.0,5.4,30.0,2.6,37.0,3.8


In [8]:
# Re-check dtypes and null counts on the reduced frame. info() prints its
# report and returns None, which is why the tuple echoed by this cell starts
# with None.
result_df.info(), result_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2150613 entries, 0 to 2150612
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   codigoNacional  int64  
 1   idEquipo        int64  
 2   momento         object 
 3   ddInst          float64
 4   ffInst          float64
 5   dd02Minutos     float64
 6   ff02Minutos     float64
 7   dd10Minutos     float64
 8   ff10Minutos     float64
dtypes: float64(6), int64(2), object(1)
memory usage: 147.7+ MB


(None,
 codigoNacional    0
 idEquipo          0
 momento           0
 ddInst            0
 ffInst            0
 dd02Minutos       0
 ff02Minutos       0
 dd10Minutos       0
 ff10Minutos       0
 dtype: int64)

In [9]:
# List the columns kept in result_df.
result_df.columns

Index(['codigoNacional', 'idEquipo', 'momento', 'ddInst', 'ffInst',
       'dd02Minutos', 'ff02Minutos', 'dd10Minutos', 'ff10Minutos'],
      dtype='object')

In [12]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# ./dataset/ exists first (same os.makedirs pattern as the download and
# extract steps).
os.makedirs('./dataset/', exist_ok=True)
# Written semicolon-separated, matching how the input files are read, and
# without the index column.
result_df.to_csv('./dataset/viento_dataset.csv', sep=';', index=False)