# <font color='008fd0'>Overview</font>

This notebook contains the processing steps for the meteo dataset.

## <font color='00b269'>Workflow</font>
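- **MeteoTransform**:
   - keep only the useful columns and rename them
   - parse `Date` and sort the rows chronologically
   - extract the numeric value from the pressure column (e.g. `1012.6 Hpa` -> `1012.6`)
   - impute missing values
      - Precipitation: `Tr` -> 0
      - Insolation: `-` -> centered 3-row rolling mean, repeated until no gaps remain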

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 3, Finished, Available)

In [2]:
# OneLake (abfss) location of the Meteo files; the raw CSV is read from here
# and the processed CSV is written back to the same folder.
folder = "abfss://Medio@onelake.dfs.fabric.microsoft.com/Medio.Lakehouse/Files/Meteo"

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 4, Finished, Available)

In [9]:
# Read the raw meteo export, report how many rows were loaded, then preview it.
raw_csv_path = f"{folder}/meteo.csv"
df = pd.read_csv(raw_csv_path)
print(len(df))
df.head()

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 11, Finished, Available)

17349


Unnamed: 0,City,Date,Ave. T. (ºC),Max. T. (ºC),Min. T. (ºC),Prec. (mm),S.L.Press./ Gheopot.,Wind dir,Wind sp. (Km/h),Cloud c.,Snow depth (cm),Insolat. (hours)
0,Cluj Napoca,31/01/2013,3.8,7.8,-0.3,3.8,1012.6 Hpa,297º(NW),8,4/8,10,4.6
1,Cluj Napoca,30/01/2013,-0.7,1.1,-2.4,4.8,1017.1 Hpa,56º(NE),5,8/8,10,0.8
2,Cluj Napoca,29/01/2013,-1.8,1.0,-4.6,3.5,1021.2 Hpa,53º(NE),4,8/8,6,1.1
3,Cluj Napoca,28/01/2013,-3.1,-0.7,-5.5,0.6,1017.8 Hpa,47º(NE),4,7/8,8,4.3
4,Cluj Napoca,27/01/2013,-4.8,-1.4,-8.1,2.2,1017.4 Hpa,23º(NE),5,8/8,7,0.5


In [10]:
class MeteoTransform(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans the raw meteo export.

    Steps performed by :meth:`transform`:

    * keep only the columns listed in ``RENAME_MAP`` and give them short names;
    * parse ``Date`` (day-first strings such as ``31/01/2013``) into
      ``datetime.date`` objects and sort the rows by date;
    * extract the decimal number from ``Press`` (e.g. ``"1012.6 Hpa"`` -> ``1012.6``);
    * replace ``"Tr"`` in ``Prec`` with ``0.0`` and cast the column to float;
    * replace ``"-"`` in ``Insolat`` with NaN, cast to float and fill the gaps
      with a centered 3-row rolling mean rounded to one decimal.

    NOTE(review): the rolling mean runs over the rows in date order, so the
    neighbours of a row can belong to other cities — confirm this is intended
    rather than a per-city imputation.
    """

    # Raw column name -> short name; columns not listed here are dropped.
    RENAME_MAP = {
        "City": "City",
        "Date": "Date",
        "Max. T. (ºC)": "Max",
        "Min. T. (ºC)": "Min",
        "Prec. (mm)": "Prec",
        "S.L.Press./ Gheopot.": "Press",
        "Wind sp. (Km/h)": "Wind",
        "Insolat. (hours)": "Insolat"
    }

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state.

        Present so the class can be used with ``Pipeline.fit`` and
        ``fit_transform`` as well as with ``transform``.
        """
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw meteo data containing every column named in ``RENAME_MAP``.

        Returns
        -------
        pandas.DataFrame
            Frame with columns City, Date, Max, Min, Prec, Press, Wind and
            Insolat, sorted by Date.

        Raises
        ------
        KeyError
            If one of the expected raw columns is missing from ``X``.
        """
        # Selecting by list and renaming without ``inplace`` produces a new
        # frame, so the caller's data is never modified and there are no
        # chained-assignment warnings that would need to be silenced globally.
        out = X[list(self.RENAME_MAP)].rename(columns=self.RENAME_MAP)

        # Source dates are DD/MM/YYYY; be explicit so ambiguous values such as
        # 05/01/2013 are never read month-first.
        out['Date'] = pd.to_datetime(out['Date'], dayfirst=True).dt.date
        out = out.sort_values(by=['Date'])

        # Pressure: keep only the decimal number, dropping the unit suffix.
        out['Press'] = out['Press'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)

        # Precipitation: the 'Tr' marker is treated as zero.
        out['Prec'] = out['Prec'].replace('Tr', 0.0).astype(float)

        # Insolation: '-' marks a missing value.
        insolat = out['Insolat'].replace('-', np.nan).astype(float)
        missing = int(insolat.isna().sum())
        while missing:
            # Each pass fills gaps that have at least one known neighbour;
            # longer gaps are closed over successive passes.
            smoothed = insolat.rolling(3, min_periods=1, center=True).mean().round(1)
            insolat = insolat.combine_first(smoothed)
            remaining = int(insolat.isna().sum())
            if remaining == missing:
                # No value could be filled (e.g. the whole column is missing):
                # stop instead of looping forever and leave the NaNs in place.
                break
            missing = remaining
        out['Insolat'] = insolat

        return out

# Single-step pipeline that wraps the meteo cleaning transformer.
pipeline = Pipeline(steps=[("transformations", MeteoTransform())])

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 12, Finished, Available)

In [11]:
# Run the cleaning pipeline on the raw frame and preview the first rows.
processed = pipeline.transform(df)
processed.head()

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 13, Finished, Available)

Unnamed: 0,City,Date,Max,Min,Prec,Press,Wind,Insolat
1916,Cluj Napoca,2007-01-01,6.2,-6.1,3.5,1026.7,4,0.0
9952,Iasi,2007-01-01,11.7,-2.1,4.9,1021.4,11,3.5
15247,Constanta,2007-01-01,14.4,-0.9,0.0,1026.3,11,7.1
5934,Bucuresti,2007-01-01,10.1,-5.1,0.0,1025.2,8,2.9
9951,Iasi,2007-01-02,9.1,2.6,0.8,1014.3,7,0.3


In [12]:
# Sanity check: the row count should be unchanged and no column should contain nulls.
print(processed.shape[0])
processed.isnull().sum()

StatementMeta(, 71df5862-3bfb-4a93-a17a-1750bee880a7, 14, Finished, Available)

17349


City       0
Date       0
Max        0
Min        0
Prec       0
Press      0
Wind       0
Insolat    0
dtype: int64

In [None]:
# Persist the cleaned data in the same folder as the raw file.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as an extra unnamed first column — confirm downstream readers expect it.
processed.to_csv(f"{folder}/meteo_processed.csv")

StatementMeta(, cbfafe58-796b-4c76-9e2c-0a14ffa1a272, 9, Finished, Available)



In [None]:
# Convert the cleaned pandas frame to a Spark DataFrame and overwrite the
# Meteo Delta table with it.
sdf = spark.createDataFrame(processed)
(
    sdf.write
    .format("delta")
    .mode("overwrite")
    .save("Tables/Meteo")
)

StatementMeta(, cbfafe58-796b-4c76-9e2c-0a14ffa1a272, 10, Finished, Available)

  self._sock = None


Row(City='Cluj Napoca', Date=datetime.date(2007, 1, 1), Max=6.2, Min=-6.1, Prec=3.5, Press=1026.7, Wind=4, Insolat=0.0)