# Data Preprocessing Project
Dataset: Climate Weather Surface of Brazil - Hourly (https://www.kaggle.com/datasets/PROPPG-PPG/hourly-weather-surface-brazil-southeast-region)

Team: Sahil Ram Jadhav, Farheen Fatima, Bryan Burch, Christian Suarez, Priti Bankar

In [1]:
import pandas as pd
import numpy as np

In [2]:
path = "/Users/bryanburch/Projects/cs177/central_west.csv" # replace with your path
data = pd.read_csv(path, na_values=['NA', '?', '-'])

# Unnecessary Columns Removal & Renaming
(Sahil)

In [3]:
data.drop(['index', 'Hora', 'region', 'state', 'station', 'station_code'], axis=1, inplace=True)

In [4]:
list(data)

['Data',
 'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)',
 'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)',
 'PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)',
 'PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)',
 'RADIACAO GLOBAL (Kj/m²)',
 'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)',
 'TEMPERATURA DO PONTO DE ORVALHO (°C)',
 'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)',
 'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)',
 'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)',
 'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)',
 'UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)',
 'UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)',
 'UMIDADE RELATIVA DO AR, HORARIA (%)',
 'VENTO, DIREÇÃO HORARIA (gr) (° (gr))',
 'VENTO, RAJADA MAXIMA (m/s)',
 'VENTO, VELOCIDADE HORARIA (m/s)',
 'latitude',
 'longitude',
 'height']

In [5]:
data.rename(columns = {'Data':'Date (YYYY-MM-DD)',
                     'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)':'Total precipitation in millimetres',
                     'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)':'Atmospheric pressure at station level (mB)',
                     'PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)':'Maximum air pressure (mB)',
                     'PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)':'Minimum air pressure (mB)',
                     'RADIACAO GLOBAL (Kj/m²)':'Solar radiation (Kj/m²)',
                     'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)':'Air temperature (°C)',
                     'TEMPERATURA DO PONTO DE ORVALHO (°C)':'Dew point temperature (°C)',
                     'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)':'Maximum temperature (°C)',
                     'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)':'Minimum temperature (°C)',
                     'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)':'Maximum dew point temperature (°C)',
                     'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)':'Minimum dew point temperature (°C)',
                     'UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)':'Maximum relative humid temperature (%)',
                     'UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)':'Minimum relative humid temperature (%)',
                     'UMIDADE RELATIVA DO AR, HORARIA (%)':'Relative humidity (%)',
                     'VENTO, DIREÇÃO HORARIA (gr) (° (gr))':'Wind direction (radius degrees (0-360))',
                     'VENTO, RAJADA MAXIMA (m/s)':'Wind gust (m/s)',
                     'VENTO, VELOCIDADE HORARIA (m/s)':'Wind speed (m/s)',
                     'height':'Elevation'
                     }, inplace = True)

# Null Value Removal

In [6]:
data_with_nans = data.replace(-9999.0, np.NaN)
data_no_nulls = data_with_nans.dropna()

In [9]:
print('Number of instances before null value removal = %d' % (data.shape[0]))

Number of instances before null value removal = 11427120


In [10]:
print('Number of instances after null value removal = %d' % (data_no_nulls.shape[0]))

Number of instances after null value removal = 4792210


# Outlier Visualization - TODO
(Farheen)

In [None]:
%matplotlib inline

df2 = df.drop(['PRECIPITAÇÃO TOTAL, HORÁRIO (mm)'], axis=1)
df2.boxplot(figsize=(16, 16))

# Outlier Removal
(Bryan)

In [11]:
# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)

In [12]:
target_columns = [
              'Total precipitation in millimetres',
              'Atmospheric pressure at station level (mB)',
              'Maximum air pressure (mB)',
              'Minimum air pressure (mB)',
              'Solar radiation (Kj/m²)',
              'Air temperature (°C)',
              'Dew point temperature (°C)',
              'Maximum temperature (°C)',
              'Minimum temperature (°C)',
              'Maximum dew point temperature (°C)',
              'Minimum dew point temperature (°C)',
              'Maximum relative humid temperature (%)',
              'Minimum relative humid temperature (%)',
              'Relative humidity (%)',
              'Wind direction (radius degrees (0-360))',
              'Wind gust (m/s)',
              'Wind speed (m/s)',
              'Elevation']

In [13]:
data_no_outliers = data.copy()

for column in target_columns:
    remove_outliers(data_no_outliers, column, 3)

data_no_outliers

Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,Atmospheric pressure at station level (mB),Maximum air pressure (mB),Minimum air pressure (mB),Solar radiation (Kj/m²),Air temperature (°C),Dew point temperature (°C),Maximum temperature (°C),Minimum temperature (°C),...,Minimum dew point temperature (°C),Maximum relative humid temperature (%),Minimum relative humid temperature (%),Relative humidity (%),Wind direction (radius degrees (0-360)),Wind gust (m/s),Wind speed (m/s),latitude,longitude,Elevation
0,2017-12-20,0.0,899.6,900.0,899.6,3391,26.5,17.7,26.5,24.4,...,16.5,65,57,59,39,9.6,3.9,-16.011111,-47.5575,1043.0
1,2017-12-20,0.0,899.2,899.6,899.2,3306,26.6,16.7,26.7,25.4,...,16.0,60,52,54,55,8.3,3.4,-16.011111,-47.5575,1043.0
2,2017-12-20,0.0,898.6,899.2,898.6,3167,27.3,15.8,27.6,25.9,...,14.5,56,45,49,62,8.3,3.6,-16.011111,-47.5575,1043.0
3,2017-12-20,0.0,897.7,898.6,897.7,3279,27.5,12.9,28.5,26.6,...,12.9,53,41,41,43,6.7,3.1,-16.011111,-47.5575,1043.0
4,2017-12-20,0.0,897.0,897.7,897.0,2753,27.5,13.7,28.9,27.4,...,12.4,45,39,43,98,6.4,3.5,-16.011111,-47.5575,1043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11427115,2017-12-20,0.0,898.8,898.8,898.5,29,18.4,15.8,18.4,17.6,...,15.6,91,84,85,72,3.5,1.7,-16.011111,-47.5575,1043.0
11427116,2017-12-20,0.0,899.4,899.4,898.8,557,20.3,16.9,20.3,18.4,...,15.8,85,80,81,69,3.5,1.8,-16.011111,-47.5575,1043.0
11427117,2017-12-20,0.0,899.7,899.7,899.4,1441,21.8,16.6,21.9,20.3,...,16.4,81,72,72,83,7.6,4.4,-16.011111,-47.5575,1043.0
11427118,2017-12-20,0.0,899.8,899.8,899.7,2334,23.2,17.3,23.3,21.7,...,16.4,74,68,70,59,7.6,3.7,-16.011111,-47.5575,1043.0


In [15]:
# TESTING EFFECT NULL REMOVAL HAS ON OUTLIER REMOVAL
data_no_outliers_null_free = data_no_nulls.copy()

for column in target_columns:
    remove_outliers(data_no_outliers_null_free, column, 3)

data_no_outliers_null_free

Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,Atmospheric pressure at station level (mB),Maximum air pressure (mB),Minimum air pressure (mB),Solar radiation (Kj/m²),Air temperature (°C),Dew point temperature (°C),Maximum temperature (°C),Minimum temperature (°C),...,Minimum dew point temperature (°C),Maximum relative humid temperature (%),Minimum relative humid temperature (%),Relative humidity (%),Wind direction (radius degrees (0-360)),Wind gust (m/s),Wind speed (m/s),latitude,longitude,Elevation
0,2017-12-20,0.0,899.6,900.0,899.6,3391.0,26.5,17.7,26.5,24.4,...,16.5,65.0,57.0,59.0,39.0,9.6,3.9,-16.011111,-47.5575,1043.0
1,2017-12-20,0.0,899.2,899.6,899.2,3306.0,26.6,16.7,26.7,25.4,...,16.0,60.0,52.0,54.0,55.0,8.3,3.4,-16.011111,-47.5575,1043.0
2,2017-12-20,0.0,898.6,899.2,898.6,3167.0,27.3,15.8,27.6,25.9,...,14.5,56.0,45.0,49.0,62.0,8.3,3.6,-16.011111,-47.5575,1043.0
3,2017-12-20,0.0,897.7,898.6,897.7,3279.0,27.5,12.9,28.5,26.6,...,12.9,53.0,41.0,41.0,43.0,6.7,3.1,-16.011111,-47.5575,1043.0
4,2017-12-20,0.0,897.0,897.7,897.0,2753.0,27.5,13.7,28.9,27.4,...,12.4,45.0,39.0,43.0,98.0,6.4,3.5,-16.011111,-47.5575,1043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11427115,2017-12-20,0.0,898.8,898.8,898.5,29.0,18.4,15.8,18.4,17.6,...,15.6,91.0,84.0,85.0,72.0,3.5,1.7,-16.011111,-47.5575,1043.0
11427116,2017-12-20,0.0,899.4,899.4,898.8,557.0,20.3,16.9,20.3,18.4,...,15.8,85.0,80.0,81.0,69.0,3.5,1.8,-16.011111,-47.5575,1043.0
11427117,2017-12-20,0.0,899.7,899.7,899.4,1441.0,21.8,16.6,21.9,20.3,...,16.4,81.0,72.0,72.0,83.0,7.6,4.4,-16.011111,-47.5575,1043.0
11427118,2017-12-20,0.0,899.8,899.8,899.7,2334.0,23.2,17.3,23.3,21.7,...,16.4,74.0,68.0,70.0,59.0,7.6,3.7,-16.011111,-47.5575,1043.0


Removed about 100k rows that were outliers. Also, data still in original units.

# Data Aggregation (hourly -> daily)
(Bryan)

In [16]:
# making a copy just in case
data = data_no_outliers_null_free.copy()

In [17]:
data.sort_values(by='Date (YYYY-MM-DD)',ascending=True)

Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,Atmospheric pressure at station level (mB),Maximum air pressure (mB),Minimum air pressure (mB),Solar radiation (Kj/m²),Air temperature (°C),Dew point temperature (°C),Maximum temperature (°C),Minimum temperature (°C),...,Minimum dew point temperature (°C),Maximum relative humid temperature (%),Minimum relative humid temperature (%),Relative humidity (%),Wind direction (radius degrees (0-360)),Wind gust (m/s),Wind speed (m/s),latitude,longitude,Elevation
3767509,2000-05-07,0.0,888.2,888.2,887.7,1506.0,22.6,14.7,22.6,20.7,...,14.3,69.0,60.0,61.0,126.0,3.8,1.8,-15.789444,-47.925833,1159.54
3767518,2000-05-07,0.0,886.2,886.2,885.8,34.0,24.1,13.4,25.8,23.6,...,12.9,59.0,45.0,51.0,197.0,3.2,1.3,-15.789444,-47.925833,1159.54
3767517,2000-05-07,0.0,885.9,885.9,885.5,540.0,25.8,12.7,27.4,25.8,...,11.6,45.0,38.0,44.0,109.0,3.0,1.5,-15.789444,-47.925833,1159.54
3767516,2000-05-07,0.0,885.6,885.6,885.4,1412.0,26.6,12.5,28.1,26.6,...,11.5,43.0,37.0,41.0,53.0,3.8,1.1,-15.789444,-47.925833,1159.54
3767515,2000-05-07,0.0,885.5,885.9,885.5,2530.0,28.0,12.4,28.3,26.8,...,11.5,46.0,36.0,38.0,125.0,6.3,1.8,-15.789444,-47.925833,1159.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3502531,2021-04-30,0.0,951.1,952.0,951.1,2309.0,29.1,18.3,29.5,28.5,...,17.4,54.0,49.0,52.0,141.0,5.9,3.0,-13.253521,-46.890326,551.33
3502532,2021-04-30,0.0,950.7,951.1,950.6,1689.0,28.6,19.0,29.7,28.6,...,17.5,57.0,49.0,56.0,76.0,5.8,2.7,-13.253521,-46.890326,551.33
3502533,2021-04-30,0.0,950.5,950.7,950.5,789.0,27.9,18.5,28.9,27.9,...,18.2,59.0,54.0,56.0,127.0,4.3,1.9,-13.253521,-46.890326,551.33
3502525,2021-04-30,0.0,953.8,953.8,953.4,1184.0,25.7,17.9,26.0,24.1,...,17.8,70.0,62.0,62.0,139.0,9.1,4.4,-13.253521,-46.890326,551.33


Aggregate those with same units as subsets of the dataframe

In [18]:
pressure_columns = data.columns[data.columns.str.contains('pressure')]
data_date_pressures = data.groupby('Date (YYYY-MM-DD)')[pressure_columns].mean().reset_index()

In [19]:
data_date_precipitation = data.groupby('Date (YYYY-MM-DD)')['Total precipitation in millimetres'].mean().reset_index()

In [20]:
data_date_radiation = data.groupby('Date (YYYY-MM-DD)')['Solar radiation (Kj/m²)'].mean().reset_index()

In [21]:
temperature_columns = data.columns[data.columns.str.contains('°C')]
data_date_temperatures = data.groupby('Date (YYYY-MM-DD)')[temperature_columns].mean().reset_index()

In [22]:
humidity_columns = data.columns[data.columns.str.contains('humid')]
data_date_humidities = data.groupby('Date (YYYY-MM-DD)')[humidity_columns].mean().reset_index()

In [23]:
wind_columns = data.columns[data.columns.str.contains('Wind')]
data_date_wind = data.groupby('Date (YYYY-MM-DD)')[wind_columns].mean().reset_index()

In [24]:
# want to do mean or something else?
data_date_latitude_longitude = data.groupby('Date (YYYY-MM-DD)')[['latitude', 'longitude']].mean().reset_index()

In [25]:
data_date_height = data.groupby('Date (YYYY-MM-DD)')['Elevation'].mean().reset_index()

Join aggregated subsets to get final aggregated dataframe

In [26]:
aggregate_data = data_date_precipitation.merge(data_date_pressures)

In [27]:
aggregate_data = aggregate_data.merge(data_date_radiation)

In [28]:
aggregate_data = aggregate_data.merge(data_date_temperatures)

In [29]:
aggregate_data = aggregate_data.merge(data_date_humidities)

In [30]:
aggregate_data = aggregate_data.merge(data_date_wind)

In [31]:
aggregate_data = aggregate_data.merge(data_date_latitude_longitude)

In [32]:
aggregate_data = aggregate_data.merge(data_date_height)

In [33]:
aggregate_data

Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,Atmospheric pressure at station level (mB),Maximum air pressure (mB),Minimum air pressure (mB),Solar radiation (Kj/m²),Air temperature (°C),Dew point temperature (°C),Maximum temperature (°C),Minimum temperature (°C),...,Minimum dew point temperature (°C),Maximum relative humid temperature (%),Minimum relative humid temperature (%),Relative humidity (%),Wind direction (radius degrees (0-360)),Wind gust (m/s),Wind speed (m/s),latitude,longitude,Elevation
0,2000-05-07,0.000000,886.770000,887.060000,886.600000,1880.000000,25.580000,13.530000,26.380000,24.670000,...,12.890000,53.000000,44.900000,47.600000,120.500000,4.560000,1.890000,-15.789444,-47.925833,1159.540000
1,2000-05-08,0.000000,888.425000,888.700000,888.225000,1783.833333,24.658333,13.683333,25.058333,23.516667,...,12.733333,58.416667,50.333333,52.666667,91.333333,5.466667,2.741667,-15.789444,-47.925833,1159.540000
2,2000-05-09,0.000000,888.266667,888.558333,888.041667,1446.666667,24.633333,13.450000,25.366667,23.508333,...,12.825000,57.000000,48.250000,50.750000,125.083333,4.350000,1.825000,-15.789444,-47.925833,1159.540000
3,2000-05-10,0.000000,889.250000,889.516667,889.041667,1638.416667,24.691667,14.241667,25.225000,23.708333,...,13.650000,58.666667,51.500000,53.500000,155.333333,4.683333,2.433333,-15.789444,-47.925833,1159.540000
4,2000-05-11,0.000000,888.408333,888.691667,888.233333,1675.333333,23.316667,13.266667,23.733333,22.308333,...,12.500000,58.750000,52.166667,54.583333,136.250000,5.725000,2.775000,-15.789444,-47.925833,1159.540000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7630,2021-04-26,0.005220,945.112912,945.433242,944.901236,1313.657967,24.525824,16.696841,25.220742,23.482418,...,15.982830,68.802198,61.057692,64.028846,211.315934,4.176923,1.575962,-17.801123,-52.098204,595.863310
7631,2021-04-27,0.012138,945.190897,945.525241,944.989379,1456.371034,24.488138,15.593793,25.217517,23.342621,...,14.830069,65.339310,57.172414,60.264828,189.568276,4.276138,1.651448,-17.792002,-52.090653,600.385876
7632,2021-04-28,0.019945,946.263115,946.546585,946.018989,1499.266393,24.734153,14.744945,25.417896,23.522131,...,13.934426,61.715847,53.943989,56.740437,184.575137,4.381421,1.729508,-17.711581,-52.112827,597.304549
7633,2021-04-29,0.007510,947.717942,948.011822,947.484562,1338.627260,24.184840,13.919054,24.804729,23.072740,...,13.090960,60.422809,53.150209,55.924896,148.470097,4.925591,2.079833,-17.754311,-52.110150,596.609138


# Normalize
(Fahreen)

In [40]:
z = aggregate_data.copy()
cols_to_norm = [
              'Atmospheric pressure at station level (mB)',
              'Maximum air pressure (mB)',
              'Minimum air pressure (mB)',
              'Solar radiation (Kj/m²)',
              'Air temperature (°C)',
              'Dew point temperature (°C)',
              'Maximum temperature (°C)',
              'Minimum temperature (°C)',
              'Maximum dew point temperature (°C)',
              'Minimum dew point temperature (°C)',
              'Maximum relative humid temperature (%)',
              'Minimum relative humid temperature (%)',
              'Relative humidity (%)',
              'Wind direction (radius degrees (0-360))',
              'Wind gust (m/s)',
              'Wind speed (m/s)',
              'latitude',
              'longitude',
              'Elevation']

z[cols_to_norm] = z[cols_to_norm].apply(lambda x: (x - x.mean()) / x.std())
z


Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,Atmospheric pressure at station level (mB),Maximum air pressure (mB),Minimum air pressure (mB),Solar radiation (Kj/m²),Air temperature (°C),Dew point temperature (°C),Maximum temperature (°C),Minimum temperature (°C),...,Minimum dew point temperature (°C),Maximum relative humid temperature (%),Minimum relative humid temperature (%),Relative humidity (%),Wind direction (radius degrees (0-360)),Wind gust (m/s),Wind speed (m/s),latitude,longitude,Elevation
0,2000-05-07,0.000000,-3.783941,-3.784600,-3.778856,1.835943,-0.214009,-0.817187,-0.157846,-0.156585,...,-0.793629,-0.933931,-1.000238,-0.990281,-1.120859,-0.792446,-0.566478,1.490337,3.154476,3.742280
1,2000-05-08,0.000000,-3.681935,-3.683646,-3.678636,1.427923,-0.641119,-0.777619,-0.758141,-0.715226,...,-0.833930,-0.515065,-0.574347,-0.605593,-1.846743,0.229857,1.059656,1.490337,3.154476,3.742280
2,2000-05-09,0.000000,-3.691694,-3.692367,-3.689943,-0.002623,-0.652704,-0.837831,-0.618097,-0.719262,...,-0.810350,-0.624615,-0.737648,-0.751116,-1.006791,-1.029230,-0.690587,1.490337,3.154476,3.742280
3,2000-05-10,0.000000,-3.631086,-3.633375,-3.628269,0.810942,-0.625672,-0.633542,-0.682442,-0.622388,...,-0.598131,-0.495733,-0.482898,-0.542322,-0.253946,-0.653383,0.470938,1.490337,3.154476,3.742280
4,2000-05-11,0.000000,-3.682963,-3.684159,-3.678122,0.967574,-1.262861,-0.885140,-1.359950,-1.300507,...,-0.893951,-0.489289,-0.430641,-0.460070,-0.728882,0.521138,1.123301,1.490337,3.154476,3.742280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7630,2021-04-26,0.005220,-0.187970,-0.191312,-0.183207,-0.566958,-0.702525,0.000015,-0.684376,-0.731815,...,0.001955,0.288039,0.266280,0.257083,1.139318,-1.224381,-1.166090,-1.028333,0.392340,0.050214
7631,2021-04-27,0.012138,-0.183163,-0.185649,-0.177771,0.038551,-0.719989,-0.284626,-0.685840,-0.799529,...,-0.294575,0.020257,-0.038267,-0.028701,0.598074,-1.112512,-1.021959,-1.016913,0.397339,0.079836
7632,2021-04-28,0.019945,-0.117077,-0.122778,-0.114271,0.220550,-0.605983,-0.503671,-0.594829,-0.712579,...,-0.524966,-0.259942,-0.291326,-0.296291,0.473808,-0.993801,-0.872914,-0.916224,0.382660,0.059654
7633,2021-04-29,0.007510,-0.027409,-0.032582,-0.023884,-0.461017,-0.860540,-0.716792,-0.873327,-0.930251,...,-0.741935,-0.359932,-0.353546,-0.358211,-0.424755,-0.380227,-0.204020,-0.969723,0.384432,0.055099


# PCA

In [43]:
# these are the fields from the dataset that we want to reduce
pca_target_data = z[cols_to_norm]

In [44]:
from sklearn.decomposition import PCA

# capture 95% of information
pca = PCA(0.95)

pca_data = pca.fit_transform(pca_target_data)

In [47]:
# pca_data is numpy 2d array, need to convert to pandas dataframe (for readability)
df_pca_data = pd.DataFrame(pca_data)

In [48]:
df_pca_data

Unnamed: 0,0,1,2,3,4,5
0,6.295545,3.668991,-3.777892,-3.276331,0.361848,-0.877045
1,6.683255,4.455561,-3.004912,-1.349318,0.446375,-1.437071
2,5.860399,4.558128,-2.515216,-3.409132,0.476555,-0.184169
3,5.743704,4.704287,-3.002506,-2.313170,0.689502,-0.015803
4,6.658987,5.188419,-2.181283,-1.089290,0.667546,-0.373192
...,...,...,...,...,...,...
7630,-0.644813,1.255216,0.726268,-1.372290,-1.160487,1.264777
7631,0.110162,0.777256,0.782237,-1.328049,-1.349276,0.742833
7632,0.575417,0.264045,0.813980,-1.277594,-1.266382,0.690303
7633,1.203607,0.300936,1.538408,-0.427567,-1.318976,0.333968


In [50]:
# now let's replace the pre-existing columns with the ones PCA calculated
final_data = z[['Date (YYYY-MM-DD)', 'Total precipitation in millimetres']]

pd.merge(final_data, df_pca_data, left_index=True, right_index=True)

Unnamed: 0,Date (YYYY-MM-DD),Total precipitation in millimetres,0,1,2,3,4,5
0,2000-05-07,0.000000,6.295545,3.668991,-3.777892,-3.276331,0.361848,-0.877045
1,2000-05-08,0.000000,6.683255,4.455561,-3.004912,-1.349318,0.446375,-1.437071
2,2000-05-09,0.000000,5.860399,4.558128,-2.515216,-3.409132,0.476555,-0.184169
3,2000-05-10,0.000000,5.743704,4.704287,-3.002506,-2.313170,0.689502,-0.015803
4,2000-05-11,0.000000,6.658987,5.188419,-2.181283,-1.089290,0.667546,-0.373192
...,...,...,...,...,...,...,...,...
7630,2021-04-26,0.005220,-0.644813,1.255216,0.726268,-1.372290,-1.160487,1.264777
7631,2021-04-27,0.012138,0.110162,0.777256,0.782237,-1.328049,-1.349276,0.742833
7632,2021-04-28,0.019945,0.575417,0.264045,0.813980,-1.277594,-1.266382,0.690303
7633,2021-04-29,0.007510,1.203607,0.300936,1.538408,-0.427567,-1.318976,0.333968


# Data Sampling - TODO

# Splitting into training and test sets - TODO

# Calculating mean and SD for both sets - TODO