In [90]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from mpl_toolkits.mplot3d import Axes3D

In [92]:
# Read the raw weatherAUS table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(filepath_or_buffer="D:/workshop/data/aussie_rain/weatherAUS.csv")

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [93]:
# Show the column labels of the loaded frame (used to pick what to drop below).
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

# Cleaning

In [94]:
# Clean the loaded frame into an all-numeric table without missing values.
#
# The date, location and wind-direction columns are dropped rather than encoded.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
# NOTE(review): the flip side is that a misspelled column name is silently
# skipped, so keep this list in sync with the output of `df.columns`.
df = df.drop(
    columns=['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
    errors='ignore',
)

# Keep only the rows that have a value in every remaining column.
df = df.dropna(how='any')

# Encode the two Yes/No rain flags as 1/0 integers. Rows that already hold
# 0/1 (cell re-run) match neither comparison and are left untouched; any other
# leftover string makes astype(int) raise instead of passing through silently.
for flag_col in ('RainToday', 'RainTomorrow'):
    df.loc[df[flag_col] == 'No', flag_col] = 0
    df.loc[df[flag_col] == 'Yes', flag_col] = 1
    df[flag_col] = df[flag_col].astype(int)

# Summarise row count, dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58090 entries, 6049 to 142302
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        58090 non-null  float64
 1   MaxTemp        58090 non-null  float64
 2   Rainfall       58090 non-null  float64
 3   Evaporation    58090 non-null  float64
 4   Sunshine       58090 non-null  float64
 5   WindGustSpeed  58090 non-null  float64
 6   WindSpeed9am   58090 non-null  float64
 7   WindSpeed3pm   58090 non-null  float64
 8   Humidity9am    58090 non-null  float64
 9   Humidity3pm    58090 non-null  float64
 10  Pressure9am    58090 non-null  float64
 11  Pressure3pm    58090 non-null  float64
 12  Cloud9am       58090 non-null  float64
 13  Cloud3pm       58090 non-null  float64
 14  Temp9am        58090 non-null  float64
 15  Temp3pm        58090 non-null  float64
 16  RainToday      58090 non-null  int64  
 17  RainTomorrow   58090 non-null  int64  
dtypes: floa

In [95]:
# Feature matrix: every cleaned column except the 'RainTomorrow' column.
X = df.drop(columns=['RainTomorrow'])

# Display (rows, columns) of the feature matrix.
X.shape

(58090, 17)

# Splitting

In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. A fixed random_state makes the
# shuffle repeatable, so re-running the notebook yields the same split
# instead of a different train/test partition on every execution.
SPLIT_SEED = 42
X_train, X_test = train_test_split(X, test_size=0.2, random_state=SPLIT_SEED)

# Display the shapes of both partitions.
X_train.shape, X_test.shape

((46472, 17), (11618, 17))

In [103]:
# Write both partitions to CSV, omitting the DataFrame index column.
# NOTE(review): absolute, machine-specific output paths.
X_train.to_csv(path_or_buf=r"D:\workshop\data\aussie_rain\train.csv", index=False)
X_test.to_csv(path_or_buf=r"D:\workshop\data\aussie_rain\test.csv", index=False)

# Check Data Module

In [127]:
# Import the data module from the project's training script.
from aussie_rain_train import AusRainDataModule
# Build it with its default constructor arguments.
dm = AusRainDataModule()

In [128]:
# Run the data module's setup for the 'fit' stage. The DEBUG/INFO log lines in
# the output report the tensor shapes and the train/validation sample counts.
dm.setup(stage='fit')

[32m12:29:45[0m | [34m[1mDEBUG   [0m | [34m[1mX shape - (46472, 17)[0m
[32m12:29:45[0m | [34m[1mDEBUG   [0m | [34m[1mX_train      - torch.Size([37177, 17])   - torch.float32[0m
[32m12:29:45[0m | [34m[1mDEBUG   [0m | [34m[1mX_val shape  - torch.Size([9295, 17])     - torch.float32[0m
[32m12:29:45[0m | [1mINFO    [0m | [1mTotal Dataset       : 46472 samples[0m
[32m12:29:45[0m | [1mINFO    [0m | [1mTrain Dataset       : 37177 samples[0m
[32m12:29:45[0m | [1mINFO    [0m | [1mValidation Dataset  : 9295 samples[0m


In [129]:
# Take the first item of the training dataset ...
sample = dm.train_dataset[0]
# ... and display its shape and dtype.
sample.shape, sample.dtype

(torch.Size([17]), torch.float32)

# Check Model

In [130]:
# Import the autoencoder model from the project's training script.
from aussie_rain_train import AusRainAutoencoder

input_size = 17  # Number of input features (the sample inspected above has 17 elements)
encoding_dim = 3  # Size of the encoded representation produced by the encoder

# Construct the model; arguments are passed positionally as (input_size, encoding_dim).
autoencoder_model = AusRainAutoencoder(input_size, encoding_dim)

In [131]:
# Smoke-test the freshly constructed model on a single sample; the logged
# shapes show the input, the encoder output and the decoder output.
out = autoencoder_model(sample)
# The output shape should match the input shape.
out.shape

[32m12:30:06[0m | [34m[1mDEBUG   [0m | [34m[1mInput shape : torch.Size([17])[0m
[32m12:30:06[0m | [34m[1mDEBUG   [0m | [34m[1mEncoder Output shape : torch.Size([3])[0m
[32m12:30:06[0m | [34m[1mDEBUG   [0m | [34m[1mDecoder Output shape : torch.Size([17])[0m


torch.Size([17])

# Use Encoder

In [None]:
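# TODO: encode the data with the trained autoencoder's encoder.
# NOTE: `model` and `X_tensor` are not defined in the cells above; a trained
# model and an input tensor must be created before this line can be enabled.
# encoded_data = model.encoder(X_tensor).detach().numpy()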