In [1]:
import numpy as np
from scipy import stats
import pandas as pd

In [2]:
# El Nino buoy measurements from the UCI KDD archive:
# http://kdd.ics.uci.edu/databases/el_nino/el_nino.html

# file read to obtain the column names of the data table
data_desc = 'http://kdd.ics.uci.edu/databases/el_nino/elnino.col'
# gzip-compressed, whitespace-delimited data table
data_link = 'http://kdd.ics.uci.edu/databases/el_nino/elnino.gz'

# Preparing dataset

In [3]:
from tessa import DataFields3D, Dataset

In [4]:
# Column names are read from the companion description file.
# NOTE(review): squeeze() only yields a Series of names if that file parses to
# a single column (one name per line) -- confirm against the file.
cols = pd.read_csv(data_desc, header=None).squeeze().to_list()
data = (
    # sep=r'\s+' is the supported spelling of whitespace-delimited parsing;
    # the equivalent delim_whitespace=True is deprecated since pandas 2.2 and
    # raises a FutureWarning.
    pd.read_csv(data_link, header=None, sep=r'\s+', names=cols)
    # keep a single record (the last one) for every (buoy, day) pair
    .drop_duplicates(keep='last', subset=['buoy', 'day'])
    # rows whose humidity is the literal "." (presumably a missing-value
    # marker) cannot be cast to float, so they are dropped before the cast
    .query('humidity != "."')
    .astype({'humidity': float})
    # standardized humidity, used as the weights column by later cells
    .assign(z_humidity = lambda x: stats.zscore(x['humidity'].values))
)
data.head()

Unnamed: 0,buoy,day,latitude,longitude,zon.winds,mer.winds,humidity,air temp.,s.s.temp.,z_humidity
0,1,1,8.96,-140.32,-6.3,-6.4,83.5,27.32,27.57,-0.187797
1,1,2,8.95,-140.32,-5.7,-3.6,86.4,26.7,27.62,0.380607
2,1,3,8.96,-140.32,-6.2,-5.8,83.0,27.36,27.68,-0.285797
3,1,4,8.96,-140.34,-6.4,-5.3,82.2,27.32,27.7,-0.442598
4,1,5,8.96,-140.33,-4.9,-6.2,87.3,27.09,27.85,0.557008


In [5]:
# Map dataframe columns onto the three tensor axes; the standardized humidity
# column supplies the weights.
data_fields = DataFields3D(
    "latitude",
    "longitude",
    "day",
    weights='z_humidity',
)
data_fields

DataFields3D(x='latitude', y='longitude', z='day', weights='z_humidity')

In [6]:
# Bundle the prepared dataframe with its field description under a display name.
dataset = Dataset(
    data,
    data_fields,
    name='El Nino',
)

In [7]:
# The tensor representation is read while the 'spatio_temporal_tensor' format
# is active, i.e. inside the context manager.
with dataset.format('spatio_temporal_tensor'):
    formatted = dataset.data
    # every element but the last goes to `tensor_data` (as a list);
    # the last element is bound to `data_index`
    *tensor_data, data_index = formatted

In [8]:
# Split the tensor description into its three parts.
idx, vals, shape = tensor_data

# Density = number of rows in `idx` relative to the number of cells in a dense
# tensor of the given shape.
n_entries = idx.shape[0]
n_cells = np.prod(shape)
print(f'Tensor with {shape=} and density={n_entries / n_cells:.1%}')

Tensor with shape=(84, 104, 14) and density=0.5%


# Standard Tensor Factorization

In [9]:
from tessa import tensor_factors

In [10]:
# One rank per tensor mode; in the recorded run each rank shows up as the
# second dimension of the corresponding factor matrix.
standard_ranks = (10, 12, 3)
factors = tensor_factors(
    idx,
    vals,
    shape,
    standard_ranks,
    seed=123,  # fixed seed so the run is repeatable
)

growth of the core: 1.0
growth of the core: 0.1364425523591237
growth of the core: 0.009922526838207426
growth of the core: 0.001885573262431016
growth of the core: 0.006855716042607339
growth of the core: 0.015220333607973842
growth of the core: 0.004625363430118546
growth of the core: 0.00034016064864029593
Core is no longer growing. Norm of the core: 15.925474510603966.


In [11]:
# Report the shape of every factor matrix, numbered from 1.
for position, factor in enumerate(factors, start=1):
    print(f'{position} {factor.shape=}')

1 factor.shape=(84, 10)
2 factor.shape=(104, 12)
3 factor.shape=(14, 3)


# TESSA

In [12]:
from tessa import tessa_factors

In [13]:
# NOTE(review): unlike the tensor_factors call earlier in this notebook, `vals`
# is not passed here -- confirm that tessa_factors is meant to ignore weights.
# NOTE(review): this rebinds `factors`, discarding the standard factorization.
tessa_ranks = (10, 12, 2, 3)
factors = tessa_factors(
    idx,
    shape,
    tessa_ranks,
    attention_span=4,
    seed=123,  # fixed seed so the run is repeatable
)

growth of the core: 1.0
growth of the core: 0.45950796251590037
growth of the core: 0.0521364844288116
growth of the core: 0.02255310516181182
growth of the core: 0.03260984296632649
growth of the core: 0.015769502659024565
growth of the core: 0.00965233700905079
growth of the core: 0.008587852215974466
growth of the core: 0.006875455003141976
growth of the core: 0.005310163782685596
growth of the core: 0.003760457829912918
growth of the core: 0.002323053030615243
growth of the core: 0.0013311981354696514
growth of the core: 0.0007740643522595958
Core is no longer growing. Norm of the core: 20.04194540027395.


In [14]:
# Report the shape of every factor matrix, numbered from 1.
for position, factor in enumerate(factors, start=1):
    print(f'{position} {factor.shape=}')

1 factor.shape=(84, 10)
2 factor.shape=(104, 12)
3 factor.shape=(4, 2)
4 factor.shape=(11, 3)
