In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

wdir = "/content/drive/MyDrive/magistrale/tesi/timeseries"
if os.path.abspath(os.curdir) != wdir:
  os.chdir(wdir)
print(f"Work dir: {wdir}")
print(f"@{wdir}: {os.listdir()}")

Work dir: /content/drive/MyDrive/magistrale/tesi/timeseries
@/content/drive/MyDrive/magistrale/tesi/timeseries: ['extraction.ipynb', 'timeseries.zip', 'conversionTime.csv', 's1', 's10', 's11', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 'repr_tsIDs.txt']


In [3]:
import zipfile
convTime = "conversionTime.csv"
zip_name = "timeseries.zip"

if convTime not in os.listdir():
    zip_file = os.path.join(wdir, zip_name)
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(wdir)
    #print(f"@{wdir}: {os.listdir()}")
    print(f"Extracted {zip_name} @{wdir}")
else:
    print(f"Already extracted {zip_name} @{wdir}")

Already extracted timeseries.zip @/content/drive/MyDrive/magistrale/tesi/timeseries


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

###Exploration of timeseries

In [5]:
# imports
import re
import yaml
import time

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [6]:
LOG = "repr_tsIDs.txt"
DATE_COL = "Date"

In [9]:
def repr_tsIDs(df, logfile) -> {}:
    df = df.drop(DATE_COL, axis = 1)
    report = {}
    report["Length"] = len(df)
    report["Shape"] = [len(df), len(df.columns), ]

    print("NaN analysis")
    print("NaN analysis", file = logfile)
    nans = df.isnull().sum().to_dict()
    report["NaN Number"] = nans
    report["NaN %"] = {}
    report["Nan fill strat"] = {}
    good_nan_perc = 0.30
    cols_bad_nan = []
    for tsID, nan_num in nans.items():
        nan_percentage = nan_num / len(df)
        report["NaN %"][tsID] = nan_percentage
        report["Nan fill strat"][tsID] = ["ffil", "bfill"] if nan_percentage < good_nan_perc else ["0"]
        if nan_percentage < good_nan_perc:
            df[tsID] = df[tsID].fillna(method = "ffill").fillna(method = "bfill")
        else:
            cols_bad_nan.append(tsID)
            df[tsID] = df[tsID].fillna(0.0)
    report["Bad Nan cols"] = {}
    report["Bad Nan cols"]["Tot num"] = len(cols_bad_nan)
    report["Bad Nan cols"]["tsIDs"] = cols_bad_nan
    report["Bad Nan cols"]["Perc bad nan"] = len(cols_bad_nan) / len(df.columns)
    
    df = df.drop(cols_bad_nan, axis=1)
    report["Good Nan cols"] = {}
    report["Good Nan cols"]["tsIDs"] = list(df.columns)
    report["Good Nan cols"]["Tot num"] = len(df.columns)
    print(f"Good NaN: {len(df.columns)}")
    print(f"Good NaN: {len(df.columns)}", file = logfile)
    if len(df.columns) < 2:
        return report

    print("PCA")
    print("PCA", file = logfile)
    report["PCA"] = {}
    exp_var = 0.95
    report["PCA"]["Explained variance"] = exp_var
    
    dimReduct = PCA(n_components = exp_var)
    X = df.to_numpy()
    X = np.transpose(X)
    Y = dimReduct.fit_transform(X)
    report["PCA"]["Reduced dimensions"] = Y.shape[-1]
    report["PCA"]["Explained variance ratio"] = dimReduct.explained_variance_ratio_.tolist()
    
    print("KMeans")
    print("KMeans", file = logfile)
    report["KMeans"] = {}
    k_num = 10
    report["KMeans"]["Num of evaluated configurations"] = k_num
    seed = 42
    report["KMeans"]["Random state"] = seed
    sizes = np.unique(
        np.geomspace(2, len(df.columns), num = k_num, endpoint = False, dtype=int)
    ).tolist()
    print(f"KMeans: sizes: {sizes}")
    print(f"KMeans: sizes: {sizes}", file = logfile)

    report["KMeans"]["Size"] = {}
    best_k = -1
    best_kmeans = None
    best_score = float("inf")
    for k in sizes:
        print(f"KMeans: size: {k}")
        print(f"KMeans: size: {k}", file = logfile)
        report["KMeans"]["Size"][k] = {}
        kmeans = KMeans(n_clusters = k, random_state = seed)
        kmeans.fit_predict(Y)
        sscore = silhouette_score(Y, kmeans.labels_)
        report["KMeans"]["Size"][k]["Score"] = float(sscore)
        if sscore < best_score:
            best_k = k
            best_score = float(sscore)
            best_kmeans = kmeans
    
    print(f"Best KMeans: {best_k}")
    print(f"Best KMeans: {best_k}", file = logfile)
    report["KMeans"]["Best silhouette"] = {}
    report["KMeans"]["Best silhouette"]["Size"] = best_k
    report["KMeans"]["Best silhouette"]["Score"] = best_score
    report["KMeans"]["Best silhouette"]["Clusters"] = {}
    for i, center in enumerate(best_kmeans.cluster_centers_):
        closestIDX = np.argmin(np.linalg.norm(Y - center, axis=1))
        report["KMeans"]["Best silhouette"]["Clusters"][i] = {}
        report["KMeans"]["Best silhouette"]["Clusters"][i]["Centroid"] = center.tolist()
        report["KMeans"]["Best silhouette"]["Clusters"][i]["Closest column"] = df.columns[closestIDX]
        cluster_dim = np.sum(best_kmeans.labels_ == i)
        report["KMeans"]["Best silhouette"]["Clusters"][i]["Dimension"] = int(cluster_dim)

    return report

In [10]:
log = open(LOG, "w")
store = True
try:
    dirs = list(filter(
        lambda t: os.path.isdir(t[1]), 
        [(f, os.path.join(wdir, f)) for f in os.listdir()]
    ))
    print(f"Groups: {[g for g, _ in dirs]}", file=log)
    for group, groupPath in dirs:
        dims = list(
            map(lambda csv: (csv[:-4], os.path.join(groupPath, csv)), 
                filter(lambda f: re.match(".+\.csv", f) is not None,
                        os.listdir(groupPath)
                )
            )
        )
        for dim, dimPath in dims:
            print(f"[{group}] Dimension: {dim}", file=log)
            print(f"[{group}] Dimension: {dim}")
            df = pd.read_csv(dimPath)

            t0 = time.time()
            data = repr_tsIDs(df, logfile = log)
            deltaT = time.time() - t0
            data["Time (s)"] = deltaT
            print(f"[{group}:{dim}] Time: {deltaT} s", file=log)
            print(f"[{group}:{dim}] Time: {deltaT} s")
            if not store:
                continue
            yaml_file = f"{dim}_repr_tsIDs.txt"
            yaml_path = os.path.join(groupPath, yaml_file)
            with open(yaml_path, "w") as yf:
                yaml.dump(data, yf, default_flow_style=False)
finally:
    log.close()

[s1] Dimension: somma_progressiva
NaN analysis
Good NaN: 5
PCA
KMeans
KMeans: sizes: [2, 3, 4]
KMeans: size: 2
KMeans: size: 3
KMeans: size: 4
Best KMeans: 4
[s1:somma_progressiva] Time: 0.05360293388366699 s
[s10] Dimension: not_specified
NaN analysis
Good NaN: 3
PCA
KMeans
KMeans: sizes: [2]
KMeans: size: 2
Best KMeans: 2
[s10:not_specified] Time: 0.017436504364013672 s
[s11] Dimension: net_amount
NaN analysis
Good NaN: 1
[s11:net_amount] Time: 0.003173351287841797 s
[s2] Dimension: SC_da_Media_Settimanale_Venduto
NaN analysis
Good NaN: 5
PCA
KMeans
KMeans: sizes: [2, 3, 4]
KMeans: size: 2
KMeans: size: 3
KMeans: size: 4
Best KMeans: 4
[s2:SC_da_Media_Settimanale_Venduto] Time: 0.05198216438293457 s
[s3] Dimension: conteggio_articoli
NaN analysis
Good NaN: 13
PCA
KMeans
KMeans: sizes: [2, 3, 4, 5, 6, 7, 8, 10]
KMeans: size: 2
KMeans: size: 3
KMeans: size: 4
KMeans: size: 5
KMeans: size: 6
KMeans: size: 7
KMeans: size: 8
KMeans: size: 10
Best KMeans: 10
[s3:conteggio_articoli] Time: 2