# Environment Initialization

In [7]:
%matplotlib widget

import matplotlib.pyplot as plt
import matplotlib.figure

import scipy.io
from statsmodels.tsa.seasonal import STL, DecomposeResult

import pandas as pd
import datetime as dt

from sklearn.cluster import KMeans

import wordcloud
from PIL import Image
import numpy as np



import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
import re

import os

# Make sure the figure output directory exists. `exist_ok=True` makes this
# idempotent and avoids the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs("./report/figures", exist_ok=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sergiovaneg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Water dataset
## Import and visualize variables

In [8]:
# Load the MATLAB file; the "Xdata" and "Ydata" entries hold the arrays
# used throughout this section.
water_dataset = scipy.io.loadmat("./WaterQualityData.mat")
N = water_dataset["Ydata"].shape[0]  # number of rows in "Ydata"
seasonal = 21  # `seasonal` argument forwarded to STL by the plotting helper

# One timestamp per row: N daily stamps starting 2016-01-28 08:00.
daily_index = pd.date_range(start=dt.datetime(2016, 1, 28, 8),
                            periods=N, freq="D")

x_data = pd.DataFrame(water_dataset["Xdata"], index=daily_index)
y_data = pd.DataFrame(water_dataset["Ydata"], index=daily_index)

def stl_custom_plot(data:pd.Series, seasonal:int, trend:int,
                    suptitle:str, save_path:str,
                    figsize:tuple[int,int]=(10,6), period = None)-> matplotlib.figure.Figure:
  """Fit a robust STL decomposition of `data` and plot it as four stacked panels.

  Args:
    data: Series to decompose; its index is used as the x axis of every panel.
    seasonal: Forwarded to `STL` as `seasonal`.
    trend: Forwarded to `STL` as `trend`.
    suptitle: Figure-level title.
    save_path: Destination handed to `Figure.savefig`.
    figsize: Figure size handed to `plt.figure`.
    period: Forwarded to `STL` as `period`.

  Returns:
    The figure, already written to `save_path`. It is not closed here.
  """
  decomposition = STL(data, seasonal=seasonal, trend=trend, robust=True,
                      period=period).fit()

  # (component, y-axis label), listed top to bottom.
  panels = (
    (decomposition.observed, "Original"),
    (decomposition.trend, "Trend"),
    (decomposition.seasonal, "Seasonality"),
    (decomposition.resid, "Residual"),
  )

  fig = plt.figure(figsize=figsize)
  axes = fig.subplots(len(panels), 1, sharex=True)
  for ax, (component, label) in zip(axes, panels):
    ax.plot(data.index, component)
    ax.set_ylabel(label)
  # Tighten the x limits only after every panel has been drawn.
  for ax in axes:
    ax.autoscale(enable=True, axis="x", tight=True)
  fig.suptitle(suptitle)
  fig.savefig(fname=save_path)

  return fig

# (series, figure title, output path) for each automatic-period decomposition.
automatic_decompositions = (
  (x_data.iloc[:,0].squeeze(),
   "Input automatic STL decomposition",
   "./report/figures/x_decomp.eps"),
  (y_data.squeeze(),
   "Output automatic STL decomposition",
   "./report/figures/y_decomp.eps"),
)

for series, title, path in automatic_decompositions:
  fig = stl_custom_plot(series, seasonal, None, title, path)
  # The figure has already been saved by the helper; close it to free memory.
  plt.close(fig)

## Seasonality Test

In [9]:
# Decompose the output series with several candidate periods (in samples;
# the index built for `y_data` is daily).
# All three figures plot `y_data`, so all three are titled "Output" - the
# weekly one was previously mislabelled "Input".
candidate_periods = (
  ("weekly", 7, "./report/figures/weekly_decomp.eps"),
  ("monthly", 30, "./report/figures/monthly_decomp.eps"),
  ("quarterly", 90, "./report/figures/quarterly_decomp.eps"),
)

for label, period, path in candidate_periods:
  fig = stl_custom_plot(y_data.squeeze(), seasonal, None,
                        f"Output STL decomposition\nPeriod: {label}",
                        path, period=period)
  # The figure has already been saved by the helper; close it to free memory.
  plt.close(fig)

## Output-variable K-Means

In [10]:
n_clusters = 4
# K-Means initialisation is random: pin the seed so that the cluster labels
# (and therefore the colour each cluster gets below) are reproducible across
# kernel restarts.
y_clusters = KMeans(n_clusters=n_clusters, n_init=100,
                    random_state=0).fit_predict(y_data)

# One scatter call per cluster so each cluster gets its own colour.
fig, ax = plt.subplots(figsize=(16,9))
for cluster in range(n_clusters):
  cluster_idxs = (y_clusters == cluster)
  ax.scatter(y_data.index[cluster_idxs], y_data.iloc[cluster_idxs,0])
fig.savefig("./report/figures/clustered_by_output.eps")
plt.close(fig)

## Multi-variate K-Means

In [11]:
n_clusters = 4
# K-Means initialisation is random: pin the seed so that the cluster labels
# (and therefore the colour each cluster gets below) are reproducible across
# kernel restarts.
x_clusters = KMeans(n_clusters=n_clusters, n_init=100,
                    random_state=0).fit_predict(x_data)

# Clusters are computed on the inputs but drawn on the output series, one
# scatter call per cluster so each cluster gets its own colour.
fig, ax = plt.subplots(figsize=(16,9))
for cluster in range(n_clusters):
  cluster_idxs = (x_clusters == cluster)
  ax.scatter(y_data.index[cluster_idxs], y_data.iloc[cluster_idxs,0])
fig.savefig("./report/figures/clustered_by_input.eps")
plt.close(fig)

# Sequential Text Data
## Pre-Processing

In [12]:
# Read the whole text; the context manager closes the file handle.
with open("./dagon.txt", "r") as text_file:
  dagon_text = text_file.read()

# Grayscale version of the picture, inverted (255 - value) and cropped to
# rows 30:1150, columns 20:1200.
with Image.open("./cthulhu.png") as mask_image:
  ctulhu_mask = 255 - np.array(mask_image.convert("L"))[30:1150, 20:1200]

# Union of the NLTK English stop words and wordcloud's built-in list.
# NOTE: this rebinds `stopwords` (imported from nltk.corpus) to a set, and the
# cells below rely on that. The corpus is therefore reached through
# `nltk.corpus.stopwords` so the cell can be re-run without hitting
# "'set' object has no attribute 'words'".
stopwords = set.union(set(nltk.corpus.stopwords.words("english")),
                      set(wordcloud.STOPWORDS))

wc = wordcloud.WordCloud(background_color="black", max_words=100,
                         mask=ctulhu_mask, stopwords=stopwords,
                         contour_width=1, contour_color="green",
                         mode="RGB")

# Single-word cloud straight from the raw text.
wc.generate(dagon_text)
wc.to_file("./report/figures/wordcloud.png")

# Lower-case the text and drop every character that is neither a word
# character nor whitespace, then tokenise.
dagon_text_stripped = re.sub(r"[^\w\s]", "", dagon_text.lower())
dagon_text_tokenized = word_tokenize(dagon_text_stripped)
dagon_text_processed = " ".join(dagon_text_tokenized)

# Count adjacent token pairs, skipping a pair only when BOTH of its words are
# stop words.
bigram = {}
for first, second in zip(dagon_text_tokenized, dagon_text_tokenized[1:]):
  if not (first in stopwords and second in stopwords):
    pair = f"{first} {second}"
    bigram[pair] = bigram.get(pair, 0) + 1

wc.generate_from_frequencies(bigram)
wc.to_file("./report/figures/wordcloud_2N.png")

# Count runs of three adjacent tokens, skipping any run that contains a stop
# word.
trigram = {}
for (a,b,c) in zip(dagon_text_tokenized[:-2], dagon_text_tokenized[1:-1], dagon_text_tokenized[2:]):
  if a in stopwords or b in stopwords or c in stopwords:
    continue
  triple = " ".join([a,b,c])
  # Look the key up in `trigram` itself: the previous membership test against
  # `bigram` was never true for a three-word key, so every count stayed at 1.
  trigram[triple] = trigram.get(triple, 0) + 1

wc.generate_from_frequencies(trigram)
wc.to_file("./report/figures/wordcloud_3N.png")

<wordcloud.wordcloud.WordCloud at 0x7ff4cbeac910>