In [None]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import collections
import colorsys
import itertools

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import webcolors
from IPython.display import Image

import haikulib.eda.colors
from haikulib import data, nlp, utils





In [None]:
# Directory that the figures in this notebook are saved to; created if missing.
data_dir = data.get_data_dir() / "experiments" / "eda" / "colors"
data_dir.mkdir(parents=True, exist_ok=True)
# pandas display options: LaTeX reprs (as longtables) and untruncated column text.
pd.set_option("display.latex.repr", True)
pd.set_option("display.latex.longtable", True)
pd.set_option('display.max_colwidth', None)
# Default figure size and seaborn grid style for the plots in this notebook.
plt.rcParams["figure.figsize"] = (16, 9)
sns.set(style="whitegrid")

In [None]:
df = data.get_df()

# Flatten each haiku onto one line: split on the "/" delimiter and re-join the
# pieces with single spaces.
corpus = [" ".join(haiku.split("/")) for haiku in df["haiku"]]

# Map every color name to its hex code. Going column-wise avoids
# DataFrame.iterrows(), which builds a Series for every row just to read two
# fields from it. Duplicate names (if any) still resolve to the last row.
color_names = haikulib.eda.colors.get_colors().set_index("color")["hex"].to_dict()

In [None]:
%%time
# Tally color names across the whole corpus, timing the cell.
# count_tokens_from receives the haiku text, the color-name mapping and
# ngrams=[1, 2, 3] — presumably it counts color names that are one to three
# words long; confirm against haikulib.nlp.
naive_colors = collections.Counter()
for haiku in corpus:
    # Update the color counts for this haiku.
    naive_colors.update(nlp.count_tokens_from(haiku, color_names, ngrams=[1, 2, 3]))


In [None]:
# One row per color that was counted, alongside its hex code.
naive_color_counts = pd.DataFrame(
    {
        "color": list(naive_colors.keys()),
        "count": list(naive_colors.values()),
        "hex": [color_names[c] for c in naive_colors],
    }
)

# Vectorized column sum instead of a Python-level loop over iterrows(); this is
# the same expression the other color summaries in this notebook use.
total_color_count = naive_color_counts["count"].sum()

print(f"There are {total_color_count} occurrences of color in the corpus")
print(f"There are {len(naive_color_counts)} unique colors")

naive_color_counts.head(10)


In [None]:
# Show the source of display_source itself and of the two helpers (is_color,
# pos_tag) used by the tagging example in this notebook.
# NOTE(review): only the final expression of a cell is rendered automatically.
# If display_source returns a displayable object instead of displaying it
# itself, the first two calls produce no output when run in one cell — confirm.
utils.display_source("haikulib.utils", "display_source")
utils.display_source("haikulib.eda.colors", "is_color")
utils.display_source("haikulib.nlp", "pos_tag")


In [None]:
# Example haiku, adjusted so that it exercises colors of all three sizes.
haiku = "dark blue lines / in a light olive green sea salt / dreams"
# Run the tagged haiku through is_color and keep the first element of every
# tagged word that passes.
haiku_colors = [
    tagged[0]
    for tagged in filter(haikulib.eda.colors.is_color, nlp.pos_tag(haiku))
]
print(haiku_colors)

In [None]:
# Show the source of find_colors, then call it on the POS-tagged example haiku.
utils.display_source("haikulib.eda.colors", "find_colors")
# NOTE(review): this result is neither assigned nor the last expression here,
# so it is not rendered if these statements run as a single cell — confirm.
haikulib.eda.colors.find_colors(nlp.pos_tag(haiku))
utils.display_source("haikulib.data.initialization", "init_csv")
# Load the dataframe again and preview its last ten rows.
df = data.get_df()
df.tail(10)

In [None]:
# Per-color counts as provided by the library.
pos_tagging_color_counts = haikulib.eda.colors.get_colors()

total_color_count = pos_tagging_color_counts["count"].sum()
# A color is "used" when its count is non-zero; this is the same test the
# histogram cell applies when it selects the used colors.
used_color_count = (pos_tagging_color_counts["count"] != 0).sum()

print(f"There are {total_color_count} occurrences of color in the corpus")
print(f"There are {used_color_count} unique colors")

pos_tagging_color_counts[["color", "count", "hex"]].head(10)

In [None]:
# Repeat the naive totals so they can be compared with the counts shown above.
total_color_count = naive_color_counts["count"].sum()

print(f"There are {total_color_count} occurrences of color in the corpus")
print(f"There are {len(naive_color_counts)} unique colors")

naive_color_counts.head(10)

In [None]:
# Display the color word cloud stored in the sibling "word_clouds" directory;
# the file is expected to exist already, as this notebook does not create it.
# Passing filename= as a string is explicit: Image does not have to guess
# whether its first positional argument is raw data, a URL or a path, and a
# missing file fails with a clear error.
Image(filename=str(data_dir / ".." / "word_clouds" / "colors.png"))


In [None]:
# All colors, ordered by the "hsv" column and then by count (both descending).
# sort_values returns a new frame, so the object handed back by get_colors()
# is not reordered as a side effect.
colors = haikulib.eda.colors.get_colors().sort_values(
    by=["hsv", "count"], ascending=False
)
# Colors that occur at least once, most frequent first. Boolean .loc selection
# followed by sort_values already yields an independent frame, so no explicit
# copy or in-place sort is needed.
used_colors = colors.loc[colors["count"] != 0].sort_values(
    by="count", ascending=False
)
# One unit-width bar per used color, colored by its "rgb" value, on a log axis.
_ = plt.bar(
    range(len(used_colors)),
    used_colors["count"],
    color=used_colors["rgb"],
    width=1,
    linewidth=0,
    log=True,
)
plt.savefig(data_dir / "histogram.svg")
plt.show()

In [None]:
def pairwise_difference(seq):
    """Yield the gaps between successive angles, closing the circle at 2*pi.

    For each ``(left, right)`` pair produced by ``utils.pairwise`` (presumably
    consecutive elements -- confirm against haikulib.utils) the difference
    ``right - left`` is yielded. One final value follows: ``2 * pi`` minus the
    last element, i.e. the gap from the last angle back around to the start.

    Args:
        seq: Iterable of angles in radians. It is materialized into a list
            first, so generators and label-indexed containers such as a pandas
            Series work as well as lists and arrays.

    Yields:
        The successive differences followed by the wrap-around gap; nothing at
        all when ``seq`` is empty.
    """
    values = list(seq)
    if not values:
        # No angles, no gaps. Indexing [-1] on an empty input used to raise.
        return
    for left, right in utils.pairwise(values):
        yield right - left
    # Loop back around to the front.
    yield 2 * np.pi - values[-1]


def accumulate(seq):
    """Yield the running total of ``seq`` as it stood *before* each element.

    The first value is always 0 and the grand total is never yielded, so
    ``accumulate([a, b, c])`` produces ``0, a, a + b``. An empty ``seq``
    yields nothing.
    """
    running_total = 0
    for term in seq:
        # Emit the total so far, then fold the current term into it.
        yield running_total
        running_total += term

In [None]:
# Order the used colors from most to least frequent (sorts the frame in place).
used_colors.sort_values(by="count", ascending=False, inplace=True)

ax = plt.subplot(111, projection="polar")

# Each color's share of the total count, expressed as an angle in radians.
thetas = 2 * np.pi * used_colors["count"] / used_colors["count"].sum()
# Starting angle of each bar: the running total of the shares before it.
thetas = np.array(list(accumulate(thetas)))
# Angular extent of each bar: the differences yielded by pairwise_difference,
# whose final value is 2*pi minus the last starting angle.
widths = np.array(list(pairwise_difference(thetas)))
# Bar heights are the natural log of the counts.
# NOTE(review): np.log(1) == 0, so a color that occurs exactly once gets a
# zero-height bar — confirm that is intended.
radii = np.log(used_colors["count"])

_ = ax.bar(
    x=thetas,
    height=radii,
    width=widths,
    color=used_colors["rgb"],
    linewidth=0,
    align="edge",
)
plt.savefig(data_dir / "count-proportional-theta-radii-width.svg")
plt.show()
# Same bars as the preceding figure: reuses the thetas and widths computed for
# it; only the height differs.
ax = plt.subplot(111, projection="polar")

_ = ax.bar(
    x=thetas,
    # Plot the same information with a fixed height.
    height=1,
    width=widths,
    color=used_colors["rgb"],
    linewidth=0,
    align="edge",
)
plt.savefig(data_dir / "count-proportional-theta-width-fixed-height.svg")
plt.show()

In [None]:
# Re-order the used colors by their "hsv" value, descending (sorts in place).
# The polar plots that follow read used_colors in this order.
used_colors.sort_values(by="hsv", ascending=False, inplace=True)


In [None]:
ax = plt.subplot(111, projection="polar")

# One evenly spaced starting angle per used color around the full circle.
thetas = np.linspace(0, 2 * np.pi, len(used_colors), endpoint=False)
# NOTE(review): the starting angles are 2*pi/len apart but each bar is
# 4*pi/len wide, so every bar overlaps the slot of the next one — confirm this
# is deliberate (e.g. to hide seams between bars).
widths = 4 * np.pi / len(used_colors)
# Bar heights are the natural log of the counts (zero for a count of one).
radii = np.log(used_colors["count"])

_ = ax.bar(
    x=thetas,
    height=radii,
    width=widths,
    color=used_colors["rgb"],
    linewidth=0,
    align="edge",
)
plt.savefig(data_dir / "hue-proportional-radii-fixed-theta-width.svg")
plt.show()
ax = plt.subplot(111, projection="polar")

# Each color's share of the total count, expressed as an angle in radians.
thetas = 2 * np.pi * used_colors["count"] / used_colors["count"].sum()
# Starting angle of each bar: the running total of the shares before it.
thetas = np.array(list(accumulate(thetas)))
# Angular extent of each bar, the last one closing the circle at 2*pi.
widths = np.array(list(pairwise_difference(thetas)))
# Not used by this figure (its height is fixed at 1); the figure plotted after
# this one reads radii.
radii = np.log(used_colors["count"])

_ = ax.bar(
    x=thetas,
    height=1,
    width=widths,
    color=used_colors["rgb"],
    linewidth=0,
    align="edge",
)
plt.savefig(data_dir / "hue-proportional-theta-width-fixed-radii.svg")
plt.show()
# Reuses the thetas, widths and radii computed for the preceding figure, this
# time with the log-count radii as the bar heights.
ax = plt.subplot(111, projection="polar")

_ = ax.bar(
    x=thetas,
    height=radii,
    width=widths,
    color=used_colors["rgb"],
    linewidth=0,
    align="edge",
)
plt.savefig(data_dir / "hue-proportional-theta-radii-width.svg")
plt.show()