In [6]:
# !pip install dataframe-image 
import dataframe_image as dfi
# !pip install playsound pygobject
from playsound import playsound

import pandas as pd
import numpy as np
import torch
from torch import nn
from pathlib import Path
from matplotlib import pyplot as plt

In [7]:
from pathlib import Path
from matplotlib import pyplot as plt

CHAPTER = 'ch07'
BOOK_IMAGES_DIR = Path.home() / 'code' / 'tangibleai' / 'nlpia-manuscript' 
BOOK_IMAGES_DIR /= Path('manuscript') / 'images' / CHAPTER
CODE_IMAGES_DIR = Path.home() / 'code' / 'tangibleai' / 'nlpia2' / 'src' / 'nlpia2' / 'images' / CHAPTER 
IMAGES_DIR = CODE_IMAGES_DIR
IMAGES_DIR.mkdir(exist_ok=True,parents=True)

def savefig(ax, filename, **kwargs):
    filepath = IMAGES_DIR / filename
    if isinstance(ax, (list, tuple)):
        ax = ax[0]
    if hasattr(ax, 'figure'):
        return ax.figure.savefig(filepath, **kwargs)
    if hasattr(ax, 'savefig'):
        return ax.savefig(filepath, **kwargs)
    return plt.savefig(filepath)


In [8]:
# .Chapter imports

HOME_DATA_DIR = Path.home() / '.nlpia2-data'

In [9]:
def calc_conv_out_seq_len(seq_len, kernel_len, stride=1, dilation=1, padding=0):
    """ L_out = 1 + (L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride """
    return 1 + (seq_len + 2 * padding - dilation * (kernel_len - 1) - 1) // stride

In [10]:
# def total_out_seq_len(seq_len, kernel_lengths, pool_lengths=None, stride=1, dilation=1, padding=0):
#     """ Calculate the number of encoding dimensions output from CNN layers

#     From PyTorch docs:
#       L_out = 1 + (L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride
#     But padding=0 and dilation=1, because we're only doing a 'valid' convolution.
#     So:
#       L_out = 1 + (L_in - (kernel_size - 1) - 1) // stride

#     source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
#     """
#     if pool_lengths is None:
#         pool_lengths = kernel_lengths
#     out_pool_total = 0
#     for kernel_len in kernel_lengths:
#         conv_output_len = calc_conv_out_seq_len(
#             seq_len=seq_len, kernel_len=kernel_len, stride=stride,
#             dilation=dilation, padding=padding
#         )
#         out_pool = calc_conv_out_seq_len(
#             seq_len=conv_output_len,
#             kernel_len=kernel_len,
#             stride=stride,
#             dilation=dilation, padding=padding
#         )
#         out_pool_total += out_pool

#     # return the len of a "flattened" vector that is passed into a fully connected (Linear) layer
#     return out_pool_total

In [12]:
# .CNN hyperparameters
# [source,python]
# ----
class CNNTextClassifier(nn.Module):

    def __init__(self, embeddings=torch.rand(10_000, 50)):
        super().__init__()

        self.seq_len = 40                               # <1>
        self.vocab_size = embeddings.shape[0]           # <2>
        self.embedding_size = embeddings.shape[1]       # <3>
        self.out_channels = 5                           # <4>
        self.kernel_lengths = [2, 3, 4, 5, 6]           # <5>
        self.stride = 1                                 # <6>
        self.dropout = nn.Dropout(0)                    # <7>
        self.pool_stride = self.stride                  # <8>
# ----
# <1> `N_`: assume a maximum text length of 40 tokens
# <2> `V`: number of unique tokens (words) in your vocabulary
# <3> `E`: number of word embedding dimensions (kernel input channels)
# <4> `F`: number of filters (kernel output channels)
# <5> `K`: number of columns of weights in each kernel
# <6> `S`: number of time steps (tokens) to slide the kernel forward with each step
# <7> `D`: portion of convolution output to ignore. 0 dropout increases overfitting
# <8> `P`: pooling strides greater than 1 will increase feature reduction

# .CNN embedding
# [source,python]
# ----
        self.embed = nn.Embedding(
            self.vocab_size,
            self.embedding_size,                        # <1>
            padding_idx=0)
        state = self.embedding.state_dict()
        state['weight'] = embeddings                    # <2>
        
# ----
# <1> for pretrained 50-D GloVe vectors set embedding_size to 50

        self.convolvers = []
        self.poolers = []
        self.total_out_len = 0
        for i, kernel_len in enumerate(self.kernel_lengths):
            self.convolvers.append(
                nn.Conv1d(
                    in_channels=self.embedding_size,
                    out_channels=self.out_channels,
                    kernel_size=kernel_len,
                    stride=self.stride))
            print(f'conv[{i}].weight.shape: {self.convolvers[-1].weight.shape}')
            conv_output_len = calc_conv_out_seq_len(
                seq_len=self.seq_len, kernel_len=kernel_len, stride=self.stride)
            print(f'conv_output_len: {conv_output_len}')
            self.poolers.append(
                nn.MaxPool1d(
                    kernel_size=conv_output_len,
                    stride=self.stride))                            # <7>
            self.total_out_len += calc_conv_out_seq_len(
                seq_len=conv_output_len,
                kernel_len=conv_output_len,
                stride=self.stride)
            print(f'pool_output_len: {pool_output_len}')
            self.pool_lengths.append(pool_output_len)
            # Given input size: (32x1x34). Calculated output size: (32x1x0). Output size is too small
            print(f'poolers[{i}]: {self.poolers[-1]}')
        print(f'sum(self.pool_lengths): {sum(self.pool_lengths)}')
        self.conv_output_size = calc_output_seq_len(seq_len=self.seq_len, kernel_lengths=self.kernel_lengths)  # <8>
        print(f'calc_output_seq_len: {self.conv_output_size}')
        self.linear_layer = nn.Linear(self.out_channels * sum(self.pool_lengths), 1)
        print(f'linear_layer: {self.linear_layer}')

In [None]:
    def forward(self, x):
        """ Takes sequence of integers (token indices) and outputs binary class label """

        x = self.embedding(x)

        conv_outputs = []
        for (conv, pool) in zip(self.convolvers, self.poolers):
            z = conv(x)
            z = torch.relu(z)
            z = pool(z)
            conv_outputs.append(z)

        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat(conv_outputs, 2)
        union = union.reshape(union.size(0), -1)

        # The "flattened" vector is passed through a fully connected layer
        out = self.linear_layer(union)
        # Dropout is applied
        out = self.dropout(out)
        # Activation function is applied
        out = torch.sigmoid(out)

        return out.squeeze()

In [45]:
# .Learnable embedding layer

from torch import nn

embedding = nn.Embedding(
    num_embeddings=10000,  # <1>
    embedding_dim=50,      # <2>
    padding_idx=0)

# <1> you must use the same size here as you use in your tokenizer
# <2> the smallest useful GloVE embeddings have 50 dimensions

In [46]:
embedding

Embedding(3000, 50, padding_idx=0)

In [47]:
# Listing 7.TBD12
# .Load news posts

df = pd.read_csv(HOME_DATA_DIR / 'news.csv')
df = df[['text', 'target']]  # <1>
print(df)

# <1> you only need the text and binary newsworthiness label for your CNN training

                                                   text  target
0     Our Deeds are the Reason of this #earthquake M...       1
1                Forest fire near La Ronge Sask. Canada       1
2     All residents asked to 'shelter in place' are ...       1
...                                                 ...     ...
7850  And in its broadest sense, neural Darwinism im...      -1
7851  In the 1980's, Edelman's theory was so novel t...      -1
7852  Over a lifetime, I have written millions of wo...      -1

[7853 rows x 2 columns]


In [48]:
# // Listing 7.TBD13
# .Most common words for your vocabulary

import re
from collections import Counter
from itertools import chain

counts = Counter(chain(*[
    re.findall(r'\w+', t.lower()) for t in df['text']]))     # <1>
vocab = [tok for tok, count in counts.most_common(4000)[3:]] # <2>
vocab += ['sacred']

print(counts.most_common(6))

# <1> tokenizing, case folding, and occurrence counting
# <2> ignore the 3 most frequent tokens ("t", "co", "http")


[('t', 5235), ('co', 4740), ('http', 4309), ('the', 3667), ('a', 2366), ('to', 2185)]


In [49]:
# // Listing 7.TBD14 
# .Multipurpose padding function

def pad(sequence, pad_value, seq_len):
    padded = list(sequence)[:seq_len]
    padded = padded + [pad_value] * (seq_len - len(padded))
    return padded

In [50]:
# .Load embeddings and align with your vocabulary

from nessvec.files import load_vecs_df

glove = load_vecs_df(HOME_DATA_DIR / 'glove.6B.50d.txt')
vocab = [tok for tok in vocab if tok in glove.index]         # <1>
embed = glove.loc[vocab]                                     # <2>

print(f'embed.shape: {embed.shape}')
print(f'vocab:\n{pd.Series(vocab)}')

# <1> skip unknown embeddings; alternatively create zero vectors
# <2> ensure your embedding matrix is in the same order as your vocab

2022-05-27:12:35:27.193 DEBUG files.py:271 skiprows=1


embed.shape: (3846, 50)
vocab:
0             a
1            to
2            in
         ...   
3843    objects
3844     chases
3845     sacred
Length: 3846, dtype: object


In [51]:
# Figure 7.TBD
# cnn-embeddings-glove-words-are-sacred.drawio.png

words = embed.loc[['words', 'are', 'sacred']].T
words['.'] = 0.0
words['<pad>'] = 0.0
roundwords = words.round(2)
roundwords

Unnamed: 0,words,are,sacred,.,<pad>
0,-0.07,0.96,0.38,0.0,0.0
1,0.66,0.01,1.32,0.0,0.0
2,-0.06,0.22,-1.54,0.0,0.0
...,...,...,...,...,...
47,0.82,0.14,-0.51,0.0,0.0
48,-0.54,-0.38,-0.70,0.0,0.0
49,-0.00,-0.39,-1.41,0.0,0.0


In [52]:
class CNNTextClassifier(nn.Module):

    def __init__(self, vocab_size=5, embedding_size=50):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=5,
            embedding_dim=50,
            padding_idx=0)
        self.conv = nn.Conv1d(
            in_channels=50,
            out_channels=50,
            groups=50,
            kernel_size=2,
            stride=1)
    
    def forward(self, x):
        
        embeddings = self.embedding(x)
        print(f"embeddings.size(): {embeddings.size()}")
        print(f"embeddings:\n{embeddings}")
        features = self.conv(embeddings)
        print(f"features.size(): {features.size()}")
        print(f"features:\n{features}")
        return features.squeeze()
#             z = torch.relu(z)
#             z = pool(z)
#             conv_outputs.append(z)


In [54]:
import torch
cnn = CNNTextClassifier()
index_seq = torch.tensor([[1, 2, 3, 4] + [0] * 46 ])
print(f"index_seq.dtype: {index_seq.dtype}")
print(f"index_seq.size(): {index_seq.size()}")
cnn.forward(index_seq)

index_seq.dtype: torch.int64
index_seq.size(): torch.Size([1, 50])
embeddings.size(): torch.Size([1, 50, 50])
embeddings:
tensor([[[-1.0081, -1.1699, -1.4963,  ...,  2.3020,  0.2489,  0.5018],
         [-0.2400, -0.6480, -1.3521,  ...,  0.5758, -1.1413,  1.3931],
         [-0.3120,  0.6063,  1.8722,  ..., -0.8434, -0.6202, -0.5454],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward0>)
features.size(): torch.Size([1, 50, 49])
features:
tensor([[[ 0.7111,  0.7006,  0.8373,  ...,  1.6108, -0.9101,  0.4913],
         [ 0.3074, -0.0479, -0.0912,  ...,  0.9394,  0.0321,  1.3873],
         [ 0.7862,  1.0993,  0.8208,  ...,  0.5600,  0.5075,  0.5376],
         ...,
         [ 0.3920,  0.3920,  0.3920,  ...,  0.3920,  0.3920,  0.3920],
         [ 0.0987,  0.0987,  0.0987,  ...,  0.0987

tensor([[ 0.7111,  0.7006,  0.8373,  ...,  1.6108, -0.9101,  0.4913],
        [ 0.3074, -0.0479, -0.0912,  ...,  0.9394,  0.0321,  1.3873],
        [ 0.7862,  1.0993,  0.8208,  ...,  0.5600,  0.5075,  0.5376],
        ...,
        [ 0.3920,  0.3920,  0.3920,  ...,  0.3920,  0.3920,  0.3920],
        [ 0.0987,  0.0987,  0.0987,  ...,  0.0987,  0.0987,  0.0987],
        [-0.3255, -0.3255, -0.3255,  ..., -0.3255, -0.3255, -0.3255]],
       grad_fn=<SqueezeBackward0>)

In [55]:
filepath = Path('df-glove-vectors-sacred-pad-1.png')
if filepath.is_file():
    filepath.unlink()
dfi.export(roundwords, filepath, max_rows=7)

[0527/123616.786132:ERROR:sandbox_linux.cc(377)] InitializeSandbox() called with multiple threads in process gpu-process.
[0527/123617.056794:INFO:headless_shell.cc(659)] Written to file /tmp/tmp2hzke_7k/temp.png.


![df-glove-vectors-sacred-pad.png](df-glove-vectors-sacred-pad.png)

In [56]:
# .Initialize your embedding layer with GloVE vectors

import torch
embed = torch.Tensor(embed.values)                         # <1>
print(f'embed.size(): {embed.size()}')
embed = nn.Embedding.from_pretrained(embed, freeze=False)  # <2>
print(embed)

# <1> convert Pandas DataFrame to a torch.Tensor
# <2> freeze=False allows your Embedding layer to fine-tune your embeddings

embed.size(): torch.Size([3846, 50])
Embedding(3846, 50)


In [57]:
# .Initialize your CNN hyperparameters

class CNNTextClassifier(nn.Module):

    def __init__(self, embeddings=glove):
        super().__init__()

        self.seq_len = 35                         # <1>
        self.vocab_size = embeddings.shape[0]
        self.embedding_size = embeddings.shape[1]
        self.kernel_lengths = [2]                 # <2>
        self.stride = 2
        self.conv_output_size = 50                # <3>

        self.dropout = nn.Dropout(.2)

        self.embedding = nn.Embedding(self.vocab_size + 1, self.embedding_size, padding_idx=0)

        self.convolvers = []
        self.poolers = []
        for i, kernel_len in enumerate(self.kernel_lengths):
            self.convolvers.append(nn.Conv1d(self.seq_len, self.conv_output_size, kernel_len, self.stride))
            self.poolers.append(nn.MaxPool1d(kernel_len, self.stride))

        self.encoding_size = self.cnn_output_size()
        self.linear_layer = nn.Linear(self.encoding_size, 1)

# <1> assume a maximum text length of 35 tokens
# <2> only one kernel layer is needed for reasonable results
# <3> the convolution output need not have the same number of channels as your embeddings


    def cnn_output_size(self):
        """ Calculate the number of encoding dimensions output from CNN layers

        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1

        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        """
        out_pool_total = 0
        for kernel_len in self.kernel_lengths:
            out_conv = ((self.embedding_size - 1 * (kernel_len - 1) - 1) / self.stride) + 1
            out_conv = math.floor(out_conv)
            out_pool = ((out_conv - 1 * (kernel_len - 1) - 1) / self.stride) + 1
            out_pool = math.floor(out_pool)
            out_pool_total += out_pool

        # Returns "flattened" vector (input for fully connected layer)
        return out_pool_total * self.conv_output_size

In [58]:
np.array(embed).shape



()

In [None]:
import json
import pandas as pd
pd.options.display.max_columns = 100
pd.options.display.max_rows = 40
from pathlib import Path

paths = list((Path.home() / '.nlpia2-data' / 'log').glob('*'))
df = []
for p in paths:
    d = json.load(p.open())
    df.append({k:d.get(k) for k in d.keys() if k not in ('learning_curve', 'y_test', 'y_train')})
    df[-1]['filename'] = p.name[-12:-5]
df = pd.DataFrame(df)
df.sort_values('test_accuracy').round(2).tail(10).T

In [None]:
# FIXME: both implementations look incorrect because pooling should not add additional dimensions/sequence elements to the encoding

# .Compute the shape of the CNN output (the number of the output encoding vector dimensions)


def conv1_output_encoding_size(self, embedding_size, kernel_lengths, stride, output_channels):
    """ Calculate the number of encoding dimensions output from CNN layers

    Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
    Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1

    source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
    """
    out_pool_total = 0
    for kernel_len in kernel_lengths:
        out_conv = (
            (embedding_size - 1 * (kernel_len - 1) - 1) / stride) + 1
        out_conv = math.floor(out_conv)
        out_pool = ((out_conv - 1 * (kernel_len - 1) - 1) / stride) + 1
        out_pool = math.floor(out_pool)
        out_pool_total += out_pool

    # return the len of a "flattened" vector that is passed into a fully connected (Linear) layer
    return out_pool_total



In [None]:

def compute_output_seq_len(input_seq_len, kernel_lengths, stride):
    """ Calculate the number of encoding dimensions output from CNN layers

    From PyTorch docs:
      L_out = 1 + (L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride
    But padding=0 and dilation=1, because we're only doing a 'valid' convolution.
    So:
      L_out = 1 + (L_in - (kernel_size - 1) - 1) // stride

    source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
    """
    out_pool_total = 0
    for kernel_len in kernel_lengths:
        out_conv = (
            (input_seq_len - 1 * (kernel_len - 1) - 1) / stride) + 1
        out_conv = math.floor(out_conv)
        out_pool = ((out_conv - 1 * (kernel_len - 1) - 1) / stride) + 1
        out_pool = math.floor(out_pool)
        out_pool_total += out_pool

    # return the len of a "flattened" vector that is passed into a fully connected (Linear) layer
    return out_pool_total


In [None]:
import json
import pandas as pd
from pathlib import Path

pd.options.display.max_columns = 100
pd.options.display.max_rows = 40


def best_hyperparms():
    paths = list((Path.home() / '.nlpia2-data' / 'log').glob('*'))
    df = []
    for p in paths:
        d = json.load(p.open())
        df.append({k: d.get(k) for k in d.keys() if k not in ('learning_curve', 'y_test', 'y_train')})
        df[-1]['filename'] = p.name[-12:-5]
    df = pd.DataFrame(df)
    df.sort_values('test_accuracy').round(2).tail(10).T


def learning_curve(filepath=Path.home() / '.nlpia2-data' / 'log' / 'disaster_tweets_cnn_pipeline_14728.json'):
    with Path(filepath).open() as fin:
        results = json.load(fin)

    curve = pd.DataFrame(results['learning_curve'],
                         columns=['loss', 'training_accuracy', 'test_accuracy'])
    return curve

In [None]:
curve = learning_curve()

accuracy = curve[['training_accuracy', 'test_accuracy']]
accuracy.plot(linewidth=3, xlabel='Epochs', ylabel='Accuracy')

In [None]:
savefig(ax, IMAGES_DIR / 'learning-curve-85-80.png')