In [1]:
import dataframe_image as dfi
import numpy as np
import pandas as pd
import spacy
import torch
from torch import nn

In [2]:
from pathlib import Path
from matplotlib import pyplot as plt

# Chapter identifier; used as the leaf directory name for this chapter's images.
CHAPTER = 'ch07'

# Image directory inside the manuscript repository.
BOOK_IMAGES_DIR = Path.home().joinpath(
    'code', 'tangibleai', 'nlpia-manuscript', 'manuscript', 'images', CHAPTER)

# Image directory inside the nlpia2 code repository.
CODE_IMAGES_DIR = Path.home().joinpath(
    'code', 'tangibleai', 'nlpia2', 'src', 'nlpia2', 'images', CHAPTER)

# Figures go to the code repository; create the whole directory tree if it is missing.
IMAGES_DIR = CODE_IMAGES_DIR
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

def savefig(ax, filename, **kwargs):
    """Save a figure as ``IMAGES_DIR / filename`` and return the ``savefig`` result.

    Args:
        ax: An object exposing ``.figure.savefig`` (e.g. a matplotlib Axes), an
            object exposing ``.savefig`` (e.g. a Figure), or a list/tuple of such
            objects, in which case only the first element is used. Anything else
            falls back to ``plt.savefig``.
        filename: File name, relative to ``IMAGES_DIR``, to write.
        **kwargs: Forwarded unchanged to whichever ``savefig`` ends up being called.

    Returns:
        Whatever the selected ``savefig`` call returns.
    """
    filepath = IMAGES_DIR / filename
    if isinstance(ax, (list, tuple)):
        # Several axes were passed; the first one is enough to reach the figure.
        ax = ax[0]
    if hasattr(ax, 'figure'):
        return ax.figure.savefig(filepath, **kwargs)
    if hasattr(ax, 'savefig'):
        return ax.savefig(filepath, **kwargs)
    # Fallback to pyplot; forward kwargs here as well so options such as
    # dpi or bbox_inches are not silently dropped.
    return plt.savefig(filepath, **kwargs)

In [3]:
# Hidden data directory located directly under the user's home directory.
HOME_DATA_DIR = Path.home().joinpath('.nlpia2-data')

In [4]:
# Load the spaCy pipeline 'en_core_web_md'; its `pos_` token attribute is what
# the quote-tagging cell below compares against the POS tag names.
nlp = spacy.load('en_core_web_md')

In [5]:
# Limit wide frames to 11 displayed columns so the printed table is elided with "...".
pd.set_option('display.max_columns', 11)

In [6]:
quote = "The right word may be effective, but no word was ever as effective as a rightly timed pause."
tags = ['ADV', 'ADJ', 'VERB', 'NOUN']

# One record per token: the token text followed by a 0/1 indicator for each tag,
# set to 1 when the token's spaCy part of speech equals that tag.
tagged_words = []
for tok in nlp(quote):
    indicators = [int(tok.pos_ == tag) for tag in tags]
    tagged_words.append([tok.text, *indicators])

# Transpose so that tokens run across the columns and each tag becomes a row.
df = pd.DataFrame(tagged_words, columns=['token', *tags]).T
print(df)

        0      1     2    3   4   ... 15       16     17     18 19
token  The  right  word  may  be  ...  a  rightly  timed  pause  .
ADV      0      0     0    0   0  ...  0        1      0      0  0
ADJ      0      1     0    0   0  ...  0        0      0      0  0
VERB     0      0     0    0   0  ...  0        0      1      0  0
NOUN     0      0     1    0   0  ...  0        0      0      1  0

[5 rows x 20 columns]


In [7]:
# Raise the display limit to 20 columns so every token column is shown.
pd.set_option('display.max_columns', 20)

In [8]:
# Rich display of the transposed table: one column per token, one row per POS tag.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
token,The,right,word,may,be,effective,",",but,no,word,was,ever,as,effective,as,a,rightly,timed,pause,.
ADV,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
ADJ,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
VERB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
NOUN,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [9]:
# Drop the text row so that only numeric rows remain, then view them as floats.
df.drop(index='token').astype(float)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
ADV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ADJ,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
VERB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
NOUN,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Export the POS table as a PNG image using the matplotlib table renderer.
filepath = IMAGES_DIR.joinpath('conv1d-pos-rightly-timed-pause.df.png')
dfi.export(
    df.fillna(''),
    filepath,
    max_rows=7,
    max_cols=10,
    table_conversion='matplotlib',
)

findfont: Font family ['Helvetica'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Helvetica'] not found. Falling back to DejaVu Sans.


In [11]:
# .Convert a DataFrame to a tensor with the correct size
# [source,python]
# ----
# Rows are selected by tag label rather than by position (`iloc[1:]`) so that
# re-running this cell after extra rows have been appended to `df` still
# produces exactly one channel per POS tag.
x = torch.tensor(df.loc[tags].astype(float).values, dtype=torch.float32)  # <1>
x = x.unsqueeze(0)                                # <2>
print(x.shape)
# ----
# <1> you can use any floating point `dtype` as long as you are consistent for the entire CNN
# <2> insert a new 0th dimension with a size of 1 for a batch with only 1 example sentence



torch.Size([1, 4, 20])


In [12]:
# Hand-written filter weights: one row per POS tag (in `tags` order) and one
# column per position within the 3-token window.
kernel_weights = [
    [1, 0, 0],
    [0, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
]
kernel = pd.DataFrame(kernel_weights, index=tags)
print(kernel)

      0  1  2
ADV   1  0  0
ADJ   0  0  0
VERB  0  1  0
NOUN  0  0  1


In [13]:
# Rich display of the kernel table (rows are POS tags, columns are window positions).
kernel

Unnamed: 0,0,1,2
ADV,1,0,0
ADJ,0,0,0
VERB,0,1,0
NOUN,0,0,1


In [14]:
# Turn the kernel table's underlying array into a float32 tensor with the same layout.
kernel = torch.tensor(kernel.to_numpy(), dtype=torch.float32)
print(kernel)

tensor([[1., 0., 0.],
        [0., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [15]:
# Prepend a size-1 dimension so the kernel has the 3-D layout shown in the printout.
kernel = torch.unsqueeze(kernel, dim=0)
print(kernel)

tensor([[[1., 0., 0.],
         [0., 0., 0.],
         [0., 1., 0.],
         [0., 0., 1.]]])


In [16]:
# Single-filter 1-D convolution over the 4 POS channels with a window of 3 tokens.
# The bias term is disabled so the layer's only parameter is the weight loaded below.
conv = nn.Conv1d(in_channels=4, out_channels=1, kernel_size=3, bias=False)
# Replace the default-initialised weight with the hand-written kernel.
conv.load_state_dict({'weight': kernel})
conv.weight

Parameter containing:
tensor([[[1., 0., 0.],
         [0., 0., 0.],
         [0., 1., 0.],
         [0., 0., 1.]]], requires_grad=True)

In [17]:
# Apply the convolution by calling the module itself (the supported entry point,
# which also runs any registered hooks) instead of invoking `.forward()` directly.
# `detach()` drops the autograd graph so the result can be converted to NumPy.
z = conv(x).detach().numpy().squeeze()
# The Series is aligned on df's integer column labels, so the trailing columns
# that have no convolution output are left empty (NaN) in the new 'z' row.
df.loc['z'] = pd.Series(z)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
token,The,right,word,may,be,effective,",",but,no,word,was,ever,as,effective,as,a,rightly,timed,pause,.
ADV,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
ADJ,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
VERB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
NOUN,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
z,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,3,0,,


In [18]:
# Max pooling over windows of 3 values that step by 3, i.e. the windows do not overlap.
POOL_WIDTH = 3
pooler = nn.MaxPool1d(kernel_size=POOL_WIDTH, stride=POOL_WIDTH)

In [19]:
# Give the 1-D convolution output a leading size-1 dimension before pooling.
z_channels = torch.unsqueeze(torch.tensor(z), dim=0)
y = pooler(z_channels)
print(y.shape)
y

torch.Size([1, 6])


tensor([[1., 0., 1., 1., 1., 3.]])

In [20]:
# Add the pooled values as a 'y' row; columns beyond the pooled length stay empty (NaN).
y_values = y.detach().squeeze().numpy()
df.loc['y'] = pd.Series(y_values)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
token,The,right,word,may,be,effective,",",but,no,word,was,ever,as,effective,as,a,rightly,timed,pause,.
ADV,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0
ADJ,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
VERB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
NOUN,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
z,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,3,0,,
y,1,0,1,1,1,3,,,,,,,,,,,,,,
