In [47]:
import os, tarfile
from dstoolbox.transformers import Padder2d, TextFeaturizer
import numpy as np
from scipy import stats
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from skorch import NeuralNetClassifier
import torch
from torch import nn
import torch.nn.functional as F

np.random.seed(87)
torch.manual_seed(87)
torch.cuda.manual_seed(87)

Constants

In [48]:
VOCAB_SIZE = 1000
MAX_LEN = 50
USE_MPS = torch.backends.mps.is_available()
NUM_CV_STEPS = 10

Data Preparation

In [49]:
data_dir = "/Users/pepijnschouten/Desktop/Python_Scripts/" \
    "Python_scripts_Varia/Deep_Learning/Skorch/sentiment_" \
        "prediction_IMDB/data"

if not os.path.exists(os.path.join(data_dir, 'aclImdb')):
    with tarfile.open(
        os.path.join(data_dir, 'aclImdb_v1.tar'),'r') as f:
        f.extractall(os.path.join(data_dir, 'aclImdb'))



Data Preprocessing

In [50]:
train_dir = "/Users/pepijnschouten/Desktop/Python_Scripts/" \
    "Python_scripts_Varia/Deep_Learning/Skorch/sentiment_" \
        "prediction_IMDB/data/aclImdb/aclImdb/train/"
dataset = load_files(train_dir,
                     categories=['pos', 'neg'])

print(dataset.keys())

X, y = dataset.data, dataset.target
X = np.array([x.decode() for x in X])

print(X.shape, y.shape)

for text, target in zip(X[:3], y):
    print(f'Target: {dataset['target_names'][target]}')
    print(text)
    print()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
(25000,) (25000,)
Target: pos
Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty.

Target: neg
Words can't describe how bad this movie is. I can't explain it by writing only. You have too see it for yourself to get at g

Data Transformation

In [51]:
steps = [
    ('to_idx', TextFeaturizer(max_features=VOCAB_SIZE)),
    ('padder', Padder2d(max_len=MAX_LEN, pad_value=VOCAB_SIZE, dtype=int))
    
]

print(Pipeline(steps).fit_transform(X[:3]))

[[220  48 104 217 190 186  63 156 186 207 193  29 218 117 215  57 205 184
   54  43 129 173 199 169 181  39 102  35 205 128  19  26  27 120 133  23
   76 193  95 206  87  49 190 210  77  44  38  98 140 190]
 [213  33  52  94  18 187 124 101  33  67 102  32 216 137 217  87 191 163
  102  76 219 190  78  17  83 133  94  93 124 158  33  19 132 179 159 217
  190  57 179 183  14 170 115  40 119  12   8 142 130 185]
 [ 65 151 181 148 153 203  98 187 108 131 124  24  79 180  36 190 109 148
  133  90 105  56  31  62 195 157 179 205  88  85 201  81 190  19 103  16
   82 139 116  63  25 180 124 166 196 179 202 143 190 174]]


Create RNN Network

In [52]:
class RNNClassifier(nn.Module):
    def __init__(self,
                 embedding_dim=128,
                 rec_layers_type='lstm',
                 num_units=128,
                 num_layers=2,
                 dropout=0,
                 bidirectional=False,):
        super(RNNClassifier, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.rec_layers_type = rec_layers_type.upper()
        self.num_units = num_units
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        
        self.reset_weights()
        
    def reset_weights(self):
        self.emb = nn.Embedding(VOCAB_SIZE +1,
                                self.embedding_dim)
        
        rec_layer = getattr(nn, self.rec_layers_type)
        
        self.rec_layer = rec_layer(
            self.embedding_dim,
            self.num_units,
            num_layers=self.num_layers,
            dropout=self.dropout,
            bidirectional=self.bidirectional,
            batch_first=True
        )
        
        self.drop = nn.Dropout(self.dropout)
        self.output = nn.Linear(
            self.num_units * 2 if self.bidirectional else self.num_units,
            2
        )
        
    def forward(self, X):
        embeddings = self.emb(X)
        if self.rec_layers_type == 'GRU':
            _, rec_out = self.rec_layer(embeddings)
        else:
            _, (rec_out, _) = self.rec_layer(embeddings)
            
        rec_out = rec_out[-1]
        
        drop = self.drop(rec_out)
        output = self.output(drop)
        
        return F.softmax(output, dim=--1)
    
net = NeuralNetClassifier(
    RNNClassifier,
    device = 'mps' if USE_MPS else 'cpu',
    max_epochs=5,
    lr=0.01,
    optimizer=torch.optim.RMSprop,
)

pipe = Pipeline(steps + [('net', net)])

Train Model

In [53]:
pipe.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.8221[0m       [32m0.5000[0m        [35m0.6970[0m  3.6767
      2        [36m0.7219[0m       0.5000        0.6979  3.6119
      3        [36m0.7215[0m       0.5000        0.7081  3.5299
      4        [36m0.7111[0m       [32m0.6692[0m        [35m0.6215[0m  3.5426
      5        [36m0.6042[0m       [32m0.7164[0m        [35m0.5643[0m  3.5456


Randomized Grid Search

In [54]:
pipe.set_params(net__verbose=0,
                net__train_split=None,)

params = {
    'to_idx__stop_words': ['english', None],
    'to_idx__lowercase': [False, True],
    'to_idx__ngram_range': [(1, 1), (2, 2)],
    'net__module__embedding_dim': stats.randint(32, 256 + 1),
    'net__module__rec_layers_type': ['gru', 'lstm'],
    'net__module__num_units': stats.randint(32, 256 + 1),
    'net__module__num_layers': [1, 2, 3],
    'net__module__dropout': stats.uniform(0, 0.9),
    'net__module__bidirectional': [True, False],
    'net__lr': [10**(-stats.uniform(1, 5).rvs()) for _ in range(NUM_CV_STEPS)],
    'net__max_epochs': [5, 10],
}

search = RandomizedSearchCV(
    pipe,
    n_iter=NUM_CV_STEPS,
    param_distributions=params,
    verbose=1,
    cv=3,
    random_state=87,
    refit=False,
    scoring='accuracy'
)

search.fit(X[:500], y[:500])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


9 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/pytorch/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/pytorch/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/pytorch/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/miniconda3/envs

In [55]:
print(search.best_score_, search.best_params_)

0.5280403049323041 {'net__lr': 0.006255692908541452, 'net__max_epochs': 10, 'net__module__bidirectional': False, 'net__module__dropout': 0.4358517385563876, 'net__module__embedding_dim': 80, 'net__module__num_layers': 2, 'net__module__num_units': 141, 'net__module__rec_layers_type': 'lstm', 'to_idx__lowercase': True, 'to_idx__ngram_range': (1, 1), 'to_idx__stop_words': 'english'}
