# Installing skorch and loading libraries

In [1]:
import subprocess

# Installation on Google Colab
try:
    import google.colab
    subprocess.run(['python', '-m', 'pip', 'install', 'skorch'])
except ImportError:
    pass

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

# set seed for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

## Training a classifier and making predictions

In [3]:
# download dataset
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:01<00:00, 36.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:01<00:00, 55.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 139MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 118MB/s]


In [4]:
with open(f'x_train.txt') as f:
    x_train = f.read().splitlines()
with open(f'y_train.txt') as f:
    y_train = f.read().splitlines()
with open(f'x_test.txt') as f:
    x_test = f.read().splitlines()
with open(f'y_test.txt') as f:
    y_test = f.read().splitlines()

In [5]:
# combine x_train and y_train into one dataframe
train_df = pd.DataFrame({'text': x_train, 'label': y_train})
# write train_df to csv with tab as separator
train_df.to_csv('train_df.csv', index=False, sep='\t')
# comibne x_test and y_test into one dataframe
test_df = pd.DataFrame({'text': x_test, 'label': y_test})
# inspect the first 5 items in the train split
train_df.head()

Unnamed: 0,text,label
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai
3,"Après lo cort periòde d'establiment a Basilèa,...",oci
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha


### Data preparation

Prepare your dataset for this experiment using the same method as you did in part 1.

Get a subset of the train/test data that includes 20 languages. Include English, German, Dutch, Danish, Swedish, Norwegian, and Japanese, plus 13 additional languages of your choice based on the items in the list of labels.

Don't forget to encode your labels using the adjusted code snippet from part 1!


In [6]:
# TODO: Create your train/test subsets of languages
# Note, make sure these are the same as what you used in Part 1!

In [7]:
# randomly sample 20 different target labels and create subsets of train and test dataframes with only those labels
given_labels = set(['eng', 'deu', 'nld', 'dan', 'swe', 'nno', 'jpn'])
all_labels = set(train_df['label'].unique())
target_labels = list(np.random.choice(list(all_labels - given_labels), 13, replace=False))
target_labels += given_labels
train_df = train_df[train_df['label'].isin(target_labels)]
test_df = test_df[test_df['label'].isin(target_labels)]

In [8]:
print(train_df['label'].value_counts())
print(test_df['label'].value_counts())
print(len(set(train_df['text'].unique()).intersection(set(test_df['text'].unique()))))

label
swe    500
glk    500
chr    500
nno    500
olo    500
pap    500
afr    500
chv    500
que    500
eng    500
lit    500
ltg    500
bar    500
csb    500
slv    500
dan    500
arg    500
jpn    500
nld    500
deu    500
Name: count, dtype: int64
label
nld    500
glk    500
lit    500
deu    500
dan    500
swe    500
pap    500
nno    500
ltg    500
afr    500
arg    500
que    500
chr    500
slv    500
eng    500
csb    500
chv    500
jpn    500
bar    500
olo    500
Name: count, dtype: int64
52


In [9]:
x_train, y_train = train_df['text'].to_numpy(), train_df['label'].to_numpy()
x_test, y_test = test_df['text'].to_numpy(), test_df['label'].to_numpy()
print(x_train[:5])
print(y_train[:5])

['Sebes, Joseph; Pereira Thomas (1961) (på eng). The Jesuits and the Sino-Russian treaty of Nerchinsk (1689): the diary of Thomas Pereira. Bibliotheca Instituti historici S. I., 99-0105377-3 ; 18. Rome. Libris 677492'
 'ريچارد رايت سالˇ ۱۹۷۹ˇ ميئن به علت شخص مۊشکلات واترزˇ أمرأ، جرگهٰ ترک بؤده. واترزأ ني سالˇ ۱۹۸۵ˇ ميئن جرگه جي سيوا بۊبؤ تا گيلمؤر؛ پينک فلؤيدˇ رهبر ببي.گيلمؤر، واترزˇ سيوايي پسي؛ نيک ميسنˇ أمرأ بر بقاى پينک فلؤيد اصرار بورزسته گه هي اصرار مؤجبˇ تخاصؤمˇ دئباخي اعضا ؤ واترز بۊبؤ. سراخر، واترز اکراهˇ أمرأ بر کار گۊدنˇ پينک فلؤيد بي اينˇ حؤضۊر رضايت بدأ ؤ واترزˇ سيوايي پسي اي جرگه به رهبري گيلمؤر دۊته آلبۊم بيرۊن هدأ به نؤم آني لغزش (۱۹۸۷) ؤ سيوايي ناقۊس (۱۹۹۴). ۲۰۰۵ ژۊئيه ما ميئن، دۊ دهه پسي، جرگه أعضا (گيلمؤر، واترز، رايت ؤ ميسن) خيرخاهانه اجرا ويسن لايو ۸ˇ ميئن گردأيتن. سيد برت سالˇ ۲۰۰۶ ؤ ريچارد رايت سالˇ ۲۰۰۸ˇ ميئن بمۊردن گه ايطؤرى اي اجرا؛ آخري جرگه اجرا تمامˇ جرگه أعضا أمرأ بۊ.سالˇ ۲۰۰۶ˇ ميئن، گيلمؤر يته مۊصاحبه لا رپۊبليکا ايتاليايي رۊزنؤمه أمرأ أنجؤم بدأ ؤ اعلام بؤ

In [10]:
# TODO: Use your adjusted code from part 1 to encode the labels again
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test = label_encoder.transform(y_train), label_encoder.transform(y_test)
print(label_encoder.classes_)
print(y_train)
print(y_test)

['afr' 'arg' 'bar' 'chr' 'chv' 'csb' 'dan' 'deu' 'eng' 'glk' 'jpn' 'lit'
 'ltg' 'nld' 'nno' 'olo' 'pap' 'que' 'slv' 'swe']
[19  9 13 ... 18  8  6]
[13  9  2 ...  1  6  9]


### Feature Extraction

In [12]:
# First, we extract some simple features as input for the neural network
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=100, binary=True)
X = vectorizer.fit_transform(x_train)

In [13]:
# We need to change the datatype to make it play nice with pytorch
X = X.astype(np.float32)
y = y_train.astype(np.int64)

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [14]:
X.shape

(10000, 100)

In [15]:
# TODO: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.

class ClassifierModule(nn.Module):
    def __init__(
        self,
        num_units=200,
        nonlin=F.relu,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(100, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, 2)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X.squeeze(dim=1)

In [16]:
class ClassifierModule(nn.Module):
    def __init__(
        self,
        input_size=600,
        num_units=200,
        num_classes=2,
        nonlin=F.relu,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(input_size, num_units)
        self.nonlin = nonlin
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, num_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.nonlin(self.dense1(X))
        X = self.output(X)
        return X.squeeze(dim=1)

In [17]:
net = NeuralNetClassifier(
    ClassifierModule(
        input_size=X.shape[1],
        num_units=200,
        num_classes=len(label_encoder.classes_),
        nonlin=F.relu,
    ),
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    device='cuda',  # comment this to train with CPU
)

In [18]:
net.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.8951[0m       [32m0.1320[0m        [35m2.7631[0m  1.2529
      2        [36m2.5176[0m       [32m0.3850[0m        [35m2.1906[0m  0.8595
      3        [36m1.8043[0m       [32m0.4795[0m        [35m1.5852[0m  0.8454
      4        [36m1.2604[0m       [32m0.6110[0m        [35m1.2050[0m  0.8091
      5        [36m0.9603[0m       [32m0.6650[0m        [35m1.0417[0m  1.1161
      6        [36m0.8210[0m       [32m0.7285[0m        [35m0.9259[0m  1.2763
      7        [36m0.7364[0m       [32m0.7610[0m        [35m0.8248[0m  1.3538
      8        [36m0.6717[0m       [32m0.7830[0m        [35m0.7497[0m  0.8053
      9        [36m0.6189[0m       [32m0.7970[0m        [35m0.6942[0m  0.8257
     10        [36m0.5765[0m       [32m0.8060[0m        [35m0.6487[0m  0.8247
     11        [36m0.5426[0m       [32m0.80

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=100, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=20, bias=True)
  ),
)

Note, you can also use `GridSearchCV` with `skorch`, but be aware that training a neural network takes much more time.

Play around with 5 different sets of hyperparameters. For example, consider some of the following:

- layer sizes
- activation functions
- regularizers
- early stopping
- vectorizer parameters

Report your best hyperparameter combination.

🗒❓ What is the effect of your modifcations on validation performance? Discuss potential reasons.

☝ Note, during model development, if you run into the infamous CUDA out-of-memory (OOM) error, try clearing the GPU memory either with `torch.cuda.empty_cache()` or restarting the runtime.

In [22]:
torch.cuda.empty_cache()

class ClassifierModule(nn.Module):
    def __init__(
        self,
        input_size=600,
        num_units=200,
        num_classes=2,
        nonlin=F.relu,
        dropout=0.1,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin
        self.dropout = nn.Dropout(dropout)

        self.dense0 = nn.Linear(input_size, num_units)
        self.dense1 = nn.Linear(num_units, num_units)
        self.output = nn.Linear(num_units, num_classes)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.dropout(X)
        X = self.output(X)
        return X.squeeze(dim=1)

net = NeuralNetClassifier(
    ClassifierModule(
        input_size=X.shape[1],
        num_units=600,
        num_classes=len(label_encoder.classes_),
        nonlin=F.relu,
    ),
    max_epochs=20,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    device='cuda',  # comment this to train with CPU
)

net.fit(X, y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m2.7397[0m       [32m0.3405[0m        [35m2.3976[0m  1.2022
      2        [36m1.9196[0m       [32m0.5400[0m        [35m1.5632[0m  0.8200
      3        [36m1.2041[0m       [32m0.6355[0m        [35m1.1223[0m  0.8195
      4        [36m0.8804[0m       [32m0.7405[0m        [35m0.8611[0m  0.9320
      5        [36m0.7364[0m       [32m0.7945[0m        [35m0.7538[0m  1.1930
      6        [36m0.6442[0m       [32m0.8295[0m        [35m0.6413[0m  1.3295
      7        [36m0.5817[0m       [32m0.8395[0m        [35m0.5820[0m  1.1305
      8        [36m0.5358[0m       [32m0.8435[0m        [35m0.5590[0m  0.8099
      9        [36m0.5019[0m       [32m0.8560[0m        [35m0.5126[0m  0.8232
     10        [36m0.4681[0m       [32m0.8565[0m        [35m0.5092[0m  0.8101
     11        [36m0.4549[0m       [32m0.86

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dropout): Dropout(p=0.1, inplace=False)
    (dense0): Linear(in_features=100, out_features=600, bias=True)
    (dense1): Linear(in_features=600, out_features=600, bias=True)
    (output): Linear(in_features=600, out_features=20, bias=True)
  ),
)