# Open Folder

In [None]:
# Mount Google Drive into the Colab runtime so the files below are reachable by path.
from google.colab import drive
drive.mount('/content/drive')
# Drive folder that holds this notebook's files.
SAVEFOLDER = '/content/drive/MyDrive/data'

# Location and file name for a saved model; not read by the cells visible in this
# part of the notebook.
MODELPATH = SAVEFOLDER
MODELFILENAME = 'cnnmodel.pth'
#downloaded = drive.CreateFile({'id':id}) 
#downloaded.GetContentFile('SKUdata.csv') 
# CSV file that getDataset() is called with further down.
DATAPATH = '/content/drive/MyDrive/data/Test6_REF.csv'

Mounted at /content/drive


## Pre-processing the data

* <b>Description splitting:</b>&nbsp;&nbsp;&nbsp;&nbsp;We are interested in modeling SKU descriptions, rather than longer chunks of text such as paragraphs or documents. The data comes as one description per row, so we split each description at space characters. We also remove any multiple spacing.

* <b>Full description markers:</b>&nbsp;&nbsp;&nbsp;&nbsp;For both training and testing corpora, each description must be surrounded by a start-of-sentence (`<s>`) and end-of-sentence marker (`</s>`). These markers will allow the model to generate descriptions of appropriate length.

* <b>Unknown tokens:</b>&nbsp;&nbsp;&nbsp;&nbsp;In order to deal with unknown tokens in the test corpora, all tokens that do not appear in the vocabulary must be replaced with a special token for unknown words (`<UNK>`).


In [None]:
START = "<s>"   # Start-of-sentence token
END = "</s>"    # End-of-sentence-token
UNK = "<UNK>"   # Unknown word token
# Punctuation that marks end of sentence ('.', '?', '!'). It is left empty here,
# so the `word in PUNC` test in preprocess() never succeeds and a row is never
# split into several sentences.
PUNC = {}

In [None]:
import torchtext
import random
import pandas
from google.colab import drive

def preprocess(data, vocab=None):
    """Turn raw description strings into upper-cased token lists wrapped in START/END.

    Args:
        data: Iterable of rows. Anything that is not a string (for example the
            NaN pandas yields for a blank cell) is skipped.
        vocab: Optional collection of known tokens. When supplied, every token
            that is not in it (as written or upper-cased) is replaced by UNK
            and the collection is returned untouched. When None, a new
            vocabulary set is collected from the processed rows.

    Returns:
        A ``(sentences, vocab)`` tuple. ``sentences`` is a list of token lists,
        each of the form ``[START, w1, ..., wn, END]``.
    """
    lowercase = "abcdefghijklmnopqrstuvwxyz"
    build_vocab = vocab is None
    final_data = []
    for row in data:
        if not isinstance(row, str):
            continue  # filter blanks
        # Whitespace tokenization; str.split() also collapses repeated spaces.
        tokens = [UNK if tok == '<unk>' else tok for tok in row.split()]
        if vocab is not None:
            # The vocabulary holds the upper-cased forms emitted below, so a raw
            # token has to be looked up upper-cased as well; otherwise every
            # lower-case token would be turned into UNK.
            tokens = [tok if (tok in vocab or tok.upper() in vocab) else UNK
                      for tok in tokens]
        # Rows with two or more bare '=' tokens are dropped (presumably heading
        # markup left over from a text corpus -- confirm).
        if not tokens or tokens.count('=') >= 2:
            continue
        sen = []
        prev_punct, prev_quot = False, False
        for word in tokens:
            if prev_quot:  # Sentences can end in double quote
                if word[0] not in lowercase:  # a new sentence begins after the quote
                    final_data.append(sen)
                    sen = []
                    prev_punct, prev_quot = False, False
            if prev_punct:
                if word == '"':
                    prev_punct, prev_quot = False, True
                elif word[0] not in lowercase:  # a new sentence begins after PUNC
                    final_data.append(sen)
                    sen = []
                    prev_punct, prev_quot = False, False
            if word in PUNC:
                prev_punct = True
            sen.append(word.upper())
        final_data.append(sen)

    if build_vocab:
        # A set literal is required here: set(UNK) would split the marker into
        # its individual characters.
        vocab = {UNK}
    for i, sen in enumerate(final_data):
        final_data[i] = [START] + sen + [END]  # apply beginning and end markers
        if build_vocab:
            vocab.update(final_data[i])
    return final_data, vocab

def getDataset(path):
    """Load the SKU CSV at `path` and return the preprocessed train and test sets.

    The file's header row is replaced by the column names
    'Code', 'Name', 'PRODHA' and 'dataset'. Rows are assigned to a split by the
    value of the 'dataset' column ('train' or 'test'); other values are ignored.

    Args:
        path: Path (or any source pandas.read_csv accepts) of the CSV file.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of token-list lists as
        produced by preprocess(). The test set is mapped onto the vocabulary
        collected from the training set.
    """
    filedata = pandas.read_csv(path,
                               header=0,
                               names=['Code', 'Name', 'PRODHA', 'dataset'],
                               index_col='Code')

    train_rows = filedata[filedata['dataset'] == 'train']
    test_rows = filedata[filedata['dataset'] == 'test']

    train_dataset, vocab = preprocess(train_rows['Name'])
    # Pass the 'Name' column, not the frame itself: iterating a DataFrame
    # yields its column labels instead of the descriptions.
    test_dataset, _ = preprocess(test_rows['Name'], vocab)

    return train_dataset, test_dataset

def getAuth():
  """Authenticate the Colab user for PyDrive and print the id of a shared file.

  Installs PyDrive, authenticates the current Colab user, builds a GoogleDrive
  client, and prints the id parsed from a hard-coded share link. The client is
  neither used nor returned; the function always returns None.
  """
  # Code to read csv file into Colaboratory:

  # `!` is IPython shell syntax, so this function is only valid inside a notebook cell.
  !pip install -U -q PyDrive
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials

  # Authenticate and create the PyDrive client.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  # Local name only: it shadows the module-level `drive` import inside this function.
  drive = GoogleDrive(gauth)

  link = 'https://drive.google.com/open?id=1DPZZQ43w8brRhbEMolgLqOWKbZbE-IQu' # The shareable link

  # The link holds exactly one '=', so the split yields (prefix, file id).
  # Note that `id` shadows the builtin of the same name within this function.
  fluff, id = link.split('=')
  print (id) # Verify that you have everything after '='
  return None

# Load both splits once; runModel() further down reads these two globals.
train_dataset, test_dataset = getDataset(DATAPATH)


Test the loaded data.

In [None]:
if __name__ == '__main__':
    # Spot-check the loader by printing 10 randomly chosen training descriptions.
    for x in random.sample(train_dataset, 10):
        print (x)

['<s>', '430-37', 'CN1QT', 'PHTHALO', 'BLUE', 'FUL-BASE', '</s>']
['<s>', '51910033053', 'CN1LT', '2K', 'HS', 'HARDENER', 'NORMAL', '</s>']
['<s>', 'CH6115', 'CN1LT', 'CHALLENGER', 'OCEAN', 'BLUE', '</s>']
['<s>', '1058S', 'BO50GR', 'GOLD', 'PEARL', '</s>']
['<s>', 'WB05', 'CN1L', 'CROMAX', 'PRO', 'JET', 'BLACK', '</s>']
['<s>', '02017159', 'CN0.5LT', 'STBLU', 'MIX159', 'DK', 'YEL', '</s>']
['<s>', '40', 'CN1QT', 'LMB', '1:1', 'DTM', 'EPOXY', 'PRIMER', '</s>']
['<s>', '02017120', 'CN0.5LT', 'STBLU', 'MIX120', 'BLU', 'PRL', '</s>']
['<s>', 'XP53', 'CN1PT', 'RED', 'ORANGE', '</s>']
['<s>', '29100003', 'CN1LT', 'PS', '2K', 'TRANSP', 'BLUE', 'MIDCOAT', '</s>']


## The LanguageModel Class

Below are 4 types of language models: a <b>unigram</b> model, a <b>smoothed unigram</b> model, a <b>bigram</b> model, and a <b>smoothed bigram</b> model. 

* <b>`__init__(self, trainCorpus)`</b>: This trains the language model on `trainCorpus`, calculating relative frequency estimates according to the type of model.

* <b>`generateSentence(self)`</b>: Returns a product description that is generated by the language model. Of the form <TT>[&lt;s&gt;, w<sup>(1)</sup>, ..., w<sup>(n)</sup>, &lt;&sol;s&gt;]</TT>, where each <TT>w<sup>(i)</sup></TT> is a token in the vocabulary (including <TT>&lt;UNK&gt;</TT> but excluding <TT>&lt;s&gt;</TT> and <TT>&lt;&sol;s&gt;</TT>).  <TT>&lt;s&gt;</TT> starts each description (with probability $1$). The following words <TT>w<sup>(1)</sup></TT>, ... , <TT>w<sup>(n)</sup></TT>, <TT>&lt;&sol;s&gt;</TT> are generated according to the language model's distribution. Note that the number of tokens <TT>n</TT> is not fixed; instead, the description is finished as soon as the stop token <TT>&lt;&sol;s&gt;</TT> is generated.

* <b>`getSentenceLogProbability(self, sentence)`</b>:  Returns the <em>logarithm of the probability</em> of <TT>sentence</TT>, which is again a list of the form <TT>[&lt;s&gt;, w<sup>(1)</sup>, ..., w<sup>(n)</sup>, &lt;&sol;s&gt;]</TT>. The calculation is done in log space, which is safer and quicker because it avoids underflow. This uses the base-<em>e</em> logarithm. 

* <b>`getCorpusPerplexity(self, testCorpus)`</b>:  Returns the perplexity of `testCorpus` according to the model, i.e. the inverse probability of the corpus normalized by the number of tokens. For a corpus $W$ with $N$ tokens and a bigram model, Jurafsky and Martin (2020) define perplexity as follows: 

$$Perplexity(W) = \Big [ \prod_{i=1}^N \frac{1}{P(w^{(i)}|w^{(i-1)})} \Big ]^{1/N}$$

Instead of multiplying probabilities, we add the logarithms of the probabilities and exponentiate the result:

$$\prod_{i=1}^N P(w^{(i)}|w^{(i-1)}) = \exp\Big (\sum_{i=1}^N \log P(w^{(i)}|w^{(i-1)}) \Big ) $$

In [None]:
import math
import random
from collections import defaultdict

class LanguageModel(object):
    """Common interface of the n-gram language models in this notebook.

    Subclasses train in __init__ and implement generateSentence,
    getSentenceLogProbability and getCorpusPerplexity; printSentences is
    shared and built on the first two.
    """

    def __init__(self, trainCorpus):
        '''
        Initialize and train the model (i.e. estimate the model's underlying probability
        distribution from the training corpus.)

        The base class keeps no state; trainCorpus is ignored here.
        '''

    def generateSentence(self):
        '''
        Generate a sentence by drawing words according to the model's probability distribution.
        Note: Think about how to set the length of the sentence in a principled way.

        Raises NotImplementedError unless overridden.
        '''
        raise NotImplementedError("Implement generateSentence in each subclass.")

    def getSentenceLogProbability(self, sentence):
        '''
        Calculate the log probability of the sentence provided.

        Raises NotImplementedError unless overridden.
        '''
        # The message names the real method; it previously pointed at a
        # non-existent getSentenceProbability.
        raise NotImplementedError("Implement getSentenceLogProbability in each subclass.")

    def getCorpusPerplexity(self, testCorpus):
        '''
        Calculate the perplexity of the corpus provided.

        Raises NotImplementedError unless overridden.
        '''
        raise NotImplementedError("Implement getCorpusPerplexity in each subclass.")

    def printSentences(self, n):
        '''
        Prints n sentences generated by the model, each with its log probability.
        '''
        for _ in range(n):
            sent = self.generateSentence()
            prob = self.getSentenceLogProbability(sent)
            print('Log Probability:', prob, '\tSentence:', sent)

## Unigram Model

The <b>unsmoothed unigram</b> model. The probability distribution of a token is given by $\hat P(w)$.

In [None]:
from collections import Counter
class UnigramModel(LanguageModel):
    """Unsmoothed unigram model: each token is drawn independently with its
    relative frequency in the training corpus.

    Attributes:
        c: Counter of token frequencies (START is not counted).
        totalcount: Sum of all counts in `c`.
        wheel: dict mapping 1..totalcount to tokens; a token owns as many
            slots as its count, so a uniform slot draw samples the model.
        logprobs: dict mapping each token to the natural log of its relative
            frequency.
    """

    def __init__(self, trainCorpus):
        """Count the tokens of trainCorpus (a list of token lists) and derive
        the sampling wheel and the log-probability table."""
        self.flatCorp = []

        # Count all uni-grams including UNK but not <s>
        self.c = self.getCount(trainCorpus)

        # Create the wheel of fortune: Counter.elements() repeats every token
        # `count` times in first-seen order, which is exactly the slot layout.
        self.totalcount = sum(self.c.values())
        self.wheel = dict(enumerate(self.c.elements(), start=1))

        # Keep the log-probabilities in their own dict. Aliasing self.c here
        # would overwrite the counts and make unseen words look up as 0.
        self.logprobs = {word: math.log(count / self.totalcount)
                         for word, count in self.c.items()}

    def generateSentence(self):
        """Draw tokens until END comes up; returns [START, w1, ..., END]."""
        sent = [START]
        newword = '0'
        while newword != END:
            newword = self.generateWord()
            sent.append(newword)
        return sent

    def _wordLogProb(self, word):
        """Log-probability of one token: its own entry, else UNK's entry, else
        -inf (an unsmoothed model gives unseen events probability zero)."""
        if word in self.logprobs:
            return self.logprobs[word]
        return self.logprobs.get(UNK, float('-inf'))

    def getSentenceLogProbability(self, sentence):
        """Sum of the token log-probabilities of `sentence`, skipping START."""
        return sum([self._wordLogProb(word) for word in sentence if word != START])

    def getCorpusPerplexity(self, testCorpus):
        """Perplexity of testCorpus (a list of token lists).

        START tokens are excluded from both the probability sum and the token
        count N. Returns None when testCorpus is None, and inf when some token
        has probability zero under the model.
        """
        if testCorpus is None:
            return
        self.flatCorpforprobs = [word for sentence in testCorpus for word in sentence]
        self.flatCorpforN = [word for word in self.flatCorpforprobs if word != START]
        N = len(self.flatCorpforN)  # total number of scored tokens
        # Every scored token contributes; tokens missing from the model are not
        # dropped from the sum while still being counted in N.
        logprobs = [self._wordLogProb(word) for word in self.flatCorpforN]
        return math.exp((-1 / N) * sum(logprobs))

    def generateWord(self):
        """Sample one token by drawing a wheel slot uniformly at random."""
        roll = random.randrange(1, self.totalcount + 1)
        return self.wheel[roll]

    def getCount(self, corpus):
        """Return a Counter of all tokens in corpus, with START removed."""
        c = Counter(word for sent in corpus for word in sent)
        c.pop(START, None)
        return c
### Scratch cell: trains a unigram model on the full training set and pokes at it.
#twosent = train_dataset[1115:1117]
test = UnigramModel(train_dataset)
testsent = ['<s>', 'Sonic', 'was', 'difficult', '.', '</s>']
#test.logprobs['</s>']
# The perplexity is computed but not displayed, because it is not the cell's last expression.
test.getCorpusPerplexity(test_dataset)
# NOTE(review): iterating test.c yields the token strings, so the inner loop
# compares single characters with 0 and `a` is always empty. This looks like it
# was meant to search for zero counts -- confirm.
a = [count for words in test.c for count in words if count == 0]
a
#type(test.c)
#test.getSentenceLogProbability(testsent)
#test = UnigramModel(train_dataset)
#test.printSentences(1)
#test.getCorpusPerplexity(test_dataset)


[]

Here is a testing function that uses very simple training & test 
corpora.

In [None]:
def testModel(model_type):
    """Check one model class against reference values on a two-sentence toy corpus.

    Trains the model selected by `model_type` ('unigram', 'bigram',
    'smoothed-unigram' or 'smoothed-bigram') and prints PASSED/FAILED for
    generateSentence, getSentenceLogProbability and getCorpusPerplexity.
    Values are compared for exact equality after rounding to 10 decimals.
    Returns None.
    """
    assert model_type in {'unigram', 'bigram', 'smoothed-unigram', 'smoothed-bigram'}

    # Toy training and test corpora. Each is preprocessed on its own (no shared
    # vocabulary), so only the literal '<unk>' tokens become UNK.
    train_corpus = ["By the Late Classic , a network of <unk> ( <unk> ) linked various parts of the city , running for several kilometres through its urban core .",
    "Few people realize how difficult it was to create Sonic 's graphics engine , which allowed for the incredible rate of speed the game 's known for ."]
    test_corpus = ["Classic parts of the game allowed for <unk> incredible city .", 
                   "Few <unk> realize the difficult network , which linked the game to Sonic ."]
    train_corpus, _ = preprocess(train_corpus)
    test_corpus, _ = preprocess(test_corpus)
    sentence = preprocess(["Sonic was difficult ."])[0][0]

    # These are the correct answers (don't change them!)
    # senprobs is ordered like `sentences` below: the probe sentence, the two
    # training sentences, then the two test sentences.
    if model_type == "unigram":
       senprobs = [-18.9159206916, -106.714608418, -107.8132207067, -43.0623556464, -54.9560026056]
       trainPerp, testPerp = 40.3970060507, 37.7244929883
       model = UnigramModel(train_corpus)
    elif model_type == "smoothed-unigram":
       senprobs = [-18.8969788221, -107.3856946234, -108.078841804, -43.7800012747, -55.3816031464]
       trainPerp, testPerp = 41.0547195671, 39.356140682
       model = SmoothedUnigramModel(train_corpus)
    elif model_type == "bigram":
       senprobs = [-float('inf'), -10.3450917073, -9.2464794186, -float('inf'), -float('inf')]
       trainPerp, testPerp = 1.4018400696, float('inf')
       model = BigramModel(train_corpus)
    elif model_type == "smoothed-bigram":
       senprobs = [-16.1336514995, -84.9068097328, -84.6431458887, -39.3603940053, -51.4605809045]
       trainPerp, testPerp = 18.6021115212, 28.8970586024
       model = SmoothedBigramModelAD(train_corpus)       

    # Generation is random, so only the shape of the result is checked.
    print("--- TEST: generateSentence() ---")
    modelSen = model.generateSentence()
    senTestPassed = isinstance(modelSen, list) and len(modelSen) > 1 and isinstance(modelSen[0], str)
    if senTestPassed:
        print ("Test generateSentence() passed!")
    else:
        print ("Test generateSentence() failed; did not return a list of strings...")

    print("\n--- TEST: getSentenceLogProbability(...) ---")
    sentences = [sentence, *train_corpus, *test_corpus]
    failed = 0
    for i in range(len(sentences)):
        sen, correct_prob = sentences[i], senprobs[i]
        prob = round(model.getSentenceLogProbability(sen), 10)
        print("Correct log prob.:", correct_prob, '\tModel log prob.:', prob, '\t', 'PASSED' if prob == correct_prob else 'FAILED', '\t', sen)
        if prob != correct_prob: failed+=1

    if not failed:
        print ("Test getSentenceProbability(...) passed!")
    else:
        print("Test getSentenceProbability(...) failed on", failed, "sentence" if failed == 1 else 'sentences...')

    print("\n--- TEST: getCorpusPerplexity(...) ---")
    train_perp = round(model.getCorpusPerplexity(train_corpus), 10)
    test_perp = round(model.getCorpusPerplexity(test_corpus), 10)

    print("Correct train perp.:", trainPerp, '\tModel train perp.:', train_perp, '\t', 'PASSED' if trainPerp == train_perp else 'FAILED')
    print("Correct test perp.:", testPerp, '\tModel test perp.:', test_perp, '\t', 'PASSED' if testPerp == test_perp else 'FAILED')
    train_passed, test_passed = train_perp == trainPerp, test_perp == testPerp
    if train_passed and test_passed:
        print("Test getCorpusPerplexity(...) passed!")
    else:
        print("Test getCorpusPerplexity(...) failed on", "the training corpus and the testing corpus..." if not train_passed and not test_passed else "the testing corpus..." if not test_passed else "the training corpus...")

if __name__=='__main__':
    # Check the unsmoothed unigram model against the toy-corpus reference values.
    testModel('unigram')

--- TEST: generateSentence() ---
Test generateSentence() passed!

--- TEST: getSentenceLogProbability(...) ---
Correct log prob.: -18.9159206916 	Model log prob.: -18.9159206916 	 PASSED 	 ['<s>', 'SONIC', 'WAS', 'DIFFICULT', '.', '</s>']
Correct log prob.: -106.714608418 	Model log prob.: -106.714608418 	 PASSED 	 ['<s>', 'BY', 'THE', 'LATE', 'CLASSIC', ',', 'A', 'NETWORK', 'OF', '<UNK>', '(', '<UNK>', ')', 'LINKED', 'VARIOUS', 'PARTS', 'OF', 'THE', 'CITY', ',', 'RUNNING', 'FOR', 'SEVERAL', 'KILOMETRES', 'THROUGH', 'ITS', 'URBAN', 'CORE', '.', '</s>']
Correct log prob.: -107.8132207067 	Model log prob.: -107.8132207067 	 PASSED 	 ['<s>', 'FEW', 'PEOPLE', 'REALIZE', 'HOW', 'DIFFICULT', 'IT', 'WAS', 'TO', 'CREATE', 'SONIC', "'S", 'GRAPHICS', 'ENGINE', ',', 'WHICH', 'ALLOWED', 'FOR', 'THE', 'INCREDIBLE', 'RATE', 'OF', 'SPEED', 'THE', 'GAME', "'S", 'KNOWN', 'FOR', '.', '</s>']
Correct log prob.: -43.0623556464 	Model log prob.: -43.0623556464 	 PASSED 	 ['<s>', 'CLASSIC', 'PARTS', 'OF', '

Now, we can train the model on the full corpus, and evaluate it on the held-out test set.

In [None]:
def runModel(model_type):
    """Train the selected model on the full training set and report on it.

    Prints five generated sentences with their log probabilities, followed by
    the model's perplexity on the global train_dataset and test_dataset.

    model_type must be one of 'unigram', 'bigram', 'smoothed-unigram' or
    'smoothed-bigram'.
    """
    assert model_type in {'unigram', 'bigram', 'smoothed-unigram', 'smoothed-bigram'}

    # Each factory is a lambda so that a model class is only looked up when it
    # is actually selected; classes defined in later cells may not exist yet.
    factories = {
        'unigram': lambda: UnigramModel(train_dataset),
        'bigram': lambda: BigramModel(train_dataset),
        'smoothed-unigram': lambda: SmoothedUnigramModel(train_dataset),
        'smoothed-bigram': lambda: SmoothedBigramModelAD(train_dataset),
    }
    make_model = factories.get(model_type, factories['smoothed-bigram'])
    model = make_model()

    print("--------- 5 sentences from your model ---------")
    model.printSentences(5)

    print("\n--------- Corpus Perplexities ---------")
    train_perplexity = model.getCorpusPerplexity(train_dataset)
    print("Training Set:", train_perplexity)
    test_perplexity = model.getCorpusPerplexity(test_dataset)
    print("Testing Set:", test_perplexity)

if __name__=='__main__':
    # Train the unsmoothed unigram model on the full data and print samples and perplexities.
    runModel('unigram')

--------- 5 sentences from your model ---------
Log Probability: -87.59261245364787 	Sentence: ['<s>', 'RED', '02018151', 'ADDITIVE', 'METALUX', 'LV', 'CN1GA', 'CH5003', '1K', 'CN1QT', 'WHITE', 'BLACK', 'CN0.5LT', '410A', 'STBL', 'CN1GA', '</s>']
Log Probability: -58.24251061914554 	Sentence: ['<s>', '-', 'FILLER', '02016252', 'NCN1L', 'BO4Z', 'DUX', 'HS', 'PRL', '</s>']
Log Probability: -12.289536106928919 	Sentence: ['<s>', 'WHITE', 'PRL', '</s>']
Log Probability: -1.8922777104005823 	Sentence: ['<s>', '</s>']
Log Probability: -1.8922777104005823 	Sentence: ['<s>', '</s>']

--------- Corpus Perplexities ---------
Training Set: 362.9152094957221
Testing Set: 2.575745109575581


## Smoothed Unigram Model

A <b>unigram</b> model with <b>Laplace (add-one) smoothing</b>. The probability distribution of a word is given by $P_L(w)$. This type of smoothing takes away some of the probability mass for observed events and assigns it to unseen events.

In order to smooth the model, the number of words in the corpus, $N$, and the number of word types, $S$ is used. The distinction between these is meaningful: $N$ indicates the number of word instances, where $S$ refers to the size of our vocabulary. For example, the sentence <em>the cat saw the dog</em> has four word types (<em>the</em>, <em>cat</em>, <em>saw</em>, <em>dog</em>), but five word tokens (<em>the</em>, <em>cat</em>, <em>saw</em>, <em>the</em>, <em>dog</em>). The token <em>the</em> appears twice in the sentence, but they share the same type <em>the</em>.

If $c(w)$ is the frequency of $w$ in the training data, $P_L(w)$ is:

$$P_L(w)=\frac{c(w)+1}{N+S}$$

In [None]:
class SmoothedUnigramModel(LanguageModel):
    """Unigram model with Laplace (add-one) smoothing: P_L(w) = (c(w)+1)/(N+S).

    Attributes:
        c: Counter of smoothed token frequencies, i.e. raw count + 1
            (START is not counted).
        totalcount: Sum of the smoothed counts, which equals N + S.
        wheel: dict mapping 1..totalcount to tokens; a token owns as many
            slots as its smoothed count.
        logprobs: dict mapping each token to the natural log of P_L(token).
    """

    def __init__(self, trainCorpus):
        """Count the tokens of trainCorpus (a list of token lists), smooth the
        counts and derive the sampling wheel and log-probability table."""
        self.flatCorp = []

        # Count all uni-grams including UNK but not <s>; counts come back smoothed
        self.c = self.getCount(trainCorpus)

        # Create the wheel of fortune. Slots are numbered from 1 because
        # generateWord draws from 1..totalcount; starting at 0 left the last
        # slot empty (None) and made the first slot unreachable.
        self.totalcount = sum(self.c.values())
        self.wheel = dict(enumerate(self.c.elements(), start=1))

        # Keep the log-probabilities in their own dict. Aliasing self.c here
        # would overwrite the counts and make unseen words look up as 0.
        self.logprobs = {word: math.log(count / self.totalcount)
                         for word, count in self.c.items()}

    def generateSentence(self):
        """Draw tokens until END comes up; returns [START, w1, ..., END]."""
        sent = [START]
        newword = '0'
        while newword != END:
            newword = self.generateWord()
            sent.append(newword)
        return sent

    def _wordLogProb(self, word):
        """Log-probability of one token: its own entry, else UNK's entry, else
        the add-one estimate for a zero count, log(1 / (N + S))."""
        if word in self.logprobs:
            return self.logprobs[word]
        if UNK in self.logprobs:
            return self.logprobs[UNK]
        return math.log(1 / self.totalcount)

    def getSentenceLogProbability(self, sentence):
        """Sum of the token log-probabilities of `sentence`, skipping START."""
        return sum([self._wordLogProb(word) for word in sentence if word != START])

    def getCorpusPerplexity(self, testCorpus):
        """Perplexity of testCorpus (a list of token lists).

        START tokens are excluded from both the probability sum and the token
        count N. Returns None when testCorpus is None.
        """
        if testCorpus is None:
            return
        self.flatCorpforprobs = [word for sentence in testCorpus for word in sentence]
        self.flatCorpforN = [word for word in self.flatCorpforprobs if word != START]
        N = len(self.flatCorpforN)  # total number of scored tokens
        # Every scored token contributes; tokens missing from the model are not
        # dropped from the sum while still being counted in N.
        logprobs = [self._wordLogProb(word) for word in self.flatCorpforN]
        return math.exp((-1 / N) * sum(logprobs))

    def generateWord(self):
        """Sample one token by drawing a wheel slot uniformly at random."""
        roll = random.randrange(1, self.totalcount + 1)
        return self.wheel[roll]

    def getCount(self, corpus):
        """Return a Counter of all tokens in corpus with START removed and
        every remaining count increased by one (add-one smoothing)."""
        c = Counter(word for sent in corpus for word in sent)
        c.pop(START, None)
        # Do SMOOTHING
        for word in c:
            c[word] += 1
        return c

In [None]:
if __name__=='__main__':
    # Check the Laplace-smoothed unigram model against the toy-corpus reference values.
    testModel('smoothed-unigram')

--- TEST: generateSentence() ---
Test generateSentence() passed!

--- TEST: getSentenceLogProbability(...) ---
Correct log prob.: -18.8969788221 	Model log prob.: -18.8969788221 	 PASSED 	 ['<s>', 'SONIC', 'WAS', 'DIFFICULT', '.', '</s>']
Correct log prob.: -107.3856946234 	Model log prob.: -107.3856946234 	 PASSED 	 ['<s>', 'BY', 'THE', 'LATE', 'CLASSIC', ',', 'A', 'NETWORK', 'OF', '<UNK>', '(', '<UNK>', ')', 'LINKED', 'VARIOUS', 'PARTS', 'OF', 'THE', 'CITY', ',', 'RUNNING', 'FOR', 'SEVERAL', 'KILOMETRES', 'THROUGH', 'ITS', 'URBAN', 'CORE', '.', '</s>']
Correct log prob.: -108.078841804 	Model log prob.: -108.078841804 	 PASSED 	 ['<s>', 'FEW', 'PEOPLE', 'REALIZE', 'HOW', 'DIFFICULT', 'IT', 'WAS', 'TO', 'CREATE', 'SONIC', "'S", 'GRAPHICS', 'ENGINE', ',', 'WHICH', 'ALLOWED', 'FOR', 'THE', 'INCREDIBLE', 'RATE', 'OF', 'SPEED', 'THE', 'GAME', "'S", 'KNOWN', 'FOR', '.', '</s>']
Correct log prob.: -43.7800012747 	Model log prob.: -43.7800012747 	 PASSED 	 ['<s>', 'CLASSIC', 'PARTS', 'OF', '

## Bigram Model

An <b>unsmoothed bigram</b> model follows, where the probability distribution of a word is given by $\hat P(w'|w)$. Thus, the probability of $w_i$ is conditioned on $w_{i-1}$.

In [None]:
from collections import defaultdict

class BigramModel(LanguageModel):
    """Unsmoothed bigram model: each token is conditioned on its predecessor.

    Attributes:
        bigrams: All bigram tuples of the training corpus, in corpus order.
        unigrams: All tokens of the training corpus, in corpus order.
        vocab: Set of distinct tokens.
        c: Counter of bigram frequencies.
        c1: Counter of token frequencies.
        wheels: Per-stem sampling wheels, built lazily by getProbs. A wheel
            maps 'count' to the number of slots and 1..count to follower tokens.
        logprobs: dict mapping each seen bigram to log(c(bigram) / c1(stem)).
    """

    def __init__(self, trainCorpus):
        """Collect counts from trainCorpus (a list of token lists) and compute
        the log-probability of every seen bigram."""
        self.bigrams = []   # list of bigrams as appeared
        self.unigrams = []
        self.vocab = set()  # vocab words set
        self.c = Counter()  # count of bigrams, no dupes
        self.c1 = Counter()
        self.wheels = defaultdict(dict)
        self.logprobs = {}
        self._followers = None  # stem -> {follower: count}, built on first use

        self.getVocab(trainCorpus)
        self.getAllProbs()

    def generateSentence(self):
        """Start with START and keep drawing the next token until END."""
        sent = [START]
        newWord = ''
        while newWord != END:
            newWord = self.generateBigram(sent[-1])
            sent.append(newWord)
        return sent

    def getSentenceLogProbability(self, sentence):
        """Sum of log(c(bigram) / n(stem)) over the bigrams of `sentence`,
        where n(stem) is the number of training bigrams starting with stem.

        Returns -inf as soon as a bigram has probability zero: it was never
        seen in training, or its stem is END (nothing can follow END).
        """
        bigramProb = 0.0
        for bigram in zip(sentence, sentence[1:]):
            stem = bigram[0]
            c = self.c[bigram]  # Counter lookup: 0 for unseen bigrams
            if c == 0:
                return float('-inf')
            self.getProbs(stem)  # builds the stem's wheel if it is missing
            # .get() avoids both inserting an empty wheel into the defaultdict
            # and the KeyError an END stem used to trigger on 'count'.
            wheel = self.wheels.get(stem)
            if not wheel:
                return float('-inf')
            bigramProb += math.log(c / wheel['count'])
        return bigramProb

    def getCorpusPerplexity(self, testCorpus):
        """Perplexity of testCorpus (a list of token lists).

        All bigrams, including those with START and END, enter the probability
        sum; START is excluded from the token count N. Returns None when
        testCorpus is None and inf when some bigram was never seen in training.
        """
        if testCorpus is None:
            return
        bigrams = [pair for sent in testCorpus for pair in zip(sent, sent[1:])]
        N = sum(1 for sent in testCorpus for word in sent if word != START)
        logprobs = [self.logprobs.get(bigram, float('-inf')) for bigram in bigrams]
        return math.exp((-1 / N) * sum(logprobs))

    def getAllProbs(self):
        """Fill self.logprobs with log(c(bigram) / c1(stem)) for every seen bigram."""
        # One pass over the distinct bigrams; walking self.bigrams would redo
        # the same computation once per occurrence.
        for bigram, bigramcount in self.c.items():
            stemcount = self.c1[bigram[0]]
            self.logprobs[bigram] = math.log(bigramcount / stemcount)

    def getVocab(self, corpus):
        """Extract bigrams, tokens, the vocabulary set and both count tables."""
        self.bigrams = [tuple(sent[i:i+2]) for sent in corpus for i in range(len(sent) - (2-1))]
        self.unigrams = [i for sent in corpus for i in sent]
        self.vocab = set(self.unigrams)
        self.c = Counter(self.bigrams)
        self.c1 = Counter(self.unigrams)
        self._followers = None  # counts changed, so the follower index is stale

    def generateBigram(self, stem):
        """Sample a follower of `stem` by drawing a slot of its wheel uniformly."""
        self.getProbs(stem)
        roll = random.randrange(1, self.wheels[stem]['count'] + 1)
        return self.wheels[stem][roll]

    def _followerCounts(self, stem):
        """Return {follower: count} for `stem`, indexing self.c once on first use."""
        if self._followers is None:
            index = defaultdict(dict)
            for (first, second), count in self.c.items():
                index[first][second] = count
            self._followers = index
        return self._followers.get(stem, {})

    def getProbs(self, word):
        """Build the sampling wheel for stem `word` unless it already exists.

        END never gets a wheel because nothing follows it.
        """
        if word == END or word in self.wheels:
            return
        # Followers come from a per-stem index instead of a scan over every
        # bigram; the index keeps first-seen order, so the slot layout is the
        # same as a scan over self.c would produce.
        matches = self._followerCounts(word)
        N = sum(matches.values())
        wheel = {'count': N}
        place = 1
        for follower, count in matches.items():
            for _ in range(count):
                wheel[place] = follower
                place += 1
        self.wheels[word] = wheel

In [None]:
if __name__=='__main__':
    # Check the unsmoothed bigram model against the toy-corpus reference values.
    testModel('bigram')

--- TEST: generateSentence() ---
Test generateSentence() passed!

--- TEST: getSentenceLogProbability(...) ---
Correct log prob.: -inf 	Model log prob.: -inf 	 PASSED 	 ['<s>', 'SONIC', 'WAS', 'DIFFICULT', '.', '</s>']
Correct log prob.: -10.3450917073 	Model log prob.: -10.3450917073 	 PASSED 	 ['<s>', 'BY', 'THE', 'LATE', 'CLASSIC', ',', 'A', 'NETWORK', 'OF', '<UNK>', '(', '<UNK>', ')', 'LINKED', 'VARIOUS', 'PARTS', 'OF', 'THE', 'CITY', ',', 'RUNNING', 'FOR', 'SEVERAL', 'KILOMETRES', 'THROUGH', 'ITS', 'URBAN', 'CORE', '.', '</s>']
Correct log prob.: -9.2464794186 	Model log prob.: -9.2464794186 	 PASSED 	 ['<s>', 'FEW', 'PEOPLE', 'REALIZE', 'HOW', 'DIFFICULT', 'IT', 'WAS', 'TO', 'CREATE', 'SONIC', "'S", 'GRAPHICS', 'ENGINE', ',', 'WHICH', 'ALLOWED', 'FOR', 'THE', 'INCREDIBLE', 'RATE', 'OF', 'SPEED', 'THE', 'GAME', "'S", 'KNOWN', 'FOR', '.', '</s>']
Correct log prob.: -inf 	Model log prob.: -inf 	 PASSED 	 ['<s>', 'CLASSIC', 'PARTS', 'OF', 'THE', 'GAME', 'ALLOWED', 'FOR', '<UNK>', 'IN

In [None]:
if __name__=='__main__':
    # Train the unsmoothed bigram model on the full data and print samples and perplexities.
    runModel('bigram')

--------- 5 sentences from your model ---------
Log Probability: -11.13855317607246 	Sentence: ['<s>', '483-70', 'CN2.5LT', 'HS', '</s>']
Log Probability: -15.624939826070587 	Sentence: ['<s>', 'WB90', 'CN0.5L', 'STBLU', 'MIX159', 'DRK', 'YELLOW', '</s>']
Log Probability: -12.3380427719357 	Sentence: ['<s>', 'LE3570S', 'CN1GA', 'CLEAR', '</s>']
Log Probability: -14.3670964063428 	Sentence: ['<s>', 'WB1098', 'B1LT', 'HIGH', 'TEMP', 'ACTIVATOR', '(AC)', '</s>']
Log Probability: -17.2116810053293 	Sentence: ['<s>', 'APP205', 'CN1GA', 'REDUCER', 'V', 'HI', 'TEMP', 'ACTIVATOR', '</s>']

--------- Corpus Perplexities ---------
Training Set: 9.92092254811232
Testing Set: inf


## Smoothed Bigram Model

A <b>bigram</b> model with <b>absolute discounting</b> follows. The probability distribution of a word is given by $P_{AD}(w'|w)$.

Smoothing involves a discounting factor $D$. If $n_k$ is the number of bigrams $w_1w_2$ that appear exactly $k$ times, $D$ is: 

$$D=\frac{n_1}{n_1+2n_2}$$ 

For each word $w$, the number of bigram types $ww'$ is computed as follows: 

$$S(w)=|\{w'|c(ww')>0\}|$$ 

where $c(ww')$ is the frequency of $ww'$ in the training data.

Finally, $P_{AD}(w'|w)$ is computed: 

$$P_{AD}(w'|w)=\frac{\max \big (c(ww')-D,0\big )}{c(w)}+\bigg (\frac{D}{c(w)}\cdot S(w) \cdot P_L(w')\bigg )$$ 

where $c(w)$ is the frequency of $w$ in the training data and $P_L(w')$ is the Laplace-smoothed unigram probability of $w'$.

In [None]:
import copy
from functools import lru_cache

class SmoothedBigramModelAD(LanguageModel):
    """Bigram language model smoothed with absolute discounting.

    The conditional probability of a follower w' given a stem w is

        P_AD(w'|w) = max(c(ww') - D, 0) / c(w) + (D / c(w)) * S(w) * P_L(w')

    where D = n1 / (n1 + 2 * n2) (n_k = number of bigram types seen exactly k
    times), S(w) is the number of distinct bigram types starting with w, and
    P_L is the Laplace-smoothed unigram probability. Every count is taken from
    the training corpus given to the constructor and is never altered by
    evaluation on another corpus.
    """

    def __init__(self, trainCorpus):
        """Build all counts from `trainCorpus` (an iterable of token lists).

        generateSentence() starts from START and stops at END, so sentences are
        expected to be wrapped in those markers.
        """
        self.bigrams = []   # flattened tape of bigrams
        self.unigrams = []  # flattened tape of unigrams, plus one seeded UNK
        self.vocab = set()  # set of unigrams
        self.bicab = set()  # set of bigrams
        self.c2 = Counter()  # count of bigrams
        self.c1 = Counter()  # count of unigrams

        self.wheels = defaultdict(dict)  # per-stem sampling tables for generation

        self.D = 0.0   # discount factor
        self.N = 0     # number of training tokens, excluding START

        self.lenS = 0  # vocabulary size, excluding START
        self.Sw = defaultdict(set)  # bigram types per stem
        self.logprobs = dict()      # cache: bigram -> log P_AD
        self.PL = dict()            # cache: word -> Laplace-smoothed unigram probability

        self.getCounts(trainCorpus)

    def getCounts(self, corpus):
        """Compute unigram/bigram counts, D, N, the vocabulary size and S(w)."""
        corpus = list(corpus)  # iterated twice below; tolerate one-shot iterables
        self.bigrams = [tuple(sent[i:i + 2]) for sent in corpus for i in range(len(sent) - 1)]
        # Seed one UNK so that UNK is always in the vocabulary with a non-zero count.
        # NOTE(review): the seed is added even when UNK already occurs in training,
        # which raises c(UNK) and N by one - confirm this is intended.
        self.unigrams = [UNK] + [tok for sent in corpus for tok in sent]
        self.bicab = set(self.bigrams)
        self.vocab = set(self.unigrams)
        self.c2 = Counter(self.bigrams)
        self.c1 = Counter(self.unigrams)

        n1 = sum(1 for count in self.c2.values() if count == 1)
        n2 = sum(1 for count in self.c2.values() if count == 2)
        # Without singleton/doubleton bigrams the estimate is undefined; fall back
        # to no discounting instead of dividing by zero.
        self.D = n1 / (n1 + 2 * n2) if (n1 + 2 * n2) else 0.0

        self.lenS = len(self.vocab - {START})
        self.N = len(self.unigrams) - self.c1[START]  # must omit START tokens

        # Counts changed, so anything derived from them is stale.
        self.Sw = defaultdict(set)
        self.logprobs = dict()
        self.PL = dict()
        self.wheels = defaultdict(dict)

        # Index the bigram types by stem for the smoothing term.
        for bigram in self.bicab:
            self.Sw[bigram[0]].add(bigram)
        self.Sw[UNK].add((UNK, UNK))  # guarantees S(UNK) >= 1

    def generateSentence(self):
        """Sample a sentence from START to END using training bigram counts only."""
        sent = [START]
        newWord = ''
        while newWord != END:
            newWord = self.generateBigram(sent[-1])
            sent.append(newWord)
        return sent

    def getSentenceLogProbability(self, sentence):
        """Return the sum of log P_AD over the sentence's bigrams (lazily cached)."""
        if not self.bicab:
            return float('-inf')  # an untrained model assigns no probability mass
        sentbigrams = [tuple(sentence[i:i + 2]) for i in range(len(sentence) - 1)]
        total = 0
        for bigram in sentbigrams:
            # Membership test, not truthiness: a cached log-probability of 0.0 is valid.
            if bigram in self.logprobs:
                total += self.logprobs[bigram]
            else:
                total += self.getADProb(bigram)
        return total

    def getCorpusPerplexity(self, testCorpus):
        """Return exp(-(1/N_test) * sum of sentence log-probabilities).

        N_test is the number of non-START tokens in `testCorpus`. It is kept
        local so the training-set N used by the smoothed probabilities is not
        disturbed. Returns None when there is nothing to evaluate.
        """
        if testCorpus is None:
            return None
        n_tokens = 0
        total = 0
        for sent in testCorpus:
            n_tokens += sum(1 for tok in sent if tok != START)
            total += self.getSentenceLogProbability(sent)
        if n_tokens == 0:
            return None
        try:
            return math.exp((-1 / n_tokens) * total)
        except OverflowError:
            return float('inf')

    def getADProb(self, bigram):
        """Return log P_AD(bigram[1] | bigram[0]) and cache it in self.logprobs.

        Out-of-vocabulary tokens are mapped to UNK before any count is looked up.
        """
        stem = bigram[0] if bigram[0] in self.vocab else UNK
        follower = bigram[1] if bigram[1] in self.vocab else UNK
        if stem not in self.vocab:
            return float('-inf')  # guard for a model without the UNK seed
        stemcount = self.getCw(stem)
        if not stemcount:
            print('no stem in vocab count')
            return float('-inf')
        # Number of distinct followers of the stem, i.e. how often the discount applied.
        Sw = self.getSw(stem)
        # Laplace-smoothed unigram probability of the follower.
        PL = self.getPL(follower)
        # Use the normalised bigram so the count agrees with c(w), S(w) and P_L.
        discounted = max(0, self.getCw0w1((stem, follower)) - self.D)
        prob = (discounted + (self.D * Sw * PL)) / stemcount
        # prob is 0 when the bigram is unseen and the stem has no followers (or D == 0).
        self.logprobs[bigram] = math.log(prob) if prob > 0 else float('-inf')
        return self.logprobs[bigram]

    def getSw(self, stem):
        """Return S(stem): the number of bigram types starting with `stem` (0 if none)."""
        return len(self.Sw.get(stem, ()))

    def getPL(self, follower):
        """Return the Laplace-smoothed unigram probability of `follower` (cached)."""
        if follower not in self.PL:
            self.PL[follower] = (self.getCw(follower) + 1) / (self.N + self.lenS)
        return self.PL[follower]

    def getCw(self, unigram):
        """Return the training count of `unigram`, or the UNK count if it is OOV."""
        return self.c1.get(unigram, self.c1.get(UNK))

    def getCw0w1(self, bigram):
        """Return the training count of `bigram` (0 if unseen)."""
        return self.c2.get(bigram, 0)

    def generateBigram(self, stem):
        """Sample a follower of `stem` in proportion to training bigram counts."""
        self.getProbs(stem)
        wheel = self.wheels.get(stem)
        if not wheel or not wheel['count']:
            raise ValueError('no follower observed in training for stem {!r}'.format(stem))
        roll = random.randrange(1, wheel['count'] + 1)
        return wheel[roll]

    def getProbs(self, word):
        """Build (once) the sampling wheel for `word`.

        The wheel maps 'count' to the total number of slots and each slot
        1..count to a follower; a follower owns as many slots as its bigram count.
        END never starts a bigram, so no wheel is built for it.
        """
        if word == END or word in self.wheels:
            return
        # Iterate c2 in insertion order so a seeded RNG stays reproducible.
        matches = {bigram: count for bigram, count in self.c2.items() if bigram[0] == word}
        wheel = {'count': sum(matches.values())}
        place = 1
        for bigram, count in matches.items():
            for _ in range(count):
                wheel[place] = bigram[-1]
                place += 1
        self.wheels[word] = wheel


In [None]:
if __name__=='__main__':
    # Run the provided checks for the smoothed bigram model. Per the recorded
    # output below, they compare sentence log-probabilities against reference values.
    testModel('smoothed-bigram')

--- TEST: generateSentence() ---
Test generateSentence() passed!

--- TEST: getSentenceLogProbability(...) ---
Correct log prob.: -16.1336514995 	Model log prob.: -16.172553647 	 FAILED 	 ['<s>', 'SONIC', 'WAS', 'DIFFICULT', '.', '</s>']
Correct log prob.: -84.9068097328 	Model log prob.: -85.0403277868 	 FAILED 	 ['<s>', 'BY', 'THE', 'LATE', 'CLASSIC', ',', 'A', 'NETWORK', 'OF', '<UNK>', '(', '<UNK>', ')', 'LINKED', 'VARIOUS', 'PARTS', 'OF', 'THE', 'CITY', ',', 'RUNNING', 'FOR', 'SEVERAL', 'KILOMETRES', 'THROUGH', 'ITS', 'URBAN', 'CORE', '.', '</s>']
Correct log prob.: -84.6431458887 	Model log prob.: -84.7732458836 	 FAILED 	 ['<s>', 'FEW', 'PEOPLE', 'REALIZE', 'HOW', 'DIFFICULT', 'IT', 'WAS', 'TO', 'CREATE', 'SONIC', "'S", 'GRAPHICS', 'ENGINE', ',', 'WHICH', 'ALLOWED', 'FOR', 'THE', 'INCREDIBLE', 'RATE', 'OF', 'SPEED', 'THE', 'GAME', "'S", 'KNOWN', 'FOR', '.', '</s>']
Correct log prob.: -39.3603940053 	Model log prob.: -39.1650030398 	 FAILED 	 ['<s>', 'CLASSIC', 'PARTS', 'OF', 'THE

In [None]:
if __name__=='__main__':
    # Run the smoothed bigram model end to end. Per the recorded output below,
    # this prints sample generated sentences and the train/test corpus perplexities.
    runModel('smoothed-bigram')

--------- 5 sentences from your model ---------
Log Probability: -21.697686493523868 	Sentence: ['<s>', 'G2-4500S', 'CN1GA', 'METALUX', 'COARSE', 'ALUM', '</s>']
Log Probability: -14.612159694755531 	Sentence: ['<s>', '29586000', 'CN5LT', 'THINNER', '</s>']
Log Probability: -15.326072177939247 	Sentence: ['<s>', 'WB05', 'CN1L', 'PS', 'VHS', 'HARDENER', '</s>']
Log Probability: -12.188504028224338 	Sentence: ['<s>', '2311S', 'TBXX', 'SANDING', 'PASTE', '</s>']
Log Probability: -21.72001091201145 	Sentence: ['<s>', '7701S', 'CN1GA', 'STANDARD', 'ACTIVATOR', '(AC)', '</s>']

--------- Corpus Perplexities ---------
Training Set: 22.38044063709131
Testing Set: 135.40496418312708


# Text Classification with Neural Networks
We will use neural networks to classify product descriptions using the sample dataset. 

Some reference PyTorch tutorials:
<ul>
<li>https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
<li>https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
<li>https://github.com/yunjey/pytorch-tutorial
</ul>

Note: use "GPU" as the runtime type to speed up the training of your models, but use it sparingly. You can set it by going to <TT>Runtime > Change Runtime Type</TT> and selecting "GPU" from the dropdown menu.

In [None]:
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
import torchtext 
import random

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

if __name__ == '__main__':
    print('Using device:', device)

Using device: cuda


# Step 2: Load Data
First we load the dataset as `train_data` and `test_data` and do some basic tokenization.

*   To access the list of textual tokens for the *i*th example, use `train_data1[i][1]`
*   To access the label for the *i*th example, use `train_data1[i][0]`



In [None]:
import pandas 

def preprocess_NN(item):
    '''
    Tokenize one product description.

    The text is split on single spaces and blank pieces are dropped. One
    leading opening mark ( " ' and one trailing closing mark . , ; : ? ! " ' )
    are split off each piece as their own tokens.

    Returns a list of non-empty tokens, or None when `item` is empty or not a
    string.
    '''
    if not item or not isinstance(item, str):
        return  # filter blanks
    opening = {'(', '"', "'"}
    closing = {'.', ',', ';', ':', '?', '!', '"', "'", ')'}
    result = []
    for x in item.split(' '):
        if not x or x.isspace():
            continue  # filter blanks
        head, core, tail = '', x, ''
        if core[0] in opening:
            head, core = core[0], core[1:]
        # `core` may be empty here when the piece was a single opening mark.
        if core and core[-1] in closing:
            core, tail = core[:-1], core[-1]
        # Keep only non-empty parts so a bare "(" or "." never yields a '' token.
        result += [part for part in (head, core, tail) if part]
    return result


def getDataset_NN(path='/content/drive/MyDrive/data/Test6_REF.csv'):
    """
    Load the SKU CSV and split it by its `dataset` column.

    path: location of the CSV file. Its header row is replaced by the column
          names Code, Name, PRODHA, dataset.

    Returns two dataframes (train, test) holding the rows whose `dataset`
    value is 'train' and 'test' respectively. In the data shown in this
    notebook, PRODHA is '?' in the test set.
    """
    filedata = pandas.read_csv(path,
                               header=0,
                               names=['Code', 'Name', 'PRODHA', 'dataset'])

    train_df = filedata[filedata['dataset'] == 'train']
    test_df = filedata[filedata['dataset'] == 'test']
    return train_df, test_df

if __name__=='__main__':
    train_df, test_df = getDataset_NN()

    # Turn each dataframe into a list of (label, tokens) pairs. Rows whose Name
    # is not a string (e.g. missing values) are skipped.
    examples_by_split = {}
    for split_name, frame in (('train', train_df), ('test', test_df)):
        examples_by_split[split_name] = [
            (label, preprocess_NN(name))
            for label, name in frame.loc[:, ['PRODHA', 'Name']].itertuples(index=False)
            if name is not None and label is not None and type(name) == str
        ]
    train_data1 = examples_by_split['train']
    test_data1 = examples_by_split['test']

    print('Num. Train Examples:', len(train_data1))
    print('Num. Test Examples:', len(test_data1))

    for title, examples in (("\nSAMPLE TRAIN DATA:", train_data1),
                            ("\nSAMPLE TEST DATA:", test_data1)):
        print(title)
        # random.sample raises ValueError if asked for more items than exist.
        for label, tokens in random.sample(examples, min(5, len(examples))):
            print('Sample text:', tokens)
            print('Sample label:', label, '\n')



Num. Train Examples: 1387
Num. Test Examples: 2980

SAMPLE TRAIN DATA:
Sample text: ['442-21', 'CN1LT', 'NASONXL', 'BLUE']
Sample label: A1A0015054J6U500A1 

Sample text: ['LV984', 'CN1PT', 'METALUX', 'RED', 'OXIDE']
Sample label: A1A0015054J6U500A1 

Sample text: ['29185106', 'CN1L', 'PC', 'ANTI-SILI', 'ADDITIV', '8510']
Sample label: A1A012501212WC00A1 

Sample text: ['29105980', 'CN1LT', 'PC295', 'MB598', 'DIAM', 'GREEN']
Sample label: A1A0015054J6GV00A1 

Sample text: ['WB68', 'CN0.5LT', 'CROMAX', 'PRO', 'DARK', 'VIOLET']
Sample label: A1A0015154J6HQ00A1 


SAMPLE TEST DATA:
Sample text: ['DX707', 'CN3.785LT', 'DUXONE', 'BLACK']
Sample label: ? 

Sample text: ['CH8106', 'CN1PT', 'CHALLENGER', 'LV', 'CLARET', 'RED']
Sample label: ? 

Sample text: ['422-50', 'CN1QT', 'NASON', 'GRAY', 'SEALER']
Sample label: ? 

Sample text: ['EAW*', 'CNS4', 'IMRON', 'ELITE', 'SINGLE', 'STAGE']
Sample label: ? 

Sample text: ['L1010', 'CN1LT', 'LMB', 'FINE', 'WHITE', 'MICA']
Sample label: ? 



## Step 3: Create PyTorch Dataloader

Here is the <b>dataset</b> class containing the tokenized data for NN models. 

*   <b>` build_dictionary(self)`:</b>  Creates the dictionaries `idx2word` and `word2idx`. Represent each word in the dataset with a unique index, and keep track of this in these dictionaries. Use the hyperparameter `threshold` to control which words appear in the dictionary: a training word’s frequency should be `>= threshold` to be included in the dictionary.

* <b>`convert_text(self)`:</b> Converts each review in the dataset to a list of indices, given by `word2idx` dictionary. Store this in the `textual_ids` variable, and the function does not return anything. If a word is not present in the  `word2idx` dictionary, use the `<UNK>` token for that word. Be sure to append the `<END>` token to the end of each review.

*   <b>` get_text(self, idx) `:</b> Return the review at `idx` in the dataset as an array of indices corresponding to the words in the review. If the length of the review is less than `max_len`, pad the review with the `<PAD>` character up to the length of `max_len`. If the length is greater than `max_len`, then it should only return the first `max_len` words. The return type should be `torch.LongTensor`.

*   <b>`get_label(self, idx) `</b>: Return the integer index of the label for `idx` in the dataset, as given by the `label2idx` dictionary. Labels that were not seen in training map to index `0`, the `<UNK>` label. The return type should be `torch.LongTensor`.

*  <b> ` __len__(self) `:</b> Return the total number of reviews in the dataset as an `int`.

*   <b>` __getitem__(self, idx)`:</b> Return the (padded) text, and the label. The return type for both these items should be `torch.LongTensor`. Use the ` get_label(self, idx) ` and ` get_text(self, idx) ` functions here.


In [None]:
# Special tokens for the neural-network pipeline.
# NOTE: END is rebound here; the n-gram section at the top of the notebook
# defines END as "</s>". Re-running those cells after this one will therefore
# see '<END>' instead.
PAD = '<PAD>'
END = '<END>'
UNK = '<UNK>'
from collections import Counter

class TextDataset(data.Dataset):
    """Dataset of (label, tokens) examples encoded as index tensors.

    examples:  list of (label, tokens) pairs; tokens is a list of strings.
               A None token list is treated as empty and None tokens are skipped.
    split:     'train', 'val' or 'test'. Dictionaries are built only for 'train';
               other splits must be given word2idx and label2idx.
    threshold: minimum training frequency for a word to get its own index.
    max_len:   fixed length every text is padded or truncated to.
    """

    def __init__(self, examples, split, threshold, max_len, idx2word=None, word2idx=None, label2idx=None):

        self.examples = examples
        assert split in {'train', 'val', 'test'}
        self.split = split
        self.threshold = threshold
        self.max_len = max_len

        # Dictionaries
        self.idx2word = idx2word
        self.word2idx = word2idx
        self.label2idx = label2idx
        if split == 'train':
            self.build_dictionary()
        elif self.word2idx is None or self.label2idx is None:
            raise ValueError("word2idx and label2idx are required when split is not 'train'")
        self.vocab_size = len(self.word2idx)
        self.label_size = len(self.label2idx)

        # Convert text to indices
        self.textual_ids = []
        self.label_ids = []
        self.convert_text()
        self.convert_labels()

        # Convert back to text
        self.idx2label = {v: k for k, v in self.label2idx.items()}

    def build_dictionary(self):
        '''
        Builds the dictionaries idx2word, word2idx, label2idx and idx2label.
        This is only called when split='train', as these dictionaries are passed
        in to the __init__(...) function otherwise. Words are upper-cased and
        get an index only if they occur at least self.threshold times.
        Returns nothing.
        '''
        assert self.split == 'train'

        self.idx2word = {0: PAD, 1: END, 2: UNK}
        self.word2idx = {PAD: 0, END: 1, UNK: 2}
        self.label2idx = {UNK: 0}
        self.idx2label = {0: UNK}

        counts = Counter(
            word.upper()
            for (_, desc) in self.examples
            for word in (desc or ())
            if word is not None
        )
        # Words keep first-occurrence order. Special tokens are skipped so a
        # literal '<UNK>' in the data cannot overwrite a reserved index.
        kept_words = [word for word, freq in counts.items()
                      if freq >= self.threshold and word not in self.word2idx]
        for idx, word in enumerate(kept_words, start=3):
            self.idx2word[idx] = word
            self.word2idx[word] = idx

        # Labels are numbered in first-occurrence order, which is deterministic
        # for a given dataset (iterating a set is not, across interpreter runs).
        labels = dict.fromkeys(label for label, _ in self.examples)
        new_labels = [label for label in labels if label not in self.label2idx]
        for idx, label in enumerate(new_labels, start=1):
            self.label2idx[label] = idx
            self.idx2label[idx] = label
        return

    def convert_text(self):
        '''
        Converts each product desc in the dataset (self.examples) to a list of
        indices given by self.word2idx, followed by the <END> index. Words not in
        the dictionary become <UNK>. Stores the result in self.textual_ids;
        returns nothing.
        '''
        unk_idx = self.word2idx[UNK]
        end_idx = self.word2idx[END]
        self.textual_ids = []
        for (_, description) in self.examples:
            # `or unk_idx` also sends a literal '<PAD>' token (index 0) to <UNK>.
            ids = [self.word2idx.get(word.upper()) or unk_idx
                   for word in (description or ())
                   if word is not None]
            ids.append(end_idx)
            self.textual_ids.append(ids)
        return

    def convert_labels(self):
        '''
        Converts each product category code in the dataset (self.examples) to an
        index given by self.label2idx; unknown labels map to the <UNK> label.
        Stores the result in self.label_ids; returns nothing.
        '''
        unk_idx = self.label2idx[UNK]
        self.label_ids = [self.label2idx.get(item[0]) or unk_idx for item in self.examples]
        return

    def get_text(self, idx):
        '''
        Returns the desc at idx as a torch.LongTensor of exactly self.max_len
        word indices: longer texts are truncated, shorter ones padded with <PAD>.
        '''
        ids = self.textual_ids[idx][:self.max_len]
        ids = ids + [self.word2idx[PAD]] * (self.max_len - len(ids))
        return torch.LongTensor(ids)

    def get_label(self, idx):
        '''
        Returns the label index of example idx as a 0-dim long tensor.
        '''
        return torch.tensor(self.label_ids[idx], dtype=torch.long)

    def get_labeltext(self, idx):
        '''
        Returns the original label text for the label index idx.
        Note that idx is a label index here, not an example index.
        '''
        return self.idx2label[idx]

    def __len__(self):
        '''
        Returns the number of examples (int value) in the dataset
        '''
        return len(self.examples)

    def __getitem__(self, idx):
        '''
        Returns the padded text tensor and the label tensor of example idx.
        '''
        return self.get_text(idx), self.get_label(idx)

In [None]:
if __name__=='__main__':
    # Sample item
    Ds = TextDataset(train_data1, split='train', threshold=1, max_len=8)
    print('Vocab size:', Ds.vocab_size)
    print('Label set:', Ds.label_size)

    # randrange excludes the upper bound; randint(0, len(Ds)) could return
    # len(Ds) and raise IndexError.
    text, label = Ds[random.randrange(len(Ds))]
    print('Example text:', text)
    print('Example label:', label)

    # For checking only. Ignored downstream.
    Ds2 = TextDataset(test_data1, split='test', threshold=1, max_len=8, word2idx=Ds.word2idx, idx2word=Ds.idx2word, label2idx=Ds.label2idx)
    print('Vocab size:', Ds2.vocab_size)
    print('Label set:', Ds2.label_size)

    text, label = Ds2[random.randrange(len(Ds2))]
    print('Example text:', text)
    print('Example label:', label)

Vocab size: 2409
Label set: 172
Example text: tensor([2378, 1080,  676,  701,    1,    0,    0,    0])
Example label: tensor(87)
Vocab size: 2409
Label set: 172
Example text: tensor([   2,    2, 1060,    1,    0,    0,    0,    0])
Example label: tensor(0)


# Step 4: Train a Convolutional Neural Network (CNN)
## Define the CNN Model 


In [None]:

class CNN(nn.Module):
    """Convolutional text classifier with one conv branch per filter height.

    Each branch slides a (filter_height x embed_size) kernel over the embedded
    text, applies ReLU and max-pools over positions. The pooled features of all
    branches are concatenated, passed through dropout and projected to
    num_classes logits.
    """

    def __init__(self, vocab_size, embed_size, out_channels, filter_heights, stride, dropout, num_classes, pad_idx):
        super().__init__()

        # Word-id -> vector lookup; pad_idx is passed as the embedding's padding_idx.
        self.embeddinglayer = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)

        # One Conv2d per filter height. The kernel spans the full embedding width,
        # so it only moves along the word axis (effectively a 1-d convolution).
        branches = []
        for height in filter_heights:
            branches.append(nn.Conv2d(1, out_channels, (height, embed_size), stride))
        self.convlayers = nn.ModuleList(branches)
        self.relu = nn.ReLU()

        self.dropoutlayer = nn.Dropout(dropout)

        # Input width is out_channels features from each conv branch.
        self.linearlayer = nn.Linear(out_channels * len(filter_heights), num_classes)

    def forward(self, texts):
        """
        texts: LongTensor [batch_size, max_len]

        Returns raw logits: Tensor [batch_size, num_classes]
        (no sigmoid/softmax is applied here).
        """
        # [batch_size, max_len] -> [batch_size, 1, max_len, embed_size]
        embedded = self.embeddinglayer(texts).unsqueeze(1)

        # Per branch: conv -> [batch, out_channels, *, 1], drop the last dim,
        # ReLU, then max over positions -> [batch, out_channels].
        pooled = [
            self.relu(conv(embedded).squeeze(3)).max(dim=2).values
            for conv in self.convlayers
        ]

        # [batch_size, out_channels * num_branches]
        features = self.dropoutlayer(torch.cat(pooled, dim=1))

        # [batch_size, num_classes]
        return self.linearlayer(features)

## Train CNN Model

First, we initialize the train and test <b>dataloaders</b>. A dataloader is responsible for providing batches of data to the model. We first instantiate datasets for the train and test data; note that we use the training vocabulary for both.

In [None]:
if __name__ == '__main__':
    THRESHOLD = 1
    MAX_LEN = 8
    BATCH_SIZE = 32

    # Training set: builds the word and label dictionaries.
    train_Ds = TextDataset(train_data1, split='train', threshold=THRESHOLD, max_len=MAX_LEN)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_Ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )

    # Test set: reuses the training dictionaries and is served one example at a time.
    test_Ds = TextDataset(
        test_data1,
        split='test',
        threshold=THRESHOLD,
        max_len=MAX_LEN,
        idx2word=train_Ds.idx2word,
        word2idx=train_Ds.word2idx,
        label2idx=train_Ds.label2idx,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_Ds,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )


In [None]:
from tqdm.notebook import tqdm

def train_model(model, num_epochs, data_loader, optimizer, criterion, ckpt_path='model.ckpt'):
    """
    Train `model` for `num_epochs` passes over `data_loader`.

    Each batch is moved to the module-level `device`. The mean batch loss and
    mean batch accuracy are printed after every epoch. When training finishes,
    the model's state_dict is saved to `ckpt_path` (pass None to skip saving).

    Raises ValueError if `data_loader` yields no batches (for example when
    drop_last=True and the dataset is smaller than one batch).
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError('data_loader yields no batches; nothing to train on')

    print('Training Model...')
    model.train()
    for epoch in tqdm(range(num_epochs)):
        epoch_loss = 0
        epoch_acc = 0
        for texts, labels in data_loader:
            texts = texts.to(device) # shape: [batch_size, MAX_LEN]
            labels = labels.to(device) # shape: [batch_size]

            optimizer.zero_grad()

            output = model(texts)
            acc = accuracy(output, labels)

            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        print('[TRAIN]\t Epoch: {:2d}\t Loss: {:.4f}\t Train Accuracy: {:.2f}%'.format(epoch+1, epoch_loss/num_batches, 100*epoch_acc/num_batches))
    print('Model Trained!\n')

    if ckpt_path:
        torch.save(model.state_dict(), ckpt_path)

In [None]:
def count_parameters(model):
    """
    Return the total number of elements across the model's parameters
    that have requires_grad set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


def accuracy(output, labels):
    """
    Fraction of rows in a batch whose highest-scoring class equals the label.

    output: Tensor [batch_size, n_classes]
    labels: LongTensor [batch_size]
    Returns a 0-dim float tensor.
    """
    predicted = torch.argmax(output, dim=1)
    # Cast the hit count to float so the division is not an integer division.
    n_correct = predicted.eq(labels).sum().float()
    return n_correct / len(labels)

Now we instantiate the model.

In [None]:
if __name__=='__main__':
    # Size the embedding table and the output layer from the training dataset,
    # and use the dataset's <PAD> index as the embedding padding index.
    # Three conv branches look at 2-, 3- and 4-token windows.
    cnn_model = CNN(vocab_size = train_Ds.vocab_size, 
                embed_size = 128, 
                out_channels = 64, 
                filter_heights = [2, 3, 4], 
                stride = 1, 
                dropout = 0.5, 
                num_classes = train_Ds.label_size, 
                pad_idx = train_Ds.word2idx[PAD])

    # Put model on the device (cuda or cpu)
    cnn_model = cnn_model.to(device)
    
    print('The model has {:,d} trainable parameters'.format(count_parameters(cnn_model)))

The model has 415,468 trainable parameters


Next, we create the **criterion**, which is our loss function: it is a measure of how well the model matches the empirical distribution of the data. We use cross-entropy loss (https://en.wikipedia.org/wiki/Cross_entropy).

We also define the **optimizer**, which performs gradient descent. We use the Adam optimizer (https://arxiv.org/pdf/1412.6980.pdf), which has been shown to work well on these types of models.

In [None]:
if __name__=='__main__':    
    LEARNING_RATE = 5e-4 

    # Define the loss function. CrossEntropyLoss takes the raw logits that
    # CNN.forward returns (the model applies no softmax itself).
    criterion = nn.CrossEntropyLoss().to(device)

    # Define the optimizer over all parameters of cnn_model
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

Finally, we can train the model.

In [None]:
if __name__=='__main__':    
    N_EPOCHS = 25 
    
    # train model for N_EPOCHS epochs; train_model prints per-epoch loss and
    # accuracy and saves the final weights to 'model.ckpt'
    train_model(cnn_model, N_EPOCHS, train_loader, optimizer, criterion)

Training Model...


  0%|          | 0/25 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 4.4988	 Train Accuracy: 9.45%
[TRAIN]	 Epoch:  2	 Loss: 3.5988	 Train Accuracy: 23.55%
[TRAIN]	 Epoch:  3	 Loss: 3.1791	 Train Accuracy: 32.78%
[TRAIN]	 Epoch:  4	 Loss: 2.8150	 Train Accuracy: 41.50%
[TRAIN]	 Epoch:  5	 Loss: 2.5158	 Train Accuracy: 47.53%
[TRAIN]	 Epoch:  6	 Loss: 2.2722	 Train Accuracy: 52.11%
[TRAIN]	 Epoch:  7	 Loss: 2.0158	 Train Accuracy: 56.18%
[TRAIN]	 Epoch:  8	 Loss: 1.8040	 Train Accuracy: 62.06%
[TRAIN]	 Epoch:  9	 Loss: 1.6122	 Train Accuracy: 65.70%
[TRAIN]	 Epoch: 10	 Loss: 1.4007	 Train Accuracy: 70.42%
[TRAIN]	 Epoch: 11	 Loss: 1.2390	 Train Accuracy: 74.42%
[TRAIN]	 Epoch: 12	 Loss: 1.1189	 Train Accuracy: 76.96%
[TRAIN]	 Epoch: 13	 Loss: 1.0044	 Train Accuracy: 78.56%
[TRAIN]	 Epoch: 14	 Loss: 0.8859	 Train Accuracy: 81.40%
[TRAIN]	 Epoch: 15	 Loss: 0.7953	 Train Accuracy: 84.88%
[TRAIN]	 Epoch: 16	 Loss: 0.7078	 Train Accuracy: 85.83%
[TRAIN]	 Epoch: 17	 Loss: 0.6416	 Train Accuracy: 88.01%
[TRAIN]	 Epoch: 18	 Loss: 0.5545

## Evaluate CNN Model

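Now that the model is trained for text classification, it is time to evaluate it with the following function. Only compare results across the same datasets!
NOTE: the test dataset may be the unlabeled prediction dataset (labels `?`, which map to label index 0). In that case the reported accuracy will be at or near zero and is not meaningful.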

In [None]:
import random

def evaluate(model, data_loader, criterion, dataset):
    """Evaluate ``model`` on ``data_loader`` and collect its predictions.

    Args:
        model: Classifier called as ``model(texts)``; its output is scored by
            ``criterion`` and ``accuracy`` and arg-maxed over dim 1.
        data_loader: Iterable of ``(texts, labels)`` batches. Its ``dataset``
            attribute must expose ``idx2word`` and ``word2idx``, which are used
            to print a few decoded sample inputs.
        criterion: Loss called as ``criterion(output, labels)``.
        dataset: Object whose ``get_labeltext(class_index)`` turns a predicted
            class index into its text label.

    Returns:
        Tuple ``(predictions, full_acc, full_loss, all_textpredictions)``:
        the concatenated predicted class indices, the mean per-batch accuracy
        in percent, the mean per-batch loss, and the list of predicted label
        texts (one entry per example).
    """
    print('Evaluating performance on the test dataset...')
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    all_textpredictions = []
    print("\nSOME PREDICTIONS FROM THE MODEL:")
    # Inference only: disabling autograd avoids building a graph for every batch.
    with torch.no_grad():
        for texts, labels in tqdm(data_loader):
            texts = texts.to(device)
            labels = labels.to(device)

            output = model(texts)
            acc = accuracy(output, labels)
            pred = output.argmax(dim=1)

            all_predictions.append(pred)
            # Decode every prediction in the batch, so any batch size works
            # (a batch of one yields exactly one label text, as before).
            for class_idx in pred.tolist():
                all_textpredictions.append(dataset.get_labeltext(class_idx))

            loss = criterion(output, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            # Show roughly 1% of the batches; only the first example of a batch is printed.
            if random.random() < 0.01:
                vocab = data_loader.dataset
                skip_ids = {vocab.word2idx[PAD], vocab.word2idx[END]}
                print("Input: "+' '.join([vocab.idx2word[idx] for idx in texts[0].tolist() if idx not in skip_ids]))
                print("Prediction:", pred[0].item(), '\tCorrect Output:', labels[0].item(), '\n')

    full_acc = 100*epoch_acc/len(data_loader)
    full_loss = epoch_loss/len(data_loader)
    print('[TEST]\t Loss: {:.4f}\t Accuracy: {:.2f}%'.format(full_loss, full_acc))
    predictions = torch.cat(all_predictions)

    return predictions, full_acc, full_loss, all_textpredictions

In [None]:
if __name__ == '__main__':
    # Compute test data accuracy; only the predicted label texts (last element
    # of the returned tuple) are kept, under the name `output`.
    _, _, _, output = evaluate(
        cnn_model,
        test_loader,
        criterion,
        test_Ds,
    )

Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/2980 [00:00<?, ?it/s]

Input: <UNK> CN0.5L HYDROCLR BC <UNK>
Prediction: 124 	Correct Output: 0 

Input: <UNK> <UNK> PASTE <UNK>
Prediction: 87 	Correct Output: 0 

Input: <UNK> CN1QT IMRON 6600 CT BASECOAT
Prediction: 11 	Correct Output: 0 

Input: <UNK> CN0.5LT STANDOX ADDITIVE MAROON
Prediction: 124 	Correct Output: 0 

Input: <UNK> BO1PT SATIN PEARL
Prediction: 40 	Correct Output: 0 

Input: 2001 CN1QT LMB 4:1 CLEAR FAST ACT
Prediction: 164 	Correct Output: 0 

Input: <UNK> CN1HP ULTRA <UNK> CLEAR ACTIVATOR
Prediction: 164 	Correct Output: 0 

Input: <UNK> EAXX CROMAX EX WBC <UNK> PT LBL
Prediction: 164 	Correct Output: 0 

Input: <UNK> CN1QT IMRON <UNK> PRO BC MIX
Prediction: 4 	Correct Output: 0 

Input: <UNK> <UNK> <UNK>
Prediction: 87 	Correct Output: 0 

Input: <UNK> <UNK> <UNK>
Prediction: 87 	Correct Output: 0 

Input: <UNK> CN01 <UNK>
Prediction: 87 	Correct Output: 0 

Input: <UNK> CN1GA BLACK PEARL FUL-THANE
Prediction: 153 	Correct Output: 0 

Input: <UNK> SCXX GLOSS WHITE AEROSOL
Prediction: 

# Step 4: Train a Recurrent Neural Network (RNN)
We now build a text classification model that is based on **recurrences** or auto-regression between hidden states.

In [None]:
class RNN(nn.Module):
    """GRU text classifier: embedding -> (bi)directional GRU -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Dimension of each word embedding (GRU input size).
        hidden_size: GRU hidden size per direction.
        num_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, used between GRU layers and before the
            linear layer.
        num_classes: Number of output units of the linear layer.
        pad_idx: Vocabulary index passed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, bidirectional, dropout, num_classes, pad_idx):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Embedding layer (https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)
        #   that represents the words in the vocabulary.
        self.embedlayer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size, padding_idx=pad_idx)

        # Recurrent network (nn.GRU) with batch_first=True.
        # nn.GRU applies dropout only between stacked layers and warns when a
        #   non-zero value is given for a single layer, so pass 0 in that case.
        self.rnnlayer = nn.GRU(
            input_size=embed_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )

        # Dropout applied to the sequence summary before classification.
        self.dropoutlayer = nn.Dropout(p=dropout, inplace=False)

        # Linear layer with num_classes units. Its input is the final hidden state
        #   of the last GRU layer; in the bidirectional case the forward and
        #   backward final states are concatenated, doubling the feature size.
        num_directions = 2 if bidirectional else 1
        self.linearlayer = nn.Linear(in_features=self.hidden_size * num_directions, out_features=num_classes, bias=True)

    def forward(self, texts):
        """texts: LongTensor [batch_size, MAX_LEN]
        Returns output: Tensor [batch_size, num_classes]"""

        # Word ids -> word embeddings, shape [batch_size, max_len, embed_size].
        embedded = self.embedlayer(texts)

        # h_n has shape [num_dirs * num_layers, batch_size, hidden_size].
        # NOTE(review): sequences are not packed, so any padded positions are
        #   fed through the GRU as well — confirm this is acceptable.
        _, h_n = self.rnnlayer(embedded)

        # Summarize each sequence by the final hidden state of the LAST layer.
        # Slicing the output at the last timestep is only correct for the forward
        #   direction: the backward direction has seen a single token there, and
        #   its final state (after reading the whole sequence) is h_n[-1].
        #   Resulting shape: [batch_size, num_dirs * hidden_size].
        if self.rnnlayer.bidirectional:
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            # Identical to the output at the last timestep for one direction.
            summary = h_n[-1]

        # Apply dropout.
        summary = self.dropoutlayer(summary)

        # Resulting shape: [batch_size, num_classes].
        ##### NOTE: Do not apply a sigmoid or softmax to the final output - done in training method!
        return self.linearlayer(summary)

## Train RNN Model
First, we initialize the train and test dataloaders.

In [None]:
if __name__ == '__main__':
    THRESHOLD = 1    # count to surpass to make it into vocab
    MAX_LEN = 8      # 'window' of description tokens
    BATCH_SIZE = 32

    # Build both datasets first; the test set reuses the vocabulary and label
    # mappings created by the training set.
    train_Ds = TextDataset(train_data1, 'train', THRESHOLD, MAX_LEN)
    test_Ds = TextDataset(
        test_data1,
        'test',
        THRESHOLD,
        MAX_LEN,
        train_Ds.idx2word,
        train_Ds.word2idx,
        train_Ds.label2idx,
    )

    # Training: shuffled mini-batches, incomplete final batch dropped.
    train_loader = torch.utils.data.DataLoader(
        train_Ds,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )
    # Testing: one example per batch, in dataset order, nothing dropped.
    test_loader = torch.utils.data.DataLoader(
        test_Ds,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )

In [None]:
if __name__ == '__main__':
    # Build the RNN from the training dataset's vocabulary/label sizes and
    # place it on the active device in one step.
    rnn_model = RNN(
        vocab_size=train_Ds.vocab_size,
        embed_size=64,
        hidden_size=64,
        num_layers=2,
        bidirectional=True,
        dropout=0.5,
        num_classes=train_Ds.label_size,
        pad_idx=train_Ds.word2idx[PAD],
    ).to(device)

    print('The model has {:,d} trainable parameters'.format(count_parameters(rnn_model)))

The model has 571,093 trainable parameters


In [None]:
if __name__ == '__main__':
    # Step size handed to the Adam optimizer below (tweakable).
    LEARNING_RATE = 5e-4

    # Loss function: cross-entropy, moved to the active device.
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Optimizer: Adam over every parameter of the RNN model.
    optimizer = optim.Adam(params=rnn_model.parameters(), lr=LEARNING_RATE)

In [None]:
if __name__ == '__main__':
    # Number of epochs passed to train_model (tweakable).
    N_EPOCHS = 50

    # Fit the RNN on the training loader using the optimizer and loss defined above.
    train_model(
        rnn_model,
        N_EPOCHS,
        train_loader,
        optimizer,
        criterion,
    )

Training Model...


  0%|          | 0/50 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 2.3760	 Train Accuracy: 47.36%
[TRAIN]	 Epoch:  2	 Loss: 2.2171	 Train Accuracy: 50.62%
[TRAIN]	 Epoch:  3	 Loss: 2.0612	 Train Accuracy: 54.29%
[TRAIN]	 Epoch:  4	 Loss: 1.9362	 Train Accuracy: 56.19%
[TRAIN]	 Epoch:  5	 Loss: 1.8445	 Train Accuracy: 58.25%
[TRAIN]	 Epoch:  6	 Loss: 1.7273	 Train Accuracy: 59.54%
[TRAIN]	 Epoch:  7	 Loss: 1.6342	 Train Accuracy: 61.10%
[TRAIN]	 Epoch:  8	 Loss: 1.5511	 Train Accuracy: 63.14%
[TRAIN]	 Epoch:  9	 Loss: 1.4620	 Train Accuracy: 65.23%
[TRAIN]	 Epoch: 10	 Loss: 1.4004	 Train Accuracy: 66.67%
[TRAIN]	 Epoch: 11	 Loss: 1.3252	 Train Accuracy: 67.95%
[TRAIN]	 Epoch: 12	 Loss: 1.2576	 Train Accuracy: 69.50%
[TRAIN]	 Epoch: 13	 Loss: 1.2123	 Train Accuracy: 70.71%
[TRAIN]	 Epoch: 14	 Loss: 1.1451	 Train Accuracy: 72.06%
[TRAIN]	 Epoch: 15	 Loss: 1.0886	 Train Accuracy: 73.09%
[TRAIN]	 Epoch: 16	 Loss: 1.0500	 Train Accuracy: 74.31%
[TRAIN]	 Epoch: 17	 Loss: 1.0008	 Train Accuracy: 75.43%
[TRAIN]	 Epoch: 18	 Loss: 0.945

## Evaluate RNN Model

Now we can evaluate the RNN. 

In [None]:
if __name__=='__main__':
    # Compute test data accuracy. evaluate() takes the dataset as a required
    # fourth argument (it calls dataset.get_labeltext on every prediction), so
    # the test dataset must be passed here as well.
    evaluate(rnn_model, test_loader, criterion, test_Ds)

Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/1284 [00:00<?, ?it/s]

Input: <UNK> <UNK> LT RAL1028 HB -COMB
Prediction: 84 	Correct Output: 242 

Input: ALESTA EE | RAL5001 GREEN BLUE SOLID SEMIGLOSS
Prediction: 140 	Correct Output: 140 

Input: ALESTA EP | RAL1021 YELLOW SOLID GLOSS
Prediction: 21 	Correct Output: 21 

Input: <UNK> TINT RED VIOLET
Prediction: 99 	Correct Output: 99 

Input: CROMAX | G2-7600SX CHROMACLEAR 4LT
Prediction: 136 	Correct Output: 136 

Input: 4210-01 N4LT NASON PU SURFACER EX
Prediction: 82 	Correct Output: 82 

Input: <UNK> N20LT TSA DAZZLING WHITE VE-T
Prediction: 30 	Correct Output: 235 

[TEST]	 Loss: 1.9357	 Accuracy: 65.89%


This cell saves the trained models to Google Drive.

In [None]:
if __name__=='__main__':
    from google.colab import drive
    drive.mount('/content/drive')
    print()

    # A model is saved only if it has been created in this session. Check the
    # namespace explicitly rather than probing the name inside a bare `except`,
    # which would also hide unrelated errors.
    cnn_exists = 'cnn_model' in globals()
    rnn_exists = 'rnn_model' in globals()

    # torch.save on the module object pickles the whole model, not just its weights.
    if cnn_exists:
        print("Saving CNN model....") 
        torch.save(cnn_model, "drive/My Drive/cnn.pt")
    if rnn_exists:
        print("Saving RNN model....") 
        torch.save(rnn_model, "drive/My Drive/rnn.pt")
    print("Done!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Saving CNN model....
Saving RNN model....
Done!


# Save with pickle


In [None]:
import os

# Save only the CNN weights (state_dict). MODELPATH has no trailing slash, so
# plain string concatenation would write to '<MODELPATH>cnnmodel.pth' next to
# the data folder instead of inside it; os.path.join inserts the separator.
# NOTE(review): any code that loads this checkpoint must build the path the same way.
torch.save(cnn_model.state_dict(), os.path.join(MODELPATH, MODELFILENAME))

In [None]:
# EXPORTER: write the collected `output` values to a CSV file and download it.
import csv
from google.colab import files

# newline='' is what the csv module expects for files it writes; without it,
# extra carriage returns can appear on platforms that translate line endings.
with open('CNNoutput.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    # All values are written as a single quoted row.
    wr.writerow(output)

files.download('CNNoutput.csv')