In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch import nn, optim

In [2]:
training_data = [
        ("Veinte paginas".lower().split(), "Spanish"),
        ("I will visit the library".lower().split(), "English"),
        ("I am reading a book".lower().split(), "English"),
        ("This is my favourite chapter".lower().split(), "English"),
        ("Estoy en la biblioteca".lower().split(), "Spanish"),
        ("Tengo un libro".lower().split(), "Spanish")
        ]

test_data = [
        ("Estoy leyendo".lower().split(), "Spanish"),
        ("This is not my favourite book".lower().split(), "English")
        ]

In [3]:
word_dict = {}
i = 0
for words,language in training_data + test_data:
    for word in words:
        if word not in word_dict:
            word_dict[word]=i
            i+=1
print(word_dict)


{'veinte': 0, 'paginas': 1, 'i': 2, 'will': 3, 'visit': 4, 'the': 5, 'library': 6, 'am': 7, 'reading': 8, 'a': 9, 'book': 10, 'this': 11, 'is': 12, 'my': 13, 'favourite': 14, 'chapter': 15, 'estoy': 16, 'en': 17, 'la': 18, 'biblioteca': 19, 'tengo': 20, 'un': 21, 'libro': 22, 'leyendo': 23, 'not': 24}


Setting up the classifier

In [4]:
corpus_size = len(word_dict)
languages = 2
label_index = {"Spanish": 0, "English": 1}
class BagofWordsClassifier(nn.Module):
    
    def __init__(self,languages,corpus_size):
        super(BagofWordsClassifier,self).__init__()
        self.linear = nn.Linear(corpus_size,languages)
        
    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec),dim=1)

In [5]:
def make_bow_vector(sentence, word_index):
    word_vec = torch.zeros(corpus_size)
    for word in sentence:
        word_vec[word_dict[word]] += 1
    return word_vec.view(1, -1)

def make_target(label, label_index):
    return torch.LongTensor([label_index[label]])


In [6]:
# Stochatic Gradient descent(SGD)
model = BagofWordsClassifier(languages, corpus_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [7]:
# Training Classifier
for epoch in range(110):
    for sentence, label in training_data:
        model.zero_grad()
        bow_vec = make_bow_vector(sentence, word_dict)
        target = make_target(label, label_index)
        
        log_probs = model(bow_vec)
        
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
    if epoch%10==0:
        print('Epoch: ',str(epoch+1),' Loss: ' + str(loss.item()))

Epoch:  1  Loss: 0.5808020830154419
Epoch:  11  Loss: 0.1256256252527237
Epoch:  21  Loss: 0.0658452957868576
Epoch:  31  Loss: 0.04427177086472511
Epoch:  41  Loss: 0.033272817730903625
Epoch:  51  Loss: 0.026627372950315475
Epoch:  61  Loss: 0.022184422239661217
Epoch:  71  Loss: 0.019007330760359764
Epoch:  81  Loss: 0.01662343740463257
Epoch:  91  Loss: 0.01476938184350729
Epoch:  101  Loss: 0.01328650489449501


In [12]:
test = pd.read_csv("test.csv")
test_labels = test['label'].values
test = test.drop("label",axis=1).values.reshape(len(test),1,28,28)

x_test = torch.Tensor(test.astype(float))
y_test = torch.Tensor(test_labels).long()

In [13]:
model(x_test)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (5600x28 and 25x2)