In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from utils import clean_text

In [13]:
# Toy sentiment dataset: each entry is a (text, label) pair with a 0/1 label.
d = [
    ("This article is awesome", 1),
    ("There are just too much words here", 0),
    ("The math is actually wrong here", 0),
    ("I really enjoy learning new stuff", 1),
    ("I am kinda lazy so I just skim these texts", 0),
    ("Who cares about AI?", 0),
    ("I will surely be a better person after reading this!", 1),
    ("The author is pretty cute :)", 1),
]

In [14]:
class Embeddings():
    """
    A class to read the word embedding file and to create the word embedding matrix.

    The embedding file is read as text, one entry per line: a word followed by
    its vector components, separated by single spaces.
    """

    def __init__(self, path, vector_dimension):
        """
        Store the file location and the expected vector length.

        Args:
            path: Path of the embedding text file.
            vector_dimension: Number of columns of the embedding matrix; vectors
                read from the file are expected to have this many components.
        """
        self.path = path
        self.vector_dimension = vector_dimension

    @staticmethod
    def get_coefs(word, *arr):
        """
        Pair a word with its vector.

        Args:
            word: The token.
            *arr: The vector components (numbers or numeric strings).

        Returns:
            A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
        """
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        """
        Read the embedding file into a dictionary.

        Blank lines are skipped. If a word appears more than once, the last
        occurrence wins.

        Returns:
            dict mapping each word to its float32 vector.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a vector component cannot be parsed as a float.
        """
        embeddings_index = {}
        # The context manager guarantees the file handle is closed, even if a
        # line fails to parse.
        with open(self.path, errors='ignore') as embedding_file:
            for line in embedding_file:
                # Drop the trailing newline / stray trailing blanks so they do
                # not end up inside the last component.
                stripped = line.rstrip()
                if not stripped:
                    continue
                word, vector = self.get_coefs(*stripped.split(" "))
                embeddings_index[word] = vector
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """
        A method to create the embedding matrix.

        Args:
            tokenizer: Object exposing ``word_index``, a mapping from word to
                integer row index.
            max_features: Largest row index to fill; the matrix has
                ``max_features + 1`` rows.

        Returns:
            NumPy array of shape ``(max_features + 1, vector_dimension)``.
            Rows for words that are absent from the embedding file, or whose
            vector length differs from ``vector_dimension``, stay all-zero.
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        expected_shape = (self.vector_dimension,)
        for word, index in tokenizer.word_index.items():
            # Skip (rather than stop at) out-of-range indices: nothing here
            # guarantees that word_index is iterated in increasing index order.
            if index > max_features:
                continue
            vector = model_embed.get(word)
            # Unknown words and malformed entries (e.g. a header line parsed as
            # a one-component vector) are left as zero rows instead of being
            # hidden behind a catch-all except or silently broadcast.
            if vector is None or vector.shape != expected_shape:
                continue
            embedding_matrix[index] = vector
        return embedding_matrix


In [15]:
# Build the embedding reader for the mini embedding file (2 components per vector).
embedding = Embeddings('embeddings/mini_embedding.txt', vector_dimension=2)
# NOTE(review): create_embedding_matrix is declared as
# create_embedding_matrix(self, tokenizer, max_features); this call passes
# neither argument and raises the TypeError shown in the cell output.
# No object exposing `word_index` is defined in the cells visible here —
# TODO supply a tokenizer and a max_features value before running this cell.
embedding_matrix = embedding.create_embedding_matrix()

TypeError: create_embedding_matrix() missing 2 required positional arguments: 'tokenizer' and 'max_features'

In [22]:
# Split the (text, label) pairs of `d` into parallel lists; texts are passed
# through clean_text, labels are kept as-is.
X_train = [clean_text(text) for text, _label in d]  # Text
Y_train = [label for _text, label in d]  # Label

# Preview the first five cleaned examples next to their labels.
for i in range(5):
    print("training data ", i + 1, " - ", X_train[i], "*** label: ", Y_train[i])

training data  1  -  this article awesome *** label:  1
training data  2  -  there are just too much words here *** label:  0
training data  3  -  math actually wrong here *** label:  0
training data  4  -  i really enjoy learning new stuff *** label:  1
training data  5  -  i am kinda lazy so i just skim these texts *** label:  0


In [31]:
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
# Two sample sentences handed to the encoder's constructor.
loaded_data = ["now this ain't funny", "so don't you dare laugh"]

# Tokenize by splitting each sentence on whitespace.
encoder = StaticTokenizerEncoder(
    loaded_data,
    tokenize=lambda sentence: sentence.split(),
)

# Encode every sample sentence with the encoder built above.
encoded_data = list(map(encoder.encode, loaded_data))



<torchnlp.encoders.text.static_tokenizer_encoder.StaticTokenizerEncoder object at 0x7fb462f5b5b0>
