In [None]:
# imports
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import gensim.downloader
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear model that returns unnormalised tag scores."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        """Create the embedding table, the LSTM and the output projection.

        Args:
            embedding_dim: width of each learned embedding vector.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            target_size: number of scores produced for every position.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        # Layers are created in embedding -> LSTM -> linear order.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(
            in_features=hidden_dim,
            out_features=target_size,
        )

    def forward(self, sentence):
        """Score every position of `sentence`.

        Args:
            sentence: tensor of integer indices into the embedding table.

        Returns:
            Tensor of raw (pre-softmax) scores whose last axis has
            `target_size` entries.
        """
        hidden_states = self.lstm(self.word_embeddings(sentence))[0]
        # Softmax is not applied here; the raw scores are returned as-is.
        # (The `F` in `F.softmax` conventionally refers to
        # `torch.nn.functional`, which this notebook does not import.)
        return self.hidden2tag(hidden_states)


In [None]:
# --- Hyperparameters -------------------------------------------------------
# NOTE(review): the word2vec set loaded later in this notebook is named
# 'word2vec-google-news-300'; EMBEDDING_DIM is 200 - confirm whether the two
# are meant to match.
EMBEDDING_DIM = 200
HIDDEN_DIM = 64
# These sizes make the model allocate a 3,000,000 x 200 embedding table and a
# 64 -> 3,000,000 output layer.
# NOTE(review): TARGET_SIZE is identical to VOCAB_SIZE - presumably a
# placeholder for the real number of tag classes; confirm.
VOCAB_SIZE = 3_000_000
TARGET_SIZE = 3_000_000
learning_rate = 1e-3

# --- Model, loss and optimiser ---------------------------------------------
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=VOCAB_SIZE,
    target_size=TARGET_SIZE,
)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def preprocess_dataset(df, test_size=0.3, random_state=None):
    """Split `df` into train/test sets, scale the features and encode the labels.

    Args:
        df: DataFrame containing a "tags" label column; every other column is
            passed to `StandardScaler` as a feature.
        test_size: fraction (or absolute count) of rows held out for testing,
            forwarded to `train_test_split`. Defaults to 0.3.
        random_state: seed forwarded to `train_test_split`. The default `None`
            gives a different split on every call; pass an int for a
            reproducible split.

    Returns:
        Tuple `(df_train_scaled, df_test_scaled, labels_train, labels_test)`:
        the scaled train/test feature arrays followed by the encoded
        train/test labels.
    """
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    labels_train = df_train["tags"]
    labels_test = df_test["tags"]
    # Remove the label *column*. Passing the Series itself to `drop` would be
    # interpreted as a list of row index labels to delete.
    features_train = df_train.drop(columns=["tags"])
    features_test = df_test.drop(columns=["tags"])

    # The scaler is fitted on the training split only and then applied to the
    # test split.
    standard_scaler = preprocessing.StandardScaler()
    df_train_scaled = standard_scaler.fit_transform(features_train)
    df_test_scaled = standard_scaler.transform(features_test)

    # The encoder is likewise fitted on the training labels only.
    label_encoder = preprocessing.LabelEncoder()
    labels_train = label_encoder.fit_transform(labels_train)
    labels_test = label_encoder.transform(labels_test)

    return df_train_scaled, df_test_scaled, labels_train, labels_test

class CustomDataset(Dataset):
    """Dataset that pairs each feature row with its label, both as float tensors."""

    def __init__(self, X, y):
        """Convert `X` and `y` to float tensors and keep them on the instance."""
        float_dtype = torch.float
        self.X = torch.tensor(X, dtype=float_dtype)
        self.y = torch.tensor(y, dtype=float_dtype)

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at `idx`."""
        features, label = self.X[idx], self.y[idx]
        return features, label

def intialise_loaders(X_train_scaled, y_train, X_test_scaled, y_test, batch_size=128):
    """Wrap the train and test arrays in `CustomDataset`s and build DataLoaders.

    Args:
        X_train_scaled: training features.
        y_train: training labels.
        X_test_scaled: test features.
        y_test: test labels.
        batch_size: number of samples per batch for both loaders.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Tuple `(train_dataloader, test_dataloader)`. Both loaders shuffle
        their dataset.
    """
    train_data = CustomDataset(X_train_scaled, y_train)
    test_data = CustomDataset(X_test_scaled, y_test)

    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    return train_dataloader, test_dataloader

# train_dataloader, test_dataloader = intialise_loaders(X_train_scaled, y_train, X_test_scaled, y_test)

In [18]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch and return the mean loss and the accuracy.

    Args:
        dataloader: yields `(X, y)` batches; `len(dataloader.dataset)` is used
            as the sample count.
        model: module called as `model(X)`.
        loss_fn: callable invoked as `loss_fn(pred, y.unsqueeze(1))`.
        optimizer: optimiser stepped once per batch.

    Returns:
        Tuple `(train_loss, correct)`: the loss averaged over batches and the
        fraction of samples for which `(pred > 0.5)` equals the target.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss, correct = 0.0, 0

    model.train()
    for X, y in dataloader:
        target = torch.unsqueeze(y, 1)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate plain Python numbers so no tensors are kept across batches.
        train_loss += loss.item()
        correct += torch.sum((pred > 0.5) == target).item()

    # Normalise once, after every batch has been seen. Doing this inside the
    # loop would rescale the running totals on every iteration.
    train_loss = train_loss / num_batches if num_batches else 0.0
    correct = float(correct) / size if size else 0.0

    return train_loss, correct
    

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, torch.unsqueeze(y,1)).item()
            correct += torch.sum((pred > 0.5) == torch.unsqueeze(y,1))

    test_loss /= num_batches
    correct = float(correct) / size
 
    return test_loss, correct


TypeError: enumerate() missing required argument 'iterable'

In [None]:
no_of_epochs = 3
accuracy_list = []  # training accuracy, one entry per epoch
loss_list = []      # mean training loss, one entry per epoch

# TODO(review): this cell originally left `data` unassigned. It must be an
# iterable that yields (sentence, tags) pairs accepted by `model` and
# `loss_fn`. The empty placeholder keeps the cell runnable until then.
data = []

for epoch in range(no_of_epochs):
    train_loss, correct = 0.0, 0
    num_batches, size = 0, 0

    for sentence, tags in data:
        pred = model(sentence)
        loss = loss_fn(pred, tags)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # NOTE(review): assumes the class scores are on the last axis of
        # `pred` and that `tags` holds integer class ids - confirm.
        correct += (pred.argmax(dim=-1) == tags).sum().item()
        num_batches += 1
        size += tags.numel()

    if num_batches == 0:
        print("No training data: assign `data` before running this cell.")
        break

    # Normalise once per epoch, after all batches have been processed.
    train_loss /= num_batches
    correct = float(correct) / size
    loss_list.append(train_loss)
    accuracy_list.append(correct)

# Understanding data inputs

In [10]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API
# and report the type of the returned object.
word2vec_google_news = gensim.downloader.load(
    'word2vec-google-news-300'
)
print(
    f"word2vec_google_news data type: {type(word2vec_google_news)}"
)


word2vec_google_news data type: <class 'gensim.models.keyedvectors.KeyedVectors'>


In [16]:
# Look up the embedding stored for the token 'school' in the loaded vectors.
word_embedding_for_school = word2vec_google_news['school']

In [17]:
# Show the vector itself, then its Python type and its shape, one per line.
print(
    f"Vector for school: {word_embedding_for_school}",
    f"type: {type(word_embedding_for_school)}",
    f"shape: {word_embedding_for_school.shape}",
    sep="\n",
)

Vector for school: [ 9.13085938e-02  2.81982422e-02  3.68652344e-02  2.18750000e-01
  9.76562500e-02 -6.59179688e-02  1.56250000e-01 -4.68750000e-02
 -4.66918945e-03 -9.08203125e-02  2.49023438e-01  4.56542969e-02
  6.64062500e-02 -9.86328125e-02 -1.05957031e-01  1.06445312e-01
 -2.00195312e-01  1.40991211e-02  1.36718750e-01 -8.78906250e-02
  2.05078125e-01  1.70898438e-01  1.16577148e-02  1.71875000e-01
  4.85229492e-03 -3.49609375e-01  2.27355957e-03  1.84570312e-01
  1.19628906e-01 -2.51464844e-02  1.02050781e-01 -1.06933594e-01
  4.00390625e-02  7.50732422e-03 -1.63085938e-01 -1.49414062e-01
  2.28515625e-01 -5.02929688e-02  4.63867188e-02  1.25000000e-01
  1.30615234e-02  1.27929688e-01  1.10351562e-01 -5.05371094e-02
  3.44238281e-02  1.44531250e-01  1.71875000e-01 -2.78320312e-02
 -1.10839844e-01  2.29492188e-01 -1.17187500e-01 -1.22070312e-01
 -3.55529785e-03 -1.13769531e-01  2.11914062e-01 -2.08984375e-01
 -1.04492188e-01  4.39453125e-02 -1.23291016e-02 -1.59179688e-01
 -8.83