In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix as cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
import torch.nn.functional as F

In [2]:
class SVDWordVectors:
    def __init__(self, train_file, num_samples=10000, k=100, save_path='svd_word_vectors.pt', context_size=2):
        self.train_file = train_file
        self.num_samples = num_samples
        self.k = k
        self.save_path = save_path
        self.context_size = context_size
        self.word_vectors = None
        self.vectorizer = None
        self.vocabulary = None
        self.word_to_index = None  # Mapping from word to index
        self.unknown_token = '<UNK>'
        self.unknown_vector = None  # Initialize unknown vector

    def load_dataset(self, file_path, num_samples=None):
        if num_samples:
            data = pd.read_csv(file_path, nrows=num_samples)
        else:
            data = pd.read_csv(file_path)
        return data

    def preprocess_data(self, data):
        corpus = data['Description']
        return corpus

    def build_co_occurrence_matrix(self, corpus):
        self.vectorizer = CountVectorizer()
        X = self.vectorizer.fit_transform(corpus)
        num_words = len(self.vectorizer.vocabulary_)

        co_occurrence_matrix = np.zeros((num_words, num_words), dtype=np.int32)

        for doc in corpus:
            tokens = doc.split()
            for i, target_word in enumerate(tokens):
                target_index = self.vectorizer.vocabulary_.get(target_word)
                if target_index is None:
                    continue

                # Iterate over the context window around the target word
                for j in range(max(0, i - self.context_size), min(len(tokens), i + self.context_size + 1)):
                    if j != i:
                        context_word = tokens[j]
                        context_index = self.vectorizer.vocabulary_.get(context_word)
                        if context_index is not None:
                            co_occurrence_matrix[target_index, context_index] += 1

        coo_matrix = cm(co_occurrence_matrix)

        self.vocabulary = self.vectorizer.vocabulary_
        self.vocabulary[self.unknown_token] = len(self.vocabulary)

        self.unknown_vector = torch.zeros(self.k)

        print("Size of CountVectorizer vocabulary:", len(self.vocabulary))
        print("Shape of X matrix:", X.shape)
        print("Shape of co-occurrence matrix (coo_matrix):", coo_matrix.shape)
        print("Shape of unknown vector:", self.unknown_vector.shape)

        return coo_matrix

    def apply_svd(self, coo_matrix):
        svd = TruncatedSVD(self.k, n_iter=10)
        word_vectors_svd = svd.fit_transform(coo_matrix)
        return word_vectors_svd

    def train(self):
        data = self.load_dataset(self.train_file, self.num_samples)
        corpus = self.preprocess_data(data)
        coo_matrix = self.build_co_occurrence_matrix(corpus)
        self.word_vectors = self.apply_svd(coo_matrix)
        self.word_vectors = torch.tensor(self.word_vectors)  # Convert numpy array to PyTorch tensor
        self.word_vectors = torch.cat((self.word_vectors, self.unknown_vector.unsqueeze(0)), dim=0)
        self.word_vectors = self.word_vectors.to(torch.float32)  # Convert tensor to torch.float32

        # Create mapping from word to index
        self.word_to_index = {word: idx for word, idx in self.vocabulary.items()}

        self.save_word_vectors(self.word_vectors, self.word_to_index)  # Save both word vectors and mapping

    def save_word_vectors(self, word_vectors, word_to_index):
        # Save both word vectors and mapping
        torch.save({'word_vectors': word_vectors, 'word_to_index': word_to_index}, self.save_path)

    def load_word_vectors(self):
        # Load both word vectors and mapping
        checkpoint = torch.load(self.save_path)
        self.word_vectors = checkpoint['word_vectors']
        self.word_to_index = checkpoint['word_to_index']


In [3]:
svd_model = SVDWordVectors(train_file="data/train.csv", num_samples=None, k=100, save_path='svd_word_vectors.pt',context_size=5)
svd_model.train()

Size of CountVectorizer vocabulary: 60735
Shape of X matrix: (120000, 60734)
Shape of co-occurrence matrix (coo_matrix): (60734, 60734)
Shape of unknown vector: torch.Size([100])


In [4]:
if svd_model.vocabulary is not None:
    print("Size of vocabulary:", len(svd_model.vocabulary))
else:
    print("Vocabulary not initialized.")

# Print size of word vectors
if svd_model.word_vectors is not None:
    print("Size of word vectors:", svd_model.word_vectors.shape)
else:
    print("Word vectors not initialized.")
print("world vectors",svd_model.word_vectors)


# Assuming your word vectors tensor is named word_vectors_tensor
num_rows_with_zeros = torch.sum(torch.all(svd_model.word_vectors == 0, dim=1)).item()
print("Number of rows with all zero values:", num_rows_with_zeros)



Size of vocabulary: 60735
Size of word vectors: torch.Size([60735, 100])
world vectors tensor([[ 1.1744e+00, -9.0805e-02, -4.0689e-01,  ..., -2.4205e-02,
          4.1580e-02, -8.5038e-02],
        [ 1.0647e+01, -5.5095e-01,  1.1532e+00,  ..., -2.9955e-01,
         -2.5240e-02,  2.6508e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 4.1421e-01,  5.8811e-01, -3.5414e-01,  ..., -6.6696e-03,
          9.5885e-03,  8.4931e-02],
        [ 2.1136e-02,  1.2461e-02,  2.7019e-02,  ...,  1.7345e-01,
          5.4228e-03, -3.2105e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])
Number of rows with all zero values: 28847


In [5]:
# class MyDataset(Dataset):
#     def __init__(self, features, labels):
#         self.features = features
#         self.labels = labels

#     def __len__(self):
#         return len(self.features)

#     def __getitem__(self, idx):
#         return self.features[idx], self.labels[idx]

# class RNNClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(RNNClassifier, self).__init__()
#         self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.rnn(x.unsqueeze(0))
#         out = self.fc(out[:, -1, :])
#         return out



In [30]:
class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x.unsqueeze(0))
        out = self.fc(hn[-1, :, :])
        pred = F.log_softmax(out, dim=1)  # Compute log probabilities with softmax
        return pred

class SVDClassifier:
    def __init__(self, word_vectors_file, num_classes):
        self.word_vectors, self.word_to_index = self.load_word_vectors(word_vectors_file)
        self.num_classes = num_classes
        self.lstm_model = LSTMClassifier(input_size=self.word_vectors.size(1), hidden_size=128, output_size=num_classes)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lstm_model.to(self.device)

    def load_word_vectors(self, file_path):
        saved_data = torch.load(file_path)
        word_vectors = saved_data['word_vectors']
        word_to_index = saved_data['word_to_index']
        return word_vectors, word_to_index

    def train(self, train_file, test_file, num_samples=10000, epochs=10):
        # Load the training data once
        train_data = self.load_dataset(train_file, num_samples)
        train_corpus = self.preprocess_data(train_data)
        train_features = self.convert_to_features(train_corpus)
        train_labels = train_data['Class Index']
        train_dataset = MyDataset(train_features, train_labels)
        train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
        num_train_batches = len(train_dataloader)

        # Load the test data once
        test_data = self.load_dataset(test_file)
        test_corpus = self.preprocess_data(test_data)
        test_features = self.convert_to_features(test_corpus)
        test_labels = test_data['Class Index']
        test_dataset = MyDataset(test_features, test_labels)
        test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
        num_test_batches = len(test_dataloader)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.lstm_model.parameters(), lr=0.0005)
        train_losses = []
        test_accuracies = []
        
        for epoch in range(epochs):        
            self.lstm_model.train()
            total_loss = 0.0
            progress_bar = tqdm(enumerate(train_dataloader), total=num_train_batches, desc=f'Epoch {epoch+1}/{epochs}')
            for step, (inputs, labels) in progress_bar:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                outputs = self.lstm_model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            train_loss = total_loss / num_train_batches
            train_losses.append(train_loss)
            test_accuracy = self.evaluate(test_dataloader)
            test_accuracies.append(test_accuracy)
            
            print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

        return train_losses, test_accuracies

    def evaluate(self, dataloader):
        self.lstm_model.eval()
        predictions = []
        true_labels = []
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.lstm_model(inputs)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.tolist())
                true_labels.extend(labels.tolist())
#         unique_true_labels = set(true_labels)
#         unique_predictions = set(predictions)

#         print("Unique true labels:", unique_true_labels)
#         print("Unique predictions:", unique_predictions)
        accuracy = accuracy_score(true_labels, predictions)
        return accuracy

    def test(self, test_file):
        data = self.load_dataset(test_file)
        corpus = self.preprocess_data(data)
        features = self.convert_to_features(corpus)
        labels = data['Class Index']

        dataset = MyDataset(features, labels)
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
        num_batches = len(dataloader)

        self.lstm_model.eval()
        predictions = []
        true_labels = []

        with torch.no_grad():
            progress_bar = tqdm(enumerate(dataloader), total=num_batches)
            for step, (inputs, labels) in progress_bar:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.lstm_model(inputs)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.tolist())
                true_labels.extend(labels.tolist())
                progress_bar.set_description('Testing')

        accuracy = accuracy_score(true_labels, predictions)
        return accuracy

    def convert_to_features(self, corpus):
        features = []
        unk_index = self.word_to_index.get('<UNK>')
        unknown_vector = self.word_vectors[unk_index]
        for text in corpus:
            encoded_text = []
            for token in text.split():
                idx = self.word_to_index.get(token)
                if idx is not None:
                    encoded_text.append(self.word_vectors[idx])
                else:
                    encoded_text.append(unknown_vector)
            encoded_text_avg = torch.stack(encoded_text).mean(dim=0)
            features.append(encoded_text_avg)

        return torch.stack(features)

    def load_dataset(self, file_path, num_samples=None, reduce_label=True):
        if num_samples:
            data = pd.read_csv(file_path, nrows=num_samples)
        else:
            data = pd.read_csv(file_path)

        if reduce_label:
            data['Class Index'] -= 1  # Reduce 1 from each label

        return data

    def preprocess_data(self, data):
        corpus = data['Description']
        return corpus


In [8]:
word_vectors_file = "svd_word_vectors.pt"
train_file = "data/train.csv"
test_file = "data/test.csv"
svd_classifier = SVDClassifier(word_vectors_file)
train_losses, test_accuracies = svd_classifier.train(train_file, test_file,num_samples=None, epochs=30)

Epoch 1/30: 100%|█████████████████████| 120000/120000 [01:40<00:00, 1190.26it/s]


Epoch 1/30, Train Loss: 1.3323, Test Accuracy: 0.4232


Epoch 2/30: 100%|█████████████████████| 120000/120000 [01:55<00:00, 1035.20it/s]


Epoch 2/30, Train Loss: 1.2476, Test Accuracy: 0.4151


Epoch 3/30: 100%|██████████████████████| 120000/120000 [02:21<00:00, 849.54it/s]


Epoch 3/30, Train Loss: 1.2182, Test Accuracy: 0.4680


Epoch 4/30: 100%|█████████████████████| 120000/120000 [01:57<00:00, 1017.80it/s]


Epoch 4/30, Train Loss: 1.2016, Test Accuracy: 0.5114


Epoch 5/30: 100%|█████████████████████| 120000/120000 [01:56<00:00, 1032.03it/s]


Epoch 5/30, Train Loss: 1.1723, Test Accuracy: 0.4595


Epoch 6/30: 100%|█████████████████████| 120000/120000 [01:58<00:00, 1010.32it/s]


Epoch 6/30, Train Loss: 1.1700, Test Accuracy: 0.4517


Epoch 7/30: 100%|█████████████████████| 120000/120000 [01:58<00:00, 1016.49it/s]


Epoch 7/30, Train Loss: 1.1555, Test Accuracy: 0.4878


Epoch 8/30: 100%|█████████████████████| 120000/120000 [01:59<00:00, 1001.13it/s]


Epoch 8/30, Train Loss: 1.1393, Test Accuracy: 0.5367


Epoch 9/30: 100%|█████████████████████| 120000/120000 [01:57<00:00, 1017.43it/s]


Epoch 9/30, Train Loss: 1.1173, Test Accuracy: 0.4959


Epoch 10/30: 100%|████████████████████| 120000/120000 [01:56<00:00, 1026.87it/s]


Epoch 10/30, Train Loss: 1.1012, Test Accuracy: 0.5209


Epoch 11/30: 100%|████████████████████| 120000/120000 [01:58<00:00, 1008.52it/s]


Epoch 11/30, Train Loss: 1.0858, Test Accuracy: 0.5587


Epoch 12/30: 100%|████████████████████| 120000/120000 [01:59<00:00, 1007.06it/s]


Epoch 12/30, Train Loss: 1.0759, Test Accuracy: 0.5729


Epoch 13/30: 100%|████████████████████| 120000/120000 [01:59<00:00, 1006.40it/s]


Epoch 13/30, Train Loss: 1.0607, Test Accuracy: 0.5318


Epoch 14/30: 100%|█████████████████████| 120000/120000 [02:10<00:00, 921.58it/s]


Epoch 14/30, Train Loss: 1.0627, Test Accuracy: 0.5787


Epoch 15/30: 100%|█████████████████████| 120000/120000 [02:17<00:00, 872.03it/s]


Epoch 15/30, Train Loss: 1.0679, Test Accuracy: 0.5561


Epoch 16/30: 100%|█████████████████████| 120000/120000 [02:12<00:00, 903.57it/s]


Epoch 16/30, Train Loss: 1.0522, Test Accuracy: 0.5428


Epoch 17/30: 100%|█████████████████████| 120000/120000 [02:01<00:00, 986.77it/s]


Epoch 17/30, Train Loss: 1.0475, Test Accuracy: 0.5789


Epoch 18/30: 100%|█████████████████████| 120000/120000 [02:09<00:00, 924.80it/s]


Epoch 18/30, Train Loss: 1.0270, Test Accuracy: 0.5746


Epoch 19/30: 100%|█████████████████████| 120000/120000 [02:07<00:00, 941.90it/s]


Epoch 19/30, Train Loss: 1.0205, Test Accuracy: 0.5612


Epoch 20/30: 100%|█████████████████████| 120000/120000 [02:04<00:00, 961.77it/s]


Epoch 20/30, Train Loss: 1.0170, Test Accuracy: 0.5968


Epoch 21/30: 100%|█████████████████████| 120000/120000 [02:05<00:00, 953.85it/s]


Epoch 21/30, Train Loss: 1.0123, Test Accuracy: 0.5924


Epoch 22/30: 100%|█████████████████████| 120000/120000 [02:04<00:00, 961.16it/s]


Epoch 22/30, Train Loss: 1.0026, Test Accuracy: 0.6082


Epoch 23/30: 100%|█████████████████████| 120000/120000 [02:06<00:00, 949.57it/s]


Epoch 23/30, Train Loss: 1.0057, Test Accuracy: 0.6124


Epoch 24/30: 100%|█████████████████████| 120000/120000 [02:04<00:00, 966.26it/s]


Epoch 24/30, Train Loss: 0.9883, Test Accuracy: 0.5837


Epoch 25/30: 100%|█████████████████████| 120000/120000 [02:04<00:00, 961.24it/s]


Epoch 25/30, Train Loss: 0.9942, Test Accuracy: 0.6139


Epoch 26/30: 100%|█████████████████████| 120000/120000 [02:14<00:00, 894.69it/s]


Epoch 26/30, Train Loss: 0.9771, Test Accuracy: 0.6155


Epoch 27/30: 100%|█████████████████████| 120000/120000 [02:10<00:00, 918.69it/s]


Epoch 27/30, Train Loss: 0.9802, Test Accuracy: 0.5870


Epoch 28/30: 100%|█████████████████████| 120000/120000 [02:13<00:00, 898.40it/s]


Epoch 28/30, Train Loss: 0.9692, Test Accuracy: 0.6057


Epoch 29/30: 100%|█████████████████████| 120000/120000 [02:13<00:00, 902.09it/s]


Epoch 29/30, Train Loss: 0.9653, Test Accuracy: 0.6361


Epoch 30/30: 100%|█████████████████████| 120000/120000 [02:09<00:00, 928.44it/s]


Epoch 30/30, Train Loss: 0.9634, Test Accuracy: 0.6224


In [9]:
accuracy = svd_classifier.test(test_file)
print("Test Accuracy:", accuracy)

Testing:  38%|██████████▋                 | 2911/7600 [00:01<00:03, 1482.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Testing: 100%|████████████████████████████| 7600/7600 [00:04<00:00, 1650.52it/s]


Test Accuracy: 0.6223684210526316


In [14]:
new_train_losses, new_test_accuracies = svd_classifier.train(train_file, test_file,num_samples=None, epochs=30)

Epoch 1/30: 100%|█████████████████████| 120000/120000 [01:41<00:00, 1177.50it/s]


Epoch 1/30, Train Loss: 0.9679, Test Accuracy: 0.5954


Epoch 2/30: 100%|█████████████████████| 120000/120000 [01:47<00:00, 1114.62it/s]


Epoch 2/30, Train Loss: 0.9592, Test Accuracy: 0.6187


Epoch 3/30: 100%|█████████████████████| 120000/120000 [01:46<00:00, 1127.26it/s]


Epoch 3/30, Train Loss: 0.9638, Test Accuracy: 0.6116


Epoch 4/30: 100%|██████████████████████| 120000/120000 [02:06<00:00, 946.57it/s]


Epoch 4/30, Train Loss: 0.9511, Test Accuracy: 0.6039


Epoch 5/30: 100%|██████████████████████| 120000/120000 [02:12<00:00, 906.43it/s]


Epoch 5/30, Train Loss: 0.9544, Test Accuracy: 0.6170


Epoch 6/30: 100%|██████████████████████| 120000/120000 [04:34<00:00, 436.98it/s]


Epoch 6/30, Train Loss: 0.9449, Test Accuracy: 0.6263


Epoch 7/30: 100%|██████████████████████| 120000/120000 [07:44<00:00, 258.58it/s]


Epoch 7/30, Train Loss: 0.9496, Test Accuracy: 0.6395


Epoch 8/30: 100%|██████████████████████| 120000/120000 [03:42<00:00, 538.56it/s]


Epoch 8/30, Train Loss: 0.9423, Test Accuracy: 0.6175


Epoch 9/30: 100%|██████████████████████| 120000/120000 [04:19<00:00, 463.08it/s]


Epoch 9/30, Train Loss: 0.9416, Test Accuracy: 0.6305


Epoch 10/30: 100%|██████████████████████| 120000/120000 [43:25<00:00, 46.06it/s]


Epoch 10/30, Train Loss: 0.9284, Test Accuracy: 0.6205


Epoch 11/30: 100%|████████████████████| 120000/120000 [01:34<00:00, 1275.35it/s]


Epoch 11/30, Train Loss: 0.9281, Test Accuracy: 0.6241


Epoch 12/30: 100%|████████████████████| 120000/120000 [01:49<00:00, 1096.98it/s]


Epoch 12/30, Train Loss: 0.9290, Test Accuracy: 0.6422


Epoch 13/30: 100%|████████████████████| 120000/120000 [01:49<00:00, 1097.58it/s]


Epoch 13/30, Train Loss: 0.9259, Test Accuracy: 0.6205


Epoch 14/30: 100%|████████████████████| 120000/120000 [01:50<00:00, 1082.04it/s]


Epoch 14/30, Train Loss: 0.9286, Test Accuracy: 0.6301


Epoch 15/30: 100%|████████████████████| 120000/120000 [01:48<00:00, 1104.65it/s]


Epoch 15/30, Train Loss: 0.9160, Test Accuracy: 0.6470


Epoch 16/30: 100%|█████████████████████| 120000/120000 [02:00<00:00, 997.73it/s]


Epoch 16/30, Train Loss: 0.9212, Test Accuracy: 0.6459


Epoch 17/30: 100%|████████████████████| 120000/120000 [01:58<00:00, 1014.50it/s]


Epoch 17/30, Train Loss: 0.9390, Test Accuracy: 0.6342


Epoch 18/30: 100%|████████████████████| 120000/120000 [01:59<00:00, 1004.53it/s]


Epoch 18/30, Train Loss: 0.9148, Test Accuracy: 0.6387


Epoch 19/30: 100%|█████████████████████| 120000/120000 [02:00<00:00, 996.51it/s]


Epoch 19/30, Train Loss: 0.9113, Test Accuracy: 0.6266


Epoch 20/30: 100%|█████████████████████| 120000/120000 [02:16<00:00, 878.91it/s]


Epoch 20/30, Train Loss: 0.9208, Test Accuracy: 0.6396


Epoch 21/30: 100%|█████████████████████| 120000/120000 [02:14<00:00, 893.15it/s]


Epoch 21/30, Train Loss: 0.9181, Test Accuracy: 0.6180


Epoch 22/30: 100%|█████████████████████| 120000/120000 [02:14<00:00, 890.53it/s]


Epoch 22/30, Train Loss: 0.9120, Test Accuracy: 0.6562


Epoch 23/30: 100%|█████████████████████| 120000/120000 [02:26<00:00, 817.51it/s]


Epoch 23/30, Train Loss: 0.9077, Test Accuracy: 0.6550


Epoch 24/30: 100%|█████████████████████| 120000/120000 [02:24<00:00, 833.16it/s]


Epoch 24/30, Train Loss: 0.9022, Test Accuracy: 0.6482


Epoch 25/30: 100%|█████████████████████| 120000/120000 [02:10<00:00, 918.18it/s]


Epoch 25/30, Train Loss: 0.9000, Test Accuracy: 0.6487


Epoch 26/30: 100%|█████████████████████| 120000/120000 [02:13<00:00, 900.06it/s]


Epoch 26/30, Train Loss: 0.8985, Test Accuracy: 0.6592


Epoch 27/30: 100%|█████████████████████| 120000/120000 [02:21<00:00, 847.82it/s]


Epoch 27/30, Train Loss: 0.8981, Test Accuracy: 0.6389


Epoch 28/30: 100%|█████████████████████| 120000/120000 [02:24<00:00, 829.66it/s]


Epoch 28/30, Train Loss: 0.8962, Test Accuracy: 0.6576


Epoch 29/30: 100%|█████████████████████| 120000/120000 [02:25<00:00, 825.60it/s]


Epoch 29/30, Train Loss: 0.8895, Test Accuracy: 0.6639


Epoch 30/30: 100%|█████████████████████| 120000/120000 [02:26<00:00, 820.07it/s]


Epoch 30/30, Train Loss: 0.8992, Test Accuracy: 0.6511


In [15]:
print("Train Losses:", new_train_losses)
print("Test Accuracies:", new_test_accuracies)

Train Losses: [0.9678528982470115, 0.9592357043679233, 0.963831799296987, 0.9510779040596042, 0.9543613822164244, 0.94492681563708, 0.949636579936806, 0.9422730533835092, 0.9416138626593246, 0.9283593326553107, 0.9281000342751268, 0.9290052500246297, 0.9258940331773531, 0.9286035618977053, 0.9160367057205964, 0.921181189148617, 0.9390360461639399, 0.9147918351107607, 0.9113090302782102, 0.9207529346615869, 0.9180920034519074, 0.9119872084254478, 0.9076967868652928, 0.9022340113365411, 0.899988057517223, 0.8984835013049863, 0.8981478054498997, 0.8961567221176256, 0.889503527512411, 0.8992308918277871]
Test Accuracies: [0.5953947368421053, 0.6186842105263158, 0.611578947368421, 0.6039473684210527, 0.6169736842105263, 0.6263157894736842, 0.6394736842105263, 0.6175, 0.6305263157894737, 0.6205263157894737, 0.6240789473684211, 0.6422368421052631, 0.6205263157894737, 0.6301315789473684, 0.6469736842105264, 0.645921052631579, 0.6342105263157894, 0.6386842105263157, 0.626578947368421, 0.6396052

In [17]:
def save_model(model, save_path):
    torch.save(model, save_path)

save_model(SVDClassifier, "svd-classification-model.pt")


In [20]:
data = pd.read_csv(train_file)
labels = data['Class Index']
unique_labels = labels.unique()
print("Unique Labels:", unique_labels)


Unique Labels: [3 4 2 1]


In [32]:
word_vectors_file = "svd_word_vectors.pt"
train_file = "data/train.csv"
test_file = "data/test.csv"
svd_classifier = SVDClassifier(word_vectors_file,num_classes = 4)
train_losses, test_accuracies = svd_classifier.train(train_file, test_file,num_samples=None, epochs=50)

Epoch 1/50: 100%|█████████████████████| 120000/120000 [01:29<00:00, 1340.82it/s]


Epoch 1/50, Train Loss: 1.3192, Test Accuracy: 0.4093


Epoch 2/50: 100%|█████████████████████| 120000/120000 [01:33<00:00, 1283.99it/s]


Epoch 2/50, Train Loss: 1.2407, Test Accuracy: 0.4591


Epoch 3/50: 100%|█████████████████████| 120000/120000 [01:38<00:00, 1217.67it/s]


Epoch 3/50, Train Loss: 1.2160, Test Accuracy: 0.4733


Epoch 4/50: 100%|█████████████████████| 120000/120000 [01:37<00:00, 1224.82it/s]


Epoch 4/50, Train Loss: 1.1887, Test Accuracy: 0.4601


Epoch 5/50: 100%|█████████████████████| 120000/120000 [01:39<00:00, 1209.94it/s]


Epoch 5/50, Train Loss: 1.1781, Test Accuracy: 0.5020


Epoch 6/50: 100%|█████████████████████| 120000/120000 [01:41<00:00, 1179.55it/s]


Epoch 6/50, Train Loss: 1.1513, Test Accuracy: 0.5062


Epoch 7/50: 100%|█████████████████████| 120000/120000 [01:40<00:00, 1195.27it/s]


Epoch 7/50, Train Loss: 1.1461, Test Accuracy: 0.5082


Epoch 8/50: 100%|█████████████████████| 120000/120000 [01:44<00:00, 1145.15it/s]


Epoch 8/50, Train Loss: 1.1243, Test Accuracy: 0.5403


Epoch 9/50: 100%|█████████████████████| 120000/120000 [01:39<00:00, 1201.15it/s]


Epoch 9/50, Train Loss: 1.1083, Test Accuracy: 0.5414


Epoch 10/50: 100%|████████████████████| 120000/120000 [01:39<00:00, 1202.66it/s]


Epoch 10/50, Train Loss: 1.1068, Test Accuracy: 0.5229


Epoch 11/50: 100%|████████████████████| 120000/120000 [01:40<00:00, 1199.84it/s]


Epoch 11/50, Train Loss: 1.0885, Test Accuracy: 0.5453


Epoch 12/50: 100%|████████████████████| 120000/120000 [01:40<00:00, 1193.26it/s]


Epoch 12/50, Train Loss: 1.0801, Test Accuracy: 0.5320


Epoch 13/50: 100%|████████████████████| 120000/120000 [01:48<00:00, 1105.96it/s]


Epoch 13/50, Train Loss: 1.0697, Test Accuracy: 0.5736


Epoch 14/50: 100%|█████████████████████| 120000/120000 [02:11<00:00, 911.00it/s]


Epoch 14/50, Train Loss: 1.0705, Test Accuracy: 0.5628


Epoch 15/50: 100%|█████████████████████| 120000/120000 [02:03<00:00, 971.71it/s]


Epoch 15/50, Train Loss: 1.0432, Test Accuracy: 0.5518


Epoch 16/50: 100%|████████████████████| 120000/120000 [01:56<00:00, 1029.06it/s]


Epoch 16/50, Train Loss: 1.0486, Test Accuracy: 0.5638


Epoch 17/50: 100%|████████████████████| 120000/120000 [01:57<00:00, 1018.87it/s]


Epoch 17/50, Train Loss: 1.0409, Test Accuracy: 0.5630


Epoch 18/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 980.92it/s]


Epoch 18/50, Train Loss: 1.0364, Test Accuracy: 0.5516


Epoch 19/50: 100%|████████████████████| 120000/120000 [01:59<00:00, 1000.48it/s]


Epoch 19/50, Train Loss: 1.0341, Test Accuracy: 0.5741


Epoch 20/50: 100%|█████████████████████| 120000/120000 [02:03<00:00, 972.84it/s]


Epoch 20/50, Train Loss: 1.0150, Test Accuracy: 0.5941


Epoch 21/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 982.54it/s]


Epoch 21/50, Train Loss: 1.0125, Test Accuracy: 0.5441


Epoch 22/50: 100%|█████████████████████| 120000/120000 [02:08<00:00, 932.69it/s]


Epoch 22/50, Train Loss: 1.0272, Test Accuracy: 0.5828


Epoch 23/50: 100%|█████████████████████| 120000/120000 [02:04<00:00, 960.66it/s]


Epoch 23/50, Train Loss: 1.0097, Test Accuracy: 0.6054


Epoch 24/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 978.74it/s]


Epoch 24/50, Train Loss: 0.9958, Test Accuracy: 0.5857


Epoch 25/50: 100%|█████████████████████| 120000/120000 [02:06<00:00, 950.94it/s]


Epoch 25/50, Train Loss: 0.9901, Test Accuracy: 0.6005


Epoch 26/50: 100%|████████████████████| 120000/120000 [01:56<00:00, 1029.01it/s]


Epoch 26/50, Train Loss: 0.9869, Test Accuracy: 0.5543


Epoch 27/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 987.35it/s]


Epoch 27/50, Train Loss: 0.9877, Test Accuracy: 0.5655


Epoch 28/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 989.06it/s]


Epoch 28/50, Train Loss: 0.9838, Test Accuracy: 0.5828


Epoch 29/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 988.70it/s]


Epoch 29/50, Train Loss: 0.9853, Test Accuracy: 0.5936


Epoch 30/50: 100%|█████████████████████| 120000/120000 [02:00<00:00, 998.36it/s]


Epoch 30/50, Train Loss: 0.9760, Test Accuracy: 0.6179


Epoch 31/50: 100%|█████████████████████| 120000/120000 [02:00<00:00, 993.78it/s]


Epoch 31/50, Train Loss: 0.9699, Test Accuracy: 0.6091


Epoch 32/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 979.70it/s]


Epoch 32/50, Train Loss: 0.9652, Test Accuracy: 0.5976


Epoch 33/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 982.02it/s]


Epoch 33/50, Train Loss: 0.9546, Test Accuracy: 0.6349


Epoch 34/50: 100%|█████████████████████| 120000/120000 [02:05<00:00, 956.45it/s]


Epoch 34/50, Train Loss: 0.9522, Test Accuracy: 0.6330


Epoch 35/50: 100%|█████████████████████| 120000/120000 [02:08<00:00, 936.35it/s]


Epoch 35/50, Train Loss: 0.9479, Test Accuracy: 0.6157


Epoch 36/50: 100%|█████████████████████| 120000/120000 [02:03<00:00, 967.89it/s]


Epoch 36/50, Train Loss: 0.9459, Test Accuracy: 0.5983


Epoch 37/50: 100%|█████████████████████| 120000/120000 [02:05<00:00, 959.15it/s]


Epoch 37/50, Train Loss: 0.9441, Test Accuracy: 0.6158


Epoch 38/50: 100%|█████████████████████| 120000/120000 [02:07<00:00, 944.04it/s]


Epoch 38/50, Train Loss: 0.9541, Test Accuracy: 0.6412


Epoch 39/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 987.77it/s]


Epoch 39/50, Train Loss: 0.9410, Test Accuracy: 0.6358


Epoch 40/50: 100%|████████████████████| 120000/120000 [01:59<00:00, 1006.87it/s]


Epoch 40/50, Train Loss: 0.9387, Test Accuracy: 0.6216


Epoch 41/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 987.55it/s]


Epoch 41/50, Train Loss: 0.9353, Test Accuracy: 0.6483


Epoch 42/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 988.64it/s]


Epoch 42/50, Train Loss: 0.9286, Test Accuracy: 0.6442


Epoch 43/50: 100%|█████████████████████| 120000/120000 [02:05<00:00, 959.98it/s]


Epoch 43/50, Train Loss: 0.9265, Test Accuracy: 0.6191


Epoch 44/50: 100%|█████████████████████| 120000/120000 [02:10<00:00, 917.65it/s]


Epoch 44/50, Train Loss: 0.9276, Test Accuracy: 0.6413


Epoch 45/50: 100%|█████████████████████| 120000/120000 [02:00<00:00, 992.72it/s]


Epoch 45/50, Train Loss: 0.9269, Test Accuracy: 0.6175


Epoch 46/50: 100%|█████████████████████| 120000/120000 [02:01<00:00, 990.66it/s]


Epoch 46/50, Train Loss: 0.9241, Test Accuracy: 0.6462


Epoch 47/50: 100%|█████████████████████| 120000/120000 [02:11<00:00, 910.02it/s]


Epoch 47/50, Train Loss: 0.9163, Test Accuracy: 0.6292


Epoch 48/50: 100%|█████████████████████| 120000/120000 [02:02<00:00, 982.27it/s]


Epoch 48/50, Train Loss: 0.9160, Test Accuracy: 0.6297


Epoch 49/50: 100%|█████████████████████| 120000/120000 [02:15<00:00, 887.20it/s]


Epoch 49/50, Train Loss: 0.9184, Test Accuracy: 0.6443


Epoch 50/50: 100%|█████████████████████| 120000/120000 [02:08<00:00, 931.47it/s]


Epoch 50/50, Train Loss: 0.9160, Test Accuracy: 0.6401


In [34]:
print("Train Losses : ", train_losses)
print("Test Accuracies : ", test_accuracies)
print("svd_classifier : ", svd_classifier)
print("model : ", svd_classifier.lstm_model)

Train Losses:  [1.319185847759588, 1.2406862185075127, 1.216024565227031, 1.1887025144385597, 1.1781422953819378, 1.151263598163091, 1.1460754629647742, 1.1242575633420362, 1.1082911797410817, 1.1067875672942984, 1.0885173919599287, 1.0800731354264543, 1.069661087381027, 1.070523568970037, 1.0432168941397053, 1.0486222660589453, 1.040889349045037, 1.0363616567934233, 1.0341057445957473, 1.0150033496369617, 1.0124590747318527, 1.0272043311694758, 1.0097269293994717, 0.9958120048182312, 0.9900760842635248, 0.986909789561388, 0.9876885674911405, 0.98384367720873, 0.9852648492087286, 0.9760174684975027, 0.9699258208847609, 0.965167124263303, 0.9545857548875396, 0.9521839107739805, 0.9478591730243827, 0.9458965884875734, 0.944070828191885, 0.9541106852967022, 0.941003862383826, 0.9386573875013308, 0.9353363515495594, 0.92862868745686, 0.9265108505553246, 0.927594084163119, 0.9269413372618222, 0.9240570746436421, 0.9163133149095473, 0.9160197858729028, 0.918394304872786, 0.9160192734322951]


In [42]:
def save_model(model, save_path):
    torch.save(model, save_path)

save_model(svd_classifier, "svd-classification-model.pt")

In [43]:
# Load the saved model
loaded_model = torch.load("svd-classification-model.pt")
test_file = "data/test.csv"
print(loaded_model.lstm_model)
# Provide the test_file argument when calling the test method
test_accuracy = loaded_model.test(test_file)
print("Test Accuracy:", test_accuracy)

LSTMClassifier(
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)


Testing:  37%|██████████▍                 | 2820/7600 [00:01<00:02, 1910.45it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Testing: 100%|████████████████████████████| 7600/7600 [00:03<00:00, 2093.98it/s]


Test Accuracy: 0.6401315789473684
