In [321]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import re
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [84]:
    
class SkipGramNegativeSampling(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramNegativeSampling, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.init_emb()

    def init_emb(self):
        initrange = 0.5 / self.embeddings.weight.size(1)
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.out_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, center_words, target_words, neg_words):
        # Convert inputs to tensors if not already tensors
        if not isinstance(center_words, torch.Tensor):
            center_words = torch.tensor(center_words)
        if not isinstance(target_words, torch.Tensor):
            target_words = torch.tensor(target_words)
        
        embeds = self.embeddings(center_words)
        out_embeds = self.out_embeddings(target_words)
        neg_embeds = -self.out_embeddings(neg_words)
        
        score = torch.sum(torch.mul(embeds, out_embeds), dim=1)
        score = torch.sigmoid(score)
        
        neg_score = torch.bmm(neg_embeds, embeds.unsqueeze(2)).squeeze()
        neg_score = torch.sum(torch.log(torch.sigmoid(neg_score)), dim=1)
        
        return -torch.mean(torch.log(score) + neg_score)

class SkipGramWord2Vec:
    def __init__(self, embedding_dim=100, num_neg_samples=4, min_count=2, batch_size=128, learning_rate=0.001, num_epochs=5):
        self.embedding_dim = embedding_dim
        self.num_neg_samples = num_neg_samples
        self.min_count = min_count
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.word_counts = {}
        self.word_freqs = []
        self.model = None
        self.unk_token = '<unk>'
        self.unk_index = None  # Index for the <unk> token
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_dataset(self, file_path, num_samples=10000, reduce_label=True):
        if num_samples:
            data = pd.read_csv(file_path, nrows=num_samples)
        else:
            data = pd.read_csv(file_path)

        if reduce_label and 'Class Index' in data.columns:
            data['Class Index'] -= 1  # Reduce 1 from each label

        corpus = data['Description']
        return corpus

  
    def preprocess_data(self, corpus):
        cleaned_sentences = []
        for sentence in corpus:
            # Tokenize sentence using regular expression
            tokens = re.findall(r'\b\w+\b', sentence.lower())  # This regex tokenizes words, removing punctuation
            cleaned_sentences.append(tokens)
        return cleaned_sentences

    def build_vocab(self, sentences):
        word_counts = {}
        for sentence in sentences:
            for word in sentence:
                word_counts[word] = word_counts.get(word, 0) + 1
        word_counts = {word: count for word, count in word_counts.items() if count >= self.min_count}
        word_counts[self.unk_token] = 0
        sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
        self.word_to_idx = {word: idx + 1 for idx, (word, _) in enumerate(sorted_word_counts)}  # Shift indices by 1 to accommodate the <unk> token
        self.word_to_idx[self.unk_token] = 0  # Assign index 0 to the <unk> token
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
        self.word_counts = {word: count for word, count in sorted_word_counts}
        total_words = sum(word_counts.values())
        self.word_freqs = np.array([count / total_words for count in word_counts.values()])

        # Assign index to <unk> token
        self.unk_index = 0
        
        
    def build_data(self, sentences, window_size = 2):
        data = []
        for sentence in sentences:
            for i, center_word in enumerate(sentence):
                for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
                    if i != j:
                        data.append((center_word, sentence[j]))
        return data

    def get_neg_sample(self, target_word_idx, num_samples):
        neg_samples = []
        while len(neg_samples) < num_samples:
            sample = np.random.choice(len(self.word_to_idx), size=num_samples, p=self.word_freqs)
            if target_word_idx not in sample:
                neg_samples.extend(sample)
        return torch.LongTensor(neg_samples)
    
    def save_word_vectors(self, file_path):
        word_to_idx = self.word_to_idx
        idx_to_word = self.idx_to_word
        word_vectors = self.model.embeddings.weight.cpu().detach().numpy()

        state_dict = {
            'word_to_idx': word_to_idx,
            'idx_to_word': idx_to_word,
            'word_vectors': word_vectors
        }

        torch.save(state_dict, file_path)
        print(f"Word vectors and vocabulary mappings saved to {file_path}.") 

    def train(self, sentences, window_size=2):
        self.build_vocab(sentences)
        data = self.build_data(sentences, window_size)
        total_batches = len(data) // self.batch_size
        print("Device: ", self.device)
        self.model = SkipGramNegativeSampling(len(self.word_to_idx), self.embedding_dim).to(self.device)
            
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        for epoch in range(self.num_epochs):
            total_loss = 0.0
            with tqdm(total=total_batches, desc=f'Epoch {epoch+1}/{self.num_epochs}', unit='batch') as pbar:
                for i in range(0, len(data), self.batch_size):
                    batch_data = data[i:i+self.batch_size]
                    center_words, target_words = zip(*batch_data)

                    # Convert words to indices
                    center_word_idxs = [self.word_to_idx.get(word, 0) for word in center_words]  
                    target_word_idxs = [self.word_to_idx.get(word, 0) for word in target_words]  

                    center_words_tensor = torch.LongTensor(center_word_idxs).to(self.device)
                    target_words_tensor = torch.LongTensor(target_word_idxs).to(self.device)

                    neg_words = [self.get_neg_sample(target_word_idx, self.num_neg_samples) for target_word_idx in target_word_idxs]

                    # Convert neg_words to tensor
                    neg_words_tensor = torch.stack(neg_words).to(self.device)

                    optimizer.zero_grad()
                    loss = self.model(center_words_tensor, target_words_tensor, neg_words_tensor)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

                    pbar.set_postfix({'loss': total_loss / ((i // self.batch_size) + 1)})
                    pbar.update()
                    
         # Save word vectors and vocabulary mappings
        self.save_word_vectors('skip-gram-word-vectors.pt')
        
        print("Training completed.")

    def get_word_vector(self, word):
        if word in self.word_to_idx:
            idx = torch.tensor([self.word_to_idx[word]]).to(self.device)
            return self.model.embeddings(idx).squeeze().cpu().detach().numpy()
        else:
            print(f"Word '{word}' not found in vocabulary. Using <unk> token.")
            idx = torch.tensor([self.unk_index]).to(self.device)
            return self.model.embeddings(idx).squeeze().cpu().detach().numpy()

    def most_similar(self, word, topn=5):
        if word in self.word_to_idx:
            word_vec = self.get_word_vector(word)
            if word_vec is not None:
                all_word_vecs = self.model.embeddings.weight.cpu().detach().numpy()
                similarities = np.dot(all_word_vecs, word_vec)
                top_similar_words = np.argsort(-similarities)[1:topn+1]
                return [(self.idx_to_word[idx], similarities[idx]) for idx in top_similar_words]
        return None


In [85]:

word2vec_model = SkipGramWord2Vec(embedding_dim=100, num_neg_samples=4, num_epochs=4)

# Load dataset
file_path = "data/train.csv"
corpus = word2vec_model.load_dataset(file_path, num_samples=15000)
sentences = word2vec_model.preprocess_data(corpus)

word2vec_model.train(sentences)

Device:  cpu


Epoch 1/4: 14872batch [10:18, 24.03batch/s, loss=1.7]                           
Epoch 2/4: 14872batch [10:35, 23.39batch/s, loss=1.46]                          
Epoch 3/4: 14872batch [10:21, 23.92batch/s, loss=1.38]                          
Epoch 4/4: 14872batch [10:01, 24.73batch/s, loss=1.31]                          


Word vectors and vocabulary mappings saved to skip-gram-word-vectors.pt.
Training completed.


In [None]:
word_vector = word2vec_model.get_word_vector("I")
print("Word vector for 'word':", word_vector)

similar_words = word2vec_model.most_similar("I")
print("Most similar words to 'word':", similar_words)
print( word2vec_model.model.embeddings)

In [268]:

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        self.recurrent_layer = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc_layer = nn.Linear( hidden_size, output_size)

    def forward(self, input_seq):
        # Forward pass through the RNN
        recurrent_output, _ = self.recurrent_layer(input_seq)
        
        # Extract the last hidden state
        last_hidden_state = recurrent_output[:, -1, :]
        
        # Pass the last hidden state through the fully connected layer
        logits = self.fc_layer(last_hidden_state)
        
        return logits
    
    def predict(self, logits):
        
        # Apply argmax to get the index of the highest logit
        predicted_class_index = torch.argmax(logits).item()
        
        return predicted_class_index


In [317]:
class Classifier:
    def __init__(self, input_size, hidden_size, output_size, num_epochs=10, lr=0.001):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_epochs = num_epochs

        self.model = Model(input_size, hidden_size, output_size).to(self.device)

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def train(self, train_data, model_save_path):
        self.model.train()
        epoch_losses = []
        epoch_accuracies = []

        for epoch in range(self.num_epochs):
            running_loss = 0.0
            train_data_with_progress = tqdm(train_data, desc=f'Epoch {epoch+1}/{self.num_epochs}')
            predictions = []
            labels_list = []
            for inputs, label in train_data_with_progress:
                inputs, label = inputs.to(self.device), label.to(self.device)

                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, label)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                predicted_label = self.model.predict(outputs)
                predictions.append(predicted_label)
                labels_list.append(label.item())  
#             prediction_counts = Counter(predictions)

#             # Print the count of each unique number
#             for label, count in prediction_counts.items():
#                 print(f"Label {label}: {count} occurrences")
            avg_loss = running_loss / len(labels_list)
            accuracy = sum(p == l for p, l in zip(predictions, labels_list)) / len(predictions)

            epoch_losses.append(avg_loss)
            epoch_accuracies.append(accuracy)

            print(f'Epoch {epoch+1}/{self.num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
            
        self.save_model(model_save_path)
        
        return epoch_losses, epoch_accuracies

    def evaluate(self, test_data):
        self.model.eval()  
        predictions = []
        labels_list = []
        with torch.no_grad():  
            for inputs, label in test_data:
                inputs, label = inputs.to(self.device), label.to(self.device)
                outputs = self.model(inputs)
                predicted_label = self.model.predict(outputs)
                predictions.append(predicted_label)
                labels_list.append(label.item())
        
#         prediction_counts = Counter(predictions)

#         # Print the count of each unique number
#         for label, count in prediction_counts.items():
#             print(f"Label {label}: {count} occurrences")
            
        accuracy = sum(p == l for p, l in zip(predictions, labels_list)) / len(predictions)
        print(f'Test Accuracy: {accuracy:.4f}')
        return accuracy
    
    def calculate_metrics(self, data_loader):
        self.model.eval()
        predictions = []
        labels_list = []

        with torch.no_grad():
            for inputs, labels in data_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                outputs = self.model(inputs)
                predicted_label = self.model.predict(outputs)
                predictions.append(predicted_label)
                labels_list.append(labels.item())

        accuracy = accuracy_score(labels_list, predictions)
        precision = precision_score(labels_list, predictions, average='macro')
        recall = recall_score(labels_list, predictions, average='macro')
        f1 = f1_score(labels_list, predictions, average='macro')
        confusion_mat = confusion_matrix(labels_list, predictions)

        return accuracy, precision, recall, f1, confusion_mat

    def evaluate_performance(self, train_loader, test_loader):
        print("Evaluation on Train Set:")
        train_accuracy, train_precision, train_recall, train_f1, train_confusion_mat = self.calculate_metrics(train_loader)
        print(f"Train Accuracy: {train_accuracy:.4f}")
        print(f"Train Precision: {train_precision:.4f}")
        print(f"Train Recall: {train_recall:.4f}")
        print(f"Train F1 Score: {train_f1:.4f}")
        print("Train Confusion Matrix:")
        print(train_confusion_mat)

        print("\nEvaluation on Test Set:")
        test_accuracy, test_precision, test_recall, test_f1, test_confusion_mat = self.calculate_metrics(test_loader)
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Precision: {test_precision:.4f}")
        print(f"Test Recall: {test_recall:.4f}")
        print(f"Test F1 Score: {test_f1:.4f}")
        print("Test Confusion Matrix:")
        print(test_confusion_mat)
        
    def save_model(self, filepath):
        torch.save(self.model.state_dict(), filepath)
        print(f'Model saved to {filepath}')

    def predict_class(self, sentence):
        input_tensor = sentence.to(self.device)
        self.model.eval()  
        with torch.no_grad():  
            output = self.model(input_tensor)
            predicted_label = self.model.predict(output)
        
        return predicted_label + 1


In [318]:
class skipGramTextDataProcessor:
    def __init__(self, word_to_idx=None, idx_to_word=None, word_vectors=None):
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_vectors = word_vectors

    def load_dataset(self, file_path, num_samples=None, reduce_label=True):
        if num_samples:
            data = pd.read_csv(file_path, nrows=num_samples)
        else:
            data = pd.read_csv(file_path)

        if reduce_label and 'Class Index' in data.columns:
            data['Class Index'] -= 1  # Reduce 1 from each label

        return data

    def preprocess_data(self, data, text_column='Description', label_column='Class Index'):
        corpus = data[text_column]
        labels = data[label_column] if label_column in data.columns else None
        return corpus, labels

    def load_word_vectors(self, file_path):
        state_dict = torch.load(file_path)
        self.word_to_idx = state_dict['word_to_idx']
        self.idx_to_word = state_dict['idx_to_word']
        self.word_vectors = state_dict['word_vectors']

    def get_word_embedding(self, word):
        if self.word_to_idx is not None and self.word_vectors is not None:
            if word in self.word_to_idx:
                idx = self.word_to_idx[word]
                return self.word_vectors[idx]
            else:
                # If word not found, return zero vector or handle as per your requirement
                return self.word_vectors[0]
        else:
            raise ValueError("Word vectors and vocabulary mappings are not loaded.")
            
    def get_sentece_embeddings(self,sentence):
        tokens = re.findall(r'\b\w+\b', sentence.lower())  
        sentence_embeddings = []
        for word in tokens:
            word_embedding = self.get_word_embedding(word)
            sentence_embeddings.append(word_embedding)
        sentence_embeddings = torch.tensor(np.array(sentence_embeddings))
        return sentence_embeddings
    
    def preprocess_dataset(self, data, batch_size = 1):
        corpus, labels = self.preprocess_data(data)
        dataset = []
        for sentence in corpus:
            sentence_embeddings = self.get_sentece_embeddings(sentence)
            dataset.append(sentence_embeddings)
        
        labels_tensor = torch.tensor(labels)
        combined_data = list(zip(dataset, labels_tensor))

        train_dataloader = DataLoader(combined_data, batch_size=batch_size, shuffle=False)

        return train_dataloader

    

In [323]:
input_size = 100  
hidden_size = 128  
output_size = 4  
batch_size = 1
data_processor = skipGramTextDataProcessor()
data_processor.load_word_vectors('skip-gram-word-vectors.pt')
train_data = data_processor.load_dataset('data/train.csv', num_samples=None)
test_data = data_processor.load_dataset('data/test.csv')
train_dataset = data_processor.preprocess_dataset(train_data)
test_dataset = data_processor.preprocess_dataset(test_data)
# print(len(train_data))
skip_gram_classifier = Classifier(input_size, hidden_size, output_size, num_epochs=10, lr=0.0005)
skip_gram_losses, skip_gram_accuracies = skip_gram_classifier.train(train_dataset,'skip-gram-classification-model.pt')


Epoch 1/10: 100%|██████████████████████| 120000/120000 [03:30<00:00, 571.18it/s]


Epoch 1/10, Loss: 0.9005, Accuracy: 0.6373


Epoch 2/10: 100%|██████████████████████| 120000/120000 [04:01<00:00, 497.44it/s]


Epoch 2/10, Loss: 0.8825, Accuracy: 0.6500


Epoch 3/10: 100%|██████████████████████| 120000/120000 [04:56<00:00, 405.07it/s]


Epoch 3/10, Loss: 0.9014, Accuracy: 0.6358


Epoch 4/10: 100%|██████████████████████| 120000/120000 [05:09<00:00, 387.92it/s]


Epoch 4/10, Loss: 0.9923, Accuracy: 0.5609


Epoch 5/10: 100%|██████████████████████| 120000/120000 [05:03<00:00, 395.40it/s]


Epoch 5/10, Loss: 0.9746, Accuracy: 0.5882


Epoch 6/10: 100%|██████████████████████| 120000/120000 [04:51<00:00, 411.61it/s]


Epoch 6/10, Loss: 1.1425, Accuracy: 0.4975


Epoch 7/10: 100%|██████████████████████| 120000/120000 [04:32<00:00, 439.58it/s]


Epoch 7/10, Loss: 0.9860, Accuracy: 0.5784


Epoch 8/10: 100%|██████████████████████| 120000/120000 [04:34<00:00, 437.72it/s]


Epoch 8/10, Loss: 0.8524, Accuracy: 0.6611


Epoch 9/10: 100%|██████████████████████| 120000/120000 [04:34<00:00, 437.09it/s]


Epoch 9/10, Loss: 1.0329, Accuracy: 0.5587


Epoch 10/10: 100%|█████████████████████| 120000/120000 [04:34<00:00, 437.68it/s]

Epoch 10/10, Loss: 1.1270, Accuracy: 0.5073
Model saved to skip-gram-classification-model.pt





In [324]:
test_acc = skip_gram_classifier.evaluate(test_dataset)
skip_gram_classifier.evaluate_performance(train_dataset, test_dataset)

Test Accuracy: 0.5600
Evaluation on Train Set:
Train Accuracy: 0.5639
Train Precision: 0.5744
Train Recall: 0.5639
Train F1 Score: 0.5515
Train Confusion Matrix:
[[23635  2757  2479  1129]
 [ 6290 20482  1206  2022]
 [11774  2775 10618  4833]
 [ 5610  6307  5150 12933]]

Evaluation on Test Set:
Test Accuracy: 0.5600
Test Precision: 0.5673
Test Recall: 0.5600
Test F1 Score: 0.5460
Test Confusion Matrix:
[[1499  172  161   68]
 [ 389 1313   72  126]
 [ 752  186  626  336]
 [ 366  377  339  818]]


In [325]:
sentence = "Thisksndkf issjkngkasd predicteskfmkad class for this sentencealjsndjfkna jansdfkn ksjdfk."

embeddings = data_processor.get_sentece_embeddings(sentence).unsqueeze(0)
# print(embeddings)
predicted_class = skip_gram_classifier.predict_class(embeddings)
print(f"Predicted class: {predicted_class}")

print("Losses for each epoch:")
for epoch, loss in enumerate(skip_gram_losses):
    print(f"Epoch {epoch+1}: {loss:.4f}")

print("\nAccuracies for each epoch:")
for epoch, accuracy in enumerate(skip_gram_accuracies):
    print(f"Epoch {epoch+1}: {accuracy:.4f}")

Predicted class: 2
Losses for each epoch:
Epoch 1: 0.9005
Epoch 2: 0.8825
Epoch 3: 0.9014
Epoch 4: 0.9923
Epoch 5: 0.9746
Epoch 6: 1.1425
Epoch 7: 0.9860
Epoch 8: 0.8524
Epoch 9: 1.0329
Epoch 10: 1.1270

Accuracies for each epoch:
Epoch 1: 0.6373
Epoch 2: 0.6500
Epoch 3: 0.6358
Epoch 4: 0.5609
Epoch 5: 0.5882
Epoch 6: 0.4975
Epoch 7: 0.5784
Epoch 8: 0.6611
Epoch 9: 0.5587
Epoch 10: 0.5073
