<a href="https://colab.research.google.com/github/sreesravyat/Covid-data-johnhopkins/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_path = '/content/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_path)


In [5]:
from sklearn.datasets import fetch_20newsgroups
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split

# Load 20 Newsgroups dataset
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.sport.hockey']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
texts, labels = newsgroups.data, newsgroups.target

# Tokenize the texts
tokens = [simple_preprocess(text) for text in texts]

# Convert words to GloVe indices
def doc2ind(doc, word_to_idx):
    return [word_to_idx[word] if word in word_to_idx else 0 for word in doc]

word_to_idx = {word: idx for idx, word in enumerate(glove_embeddings.keys(), 1)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert tokens to indices
ng_vector_idx = [doc2ind(doc, word_to_idx) for doc in tokens]

# Padding sequences to the same length
from torch.nn.utils.rnn import pad_sequence
ng_vector_idx_padded = pad_sequence([torch.tensor(seq) for seq in ng_vector_idx], batch_first=True)
labels = torch.tensor(labels)


In [6]:
import numpy as np
import torch
import torch.nn as nn

# Assuming glove_embeddings is a dictionary with word keys and vector values
# and word_to_idx is a dictionary mapping words to their corresponding indices

embedding_dim = 100  # Size of GloVe vectors, adjust if needed
embedding_matrix = np.zeros((len(word_to_idx) + 1, embedding_dim))

for word, idx in word_to_idx.items():
    vector = glove_embeddings.get(word)
    if vector is not None and len(vector) == embedding_dim:
        embedding_matrix[idx] = vector
    else:
        print(f"Word '{word}' not found in GloVe or has incorrect dimensions. Using random vector.")
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

# Convert to torch tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

# Create an embedding layer
glove_emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)


In [7]:
class TextClassificationModel(nn.Module):
    def __init__(self, embedding_layer, num_classes):
        super(TextClassificationModel, self).__init__()
        self.embedding = embedding_layer
        self.fc1 = nn.Linear(100, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)  # Average pooling over sequence length
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return self.softmax(x)

num_classes = len(categories)
model = TextClassificationModel(glove_emb, num_classes)


In [8]:
train_data, test_data, train_labels, test_labels = train_test_split(ng_vector_idx_padded, labels, test_size=0.2, random_state=42)

train_dataset = torch.utils.data.TensorDataset(train_data, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_data, test_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}')

# Evaluate the model
model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

accuracy = correct / len(test_loader.dataset)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/10, Loss: 1.6141899824142456
Epoch 2/10, Loss: 1.6055608987808228
Epoch 3/10, Loss: 1.5987313985824585
Epoch 4/10, Loss: 1.6064560413360596
Epoch 5/10, Loss: 1.5835797786712646
Epoch 6/10, Loss: 1.5882816314697266
Epoch 7/10, Loss: 1.571858286857605
Epoch 8/10, Loss: 1.5631811618804932
Epoch 9/10, Loss: 1.5398235321044922
Epoch 10/10, Loss: 1.5482674837112427
Test Accuracy: 0.3292


In [9]:
# Unfreeze the embedding layer
model.embedding.weight.requires_grad = True

# Fine-tune the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(n_epochs):
    model.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f'Fine-Tuning Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}')

# Re-evaluate the model
model.eval()
correct = 0
with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

fine_tuned_accuracy = correct / len(test_loader.dataset)
print(f'Fine-Tuned Test Accuracy: {fine_tuned_accuracy:.4f}')


Fine-Tuning Epoch 1/10, Loss: 1.5280665159225464
Fine-Tuning Epoch 2/10, Loss: 1.5403600931167603
Fine-Tuning Epoch 3/10, Loss: 1.525551676750183
Fine-Tuning Epoch 4/10, Loss: 1.5483684539794922
Fine-Tuning Epoch 5/10, Loss: 1.5164172649383545
Fine-Tuning Epoch 6/10, Loss: 1.526739239692688
Fine-Tuning Epoch 7/10, Loss: 1.4913585186004639
Fine-Tuning Epoch 8/10, Loss: 1.4941202402114868
Fine-Tuning Epoch 9/10, Loss: 1.5412604808807373
Fine-Tuning Epoch 10/10, Loss: 1.4814260005950928
Fine-Tuned Test Accuracy: 0.4522
