## LOADING IMDB DATASET

In [1]:
import os

train_folder = "train"
train_files = os.listdir(train_folder)
print("Files in train folder:", train_files)

test_folder = "test"
test_files = os.listdir(test_folder)
print("Files in test folder:", test_files)

Files in train folder: ['labeledBow.feat', 'neg', 'pos', 'unsup', 'unsupBow.feat', 'urls_neg.txt', 'urls_pos.txt', 'urls_unsup.txt']
Files in test folder: ['labeledBow.feat', 'neg', 'pos', 'urls_neg.txt', 'urls_pos.txt']


## DATA PRE PROCESSING 

In [2]:
train_pos = "train/pos"
train_neg = "train/neg"

def display_sample_text(folder_path, num_samples=1):
    print(f"Displaying sample text files from {folder_path}:")
    file_names = os.listdir(folder_path)[:num_samples]
    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r") as file:
            text = file.read()
            print(f"File: {file_name}")
            print("Content:")
            print(text)
            print("\n")

display_sample_text(train_pos)
display_sample_text(train_neg)

Displaying sample text files from train/pos:
File: 0_9.txt
Content:
Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


Displaying sample text files from train/neg:
File: 0_3.txt
Content:
Story of a man who has unnatural feelings for a pig. St

In [3]:
import os

train_folder = "train"
train_data = []


pos_folder = os.path.join(train_folder, 'pos')
for filename in os.listdir(pos_folder):
    with open(os.path.join(pos_folder, filename), 'r', encoding='utf-8') as file:
        train_data.append((file.read(), 'pos'))


neg_folder = os.path.join(train_folder, 'neg')
for filename in os.listdir(neg_folder):
    with open(os.path.join(neg_folder, filename), 'r', encoding='utf-8') as file:
        train_data.append((file.read(), 'neg'))

pos_folder = os.path.join(train_folder, 'pos')
pos_files_count = len(os.listdir(pos_folder))
print("Positive Count",pos_files_count)
neg_folder = os.path.join(train_folder, 'neg')
neg_files_count = len(os.listdir(neg_folder))
print("Negative Count",neg_files_count)

print("Number of training examples:", len(train_data))


Positive Count 12500
Negative Count 12500
Number of training examples: 25000


## SPACY TOKENIZATION

In [4]:
import os
import spacy
from collections import Counter
from tqdm import tqdm


nlp = spacy.load("en_core_web_sm")


def tokenize_and_filter(text):
    tokens = [token.text for token in nlp(text)]
    token_counts = Counter(tokens)
    filtered_tokens = [token if token_counts[token] >= 5 else "UNK" for token in tokens]
    return filtered_tokens


def process_folder(folder_path, output_folder):
    filenames = os.listdir(folder_path)
    for filename in tqdm(filenames, desc=f"Processing {os.path.basename(folder_path)}"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = tokenize_and_filter(text)
        
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(" ".join(tokens))


train_folder = "train"
pos_train_folder = os.path.join(train_folder, 'pos')
neg_train_folder = os.path.join(train_folder, 'neg')
output_train_folder = "tokenized_train_data"


os.makedirs(output_train_folder, exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'pos'), exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'neg'), exist_ok=True)


process_folder(pos_train_folder, os.path.join(output_train_folder, 'pos'))
process_folder(neg_train_folder, os.path.join(output_train_folder, 'neg'))


Processing pos: 100%|████████████████████████████████████████████████████████████| 12500/12500 [10:50<00:00, 19.23it/s]
Processing neg: 100%|████████████████████████████████████████████████████████████| 12500/12500 [09:38<00:00, 21.60it/s]


## One hot encoding of training data

In [None]:
import os
import numpy as np
import h5py
from tqdm import tqdm


def load_tokenized_data(folder_path, num_files):
    tokenized_data = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            tokens = file.read().strip().split()
            tokenized_data.append(tokens)
    return tokenized_data


def create_vocab(tokenized_data):
    vocab = {}
    vocab['PAD'] = 0  
    for tokens in tokenized_data:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab


def one_hot_encoding(tokens, vocab, max_length):
    one_hot_encoded = []
    for token in tokens:
        one_hot = [0] * len(vocab)
        if token in vocab:
            one_hot[vocab[token]] = 1
        else:
            one_hot[vocab['PAD']] = 1  
        one_hot_encoded.append(one_hot)
    if len(one_hot_encoded) < max_length:
        one_hot_encoded += [[0] * len(vocab)] * (max_length - len(one_hot_encoded))
    else:
        one_hot_encoded = one_hot_encoded[:max_length]
    return one_hot_encoded

def process_folder_in_batches(folder_path, output_folder, vocab, max_length, batch_size):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    filenames = os.listdir(folder_path)
    num_batches = len(filenames) // batch_size
    for batch_idx in tqdm(range(num_batches), desc=f"Processing {os.path.basename(folder_path)}"):
        batch_filenames = filenames[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        batch_data = []
        for filename in batch_filenames:
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                tokens = file.read().strip().split()
                encoded_sentence = one_hot_encoding(tokens, vocab, max_length)
                batch_data.append(encoded_sentence)
        if batch_data:
            with h5py.File(os.path.join(output_folder, f'batch_{batch_idx}.h5'), 'w') as hf:
                hf.create_dataset('data', data=batch_data)
        else:
            print(f"No data for batch {batch_idx}")

tokenized_train_folder = "tokenized_train_data"
pos_tokenized_folder = os.path.join(tokenized_train_folder, 'pos')
neg_tokenized_folder = os.path.join(tokenized_train_folder, 'neg')
output_train_folder = "encoded_train_data"


num_files = 5000  
pos_tokenized_data = load_tokenized_data(pos_tokenized_folder, num_files)
neg_tokenized_data = load_tokenized_data(neg_tokenized_folder, num_files)


all_tokenized_data = pos_tokenized_data + neg_tokenized_data
vocab = create_vocab(all_tokenized_data)


if 'PAD' not in vocab:
    vocab['PAD'] = len(vocab)

avg_length = sum(len(tokens) for tokens in all_tokenized_data) / len(all_tokenized_data)
max_length = int(avg_length)

batch_size = 2  
process_folder_in_batches(pos_tokenized_folder, os.path.join(output_train_folder, 'pos'), vocab, max_length, batch_size)
process_folder_in_batches(neg_tokenized_folder, os.path.join(output_train_folder, 'neg'), vocab, max_length, batch_size)


Processing pos: 100%|██████████████████████████████████████████████████████████████| 6250/6250 [46:26<00:00,  2.24it/s]
Processing neg:  33%|███████████████████▊                                        | 2059/6250 [32:53<1:09:01,  1.01it/s]

## Loading encoded data and converting to tensors

In [None]:
import os
import h5py
import numpy as np

def load_encoded_data_from_h5(folder_path, num_files, max_length):
    encoded_data_list = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            # Ensure data is of max_length
            if data.shape[1] < max_length:
                padding = np.zeros((data.shape[0], max_length - data.shape[1], data.shape[2]))
                data = np.concatenate([data, padding], axis=1)
            encoded_data_list.append(data)
    return np.array(encoded_data_list)

encoded_train_folder_pos = "encoded_train_data/pos"
encoded_train_folder_neg = "encoded_train_data/neg"


max_length = 10
num_files_to_load = 100
for folder_path in [encoded_train_folder_pos, encoded_train_folder_neg]:
    filenames = os.listdir(folder_path)[:num_files_to_load]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            max_length = max(max_length, data.shape[1])

print("Maximum length:", max_length)


batch_size = 2  
num_batches = num_files_to_load // batch_size

encoded_data_pos_list = []
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_files_to_load)
    encoded_data_pos_batch = load_encoded_data_from_h5(encoded_train_folder_pos, end_idx - start_idx, max_length)
    encoded_data_pos_list.append(encoded_data_pos_batch)

encoded_data_neg_list = []
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_files_to_load)
    encoded_data_neg_batch = load_encoded_data_from_h5(encoded_train_folder_neg, end_idx - start_idx, max_length)
    encoded_data_neg_list.append(encoded_data_neg_batch)

encoded_data_pos = np.concatenate(encoded_data_pos_list, axis=0)
encoded_data_neg = np.concatenate(encoded_data_neg_list, axis=0)


print("Shape of encoded data (positive):", encoded_data_pos.shape)
print("Shape of encoded data (negative):", encoded_data_neg.shape)


## DEMO NN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoHiddenLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(TwoHiddenLayerNN, self).__init__()
        self.hidden_layer1 = nn.Linear(input_size, hidden_size1)
        self.hidden_layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.output_layer = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = F.relu(self.hidden_layer1(x))
        x = F.relu(self.hidden_layer2(x))
        x = self.output_layer(x)
        return x


input_size = 2283
hidden_size1 = 256
hidden_size2 = 128
output_size = 2  

model = TwoHiddenLayerNN(input_size, hidden_size1, hidden_size2, output_size)


inputs = torch.randn(100, input_size).float()  

outputs = model(inputs)

print("Shape of output tensor:", outputs.shape)


## Feed Forward NN with two hidden layer of size 256 and 128

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import os
import h5py

class TwoHiddenLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(TwoHiddenLayerNN, self).__init__()
        self.hidden_layer1 = nn.Linear(input_size, hidden_size1)
        self.hidden_layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.output_layer = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = torch.relu(self.hidden_layer1(x))
        x = torch.relu(self.hidden_layer2(x))
        x = self.output_layer(x)
        return x

input_size = 667389  
hidden_size1 = 256
hidden_size2 = 128
output_size = 2 
learning_rate = 0.001
batch_size = 32
num_epochs = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TwoHiddenLayerNN(input_size, hidden_size1, hidden_size2, output_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def load_encoded_data_from_h5(folder_path, num_files, max_length):
    encoded_data_list = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            if data.shape[1] < max_length:
                padding = np.zeros((data.shape[0], max_length - data.shape[1], data.shape[2]))
                data = np.concatenate([data, padding], axis=1)
            encoded_data_list.append(data)
    return np.array(encoded_data_list)

encoded_train_folder_pos = "encoded_train_data/pos"
encoded_train_folder_neg = "encoded_train_data/neg"
num_files_to_load = 50 
max_length = 269  

encoded_data_pos = load_encoded_data_from_h5(encoded_train_folder_pos, num_files_to_load, max_length)
encoded_data_neg = load_encoded_data_from_h5(encoded_train_folder_neg, num_files_to_load, max_length)

assert encoded_data_pos.shape[0] == num_files_to_load
assert encoded_data_neg.shape[0] == num_files_to_load

inputs_pos = encoded_data_pos.reshape(-1, 269 * 2481)
inputs_neg = encoded_data_neg.reshape(-1, 269 * 2481)

labels_pos = np.ones((inputs_pos.shape[0],), dtype=np.int64)
labels_neg = np.zeros((inputs_neg.shape[0],), dtype=np.int64)

inputs = np.concatenate((inputs_pos, inputs_neg), axis=0)
labels = np.concatenate((labels_pos, labels_neg), axis=0)

inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
labels = torch.tensor(labels, dtype=torch.long).to(device)

train_dataset = TensorDataset(inputs, labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training finished!")


##  Validation on 10% of training data¶

In [None]:
validation_ratio = 0.1
num_validation = int(len(train_dataset) * validation_ratio)
num_training = len(train_dataset) - num_validation
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [num_training, num_validation])

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()  
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}, Validation Accuracy: {(100 * correct / total):.2f}%")

    model.train() 

print("Training finished!")


## Evaluation metrices of both the labels

In [None]:
from sklearn.metrics import precision_recall_fscore_support

validation_ratio = 0.1
num_validation = int(len(train_dataset) * validation_ratio)
num_training = len(train_dataset) - num_validation
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [num_training, num_validation])

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train_losses = []
val_losses = []
accuracies = []
precisions = []
recalls = []
f1_scores = []

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval() 
    val_loss = 0.0
    correct = 0
    total = 0
    all_predicted_labels = []
    all_true_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_predicted_labels.extend(predicted.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())

    accuracy = (correct / total) * 100
    precision, recall, f1, _ = precision_recall_fscore_support(all_true_labels, all_predicted_labels, average=None)

    train_losses.append(running_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}, Validation Accuracy: {accuracy:.2f}%")
    for label in range(output_size):
        print(f"Label {label}: Precision={precision[label]:.4f}, Recall={recall[label]:.4f}, F1-Score={f1[label]:.4f}")
    model.train() 

print("Training finished!")


In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Losses')
plt.legend()
plt.show()


In [None]:
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), accuracies, label='Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Validation Accuracy')
plt.legend()
plt.show()


In [None]:

labels = [f'Label {i}' for i in range(output_size)]
plt.figure(figsize=(15, 5))
plt.plot(labels, precisions[-1], label='Precision')
plt.plot(labels, recalls[-1], label='Recall')
plt.plot(labels, f1_scores[-1], label='F1-Score')
plt.xlabel('Labels')
plt.ylabel('Metrics')
plt.title('Precision, Recall, and F1-Score for the Last Epoch')
plt.legend()
plt.show()


## SPACY TOKENIZATION OF TEST DATA

In [None]:
import os
import spacy
from collections import Counter
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

def tokenize_and_filter(text):
    tokens = [token.text for token in nlp(text)]
    token_counts = Counter(tokens)
    filtered_tokens = [token if token_counts[token] >= 5 else "UNK" for token in tokens]
    return filtered_tokens

def process_folder(folder_path, output_folder, file_limit=10):
    filenames = os.listdir(folder_path)[:file_limit]
    for filename in tqdm(filenames, desc=f"Processing {os.path.basename(folder_path)}"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = tokenize_and_filter(text)
        
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(" ".join(tokens))

train_folder = "C:/Users/Deepjyoti Bodo/Downloads/Datasets/aclImdb/test"
pos_train_folder = os.path.join(train_folder, 'pos')
neg_train_folder = os.path.join(train_folder, 'neg')
output_train_folder = "tokenized_test_data"

os.makedirs(output_train_folder, exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'pos'), exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'neg'), exist_ok=True)

file_limit = 10
process_folder(pos_train_folder, os.path.join(output_train_folder, 'pos'), file_limit)
process_folder(neg_train_folder, os.path.join(output_train_folder, 'neg'), file_limit)


In [None]:
import os
import numpy as np
import h5py
from tqdm import tqdm

def load_tokenized_data(folder_path, num_files):
    tokenized_data = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            tokens = file.read().strip().split()
            tokenized_data.append(tokens)
    return tokenized_data

def create_vocab(tokenized_data):
    vocab = {}
    vocab['PAD'] = 0  
    for tokens in tokenized_data:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def one_hot_encoding(tokens, vocab, max_length):
    one_hot_encoded = []
    for token in tokens:
        one_hot = [0] * len(vocab)
        if token in vocab:
            one_hot[vocab[token]] = 1
        else:
            one_hot[vocab['PAD']] = 1  
        one_hot_encoded.append(one_hot)
    if len(one_hot_encoded) < max_length:
        one_hot_encoded += [[0] * len(vocab)] * (max_length - len(one_hot_encoded))
    else:
        one_hot_encoded = one_hot_encoded[:max_length]
    return one_hot_encoded

def process_folder_in_batches(folder_path, output_folder, vocab, max_length, batch_size):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    filenames = os.listdir(folder_path)
    num_batches = len(filenames) // batch_size
    for batch_idx in tqdm(range(num_batches), desc=f"Processing {os.path.basename(folder_path)}"):
        batch_filenames = filenames[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        batch_data = []
        for filename in batch_filenames:
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                tokens = file.read().strip().split()
                encoded_sentence = one_hot_encoding(tokens, vocab, max_length)
                batch_data.append(encoded_sentence)
        if batch_data:
            with h5py.File(os.path.join(output_folder, f'batch_{batch_idx}.h5'), 'w') as hf:
                hf.create_dataset('data', data=batch_data)
        else:
            print(f"No data for batch {batch_idx}")

tokenized_test_folder = "tokenized_test_data"
pos_tokenized_test_folder = os.path.join(tokenized_test_folder, 'pos')
neg_tokenized_test_folder = os.path.join(tokenized_test_folder, 'neg')
output_test_folder = "encoded_test_data"

num_files_test = 20  
pos_tokenized_test_data = load_tokenized_data(pos_tokenized_test_folder, num_files_test)
neg_tokenized_test_data = load_tokenized_data(neg_tokenized_test_folder, num_files_test)

all_tokenized_test_data = pos_tokenized_test_data + neg_tokenized_test_data
vocab = create_vocab(all_tokenized_test_data)

if 'PAD' not in vocab:
    vocab['PAD'] = len(vocab)

avg_length_test = sum(len(tokens) for tokens in all_tokenized_test_data) / len(all_tokenized_test_data)
max_length_test = int(avg_length_test)

batch_size_test = 1  
process_folder_in_batches(pos_tokenized_test_folder, os.path.join(output_test_folder, 'pos'), vocab, max_length_test, batch_size_test)
process_folder_in_batches(neg_tokenized_test_folder, os.path.join(output_test_folder, 'neg'), vocab, max_length_test, batch_size_test)


In [None]:
import os
import spacy
from collections import Counter
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

def tokenize_and_filter(text):
    tokens = [token.text for token in nlp(text)]
    token_counts = Counter(tokens)
    filtered_tokens = [token if token_counts[token] >= 5 else "UNK" for token in tokens]
    return filtered_tokens

def process_folder(folder_path, output_folder):
    filenames = os.listdir(folder_path)
    for filename in tqdm(filenames, desc=f"Processing {os.path.basename(folder_path)}"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = tokenize_and_filter(text)
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(" ".join(tokens))

train_folder = "test"
pos_train_folder = os.path.join(train_folder, 'pos')
neg_train_folder = os.path.join(train_folder, 'neg')
output_train_folder = "tokenized_test_data"

os.makedirs(output_train_folder, exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'pos'), exist_ok=True)
os.makedirs(os.path.join(output_train_folder, 'neg'), exist_ok=True)

process_folder(pos_train_folder, os.path.join(output_train_folder, 'pos'))
process_folder(neg_train_folder, os.path.join(output_train_folder, 'neg'))


In [None]:
import os
import h5py
import numpy as np

def load_encoded_data_from_h5(folder_path, num_files, max_length):
    encoded_data_list = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            if data.shape[1] < max_length:
                padding = np.zeros((data.shape[0], max_length - data.shape[1], data.shape[2]))
                data = np.concatenate([data, padding], axis=1)
            encoded_data_list.append(data)
    return np.array(encoded_data_list)


encoded_train_folder_pos = "encoded_test_data/pos"
encoded_train_folder_neg = "encoded_test_data/neg"

max_length = 0
num_files_to_load = 100
for folder_path in [encoded_train_folder_pos, encoded_train_folder_neg]:
    filenames = os.listdir(folder_path)[:num_files_to_load]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            max_length = max(max_length, data.shape[1])

print("Maximum length:", max_length)

batch_size = 2  
num_batches = num_files_to_load // batch_size

encoded_data_pos_list = []
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_files_to_load)
    encoded_data_pos_batch = load_encoded_data_from_h5(encoded_train_folder_pos, end_idx - start_idx, max_length)
    encoded_data_pos_list.append(encoded_data_pos_batch)

encoded_data_neg_list = []
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_files_to_load)
    encoded_data_neg_batch = load_encoded_data_from_h5(encoded_train_folder_neg, end_idx - start_idx, max_length)
    encoded_data_neg_list.append(encoded_data_neg_batch)

encoded_data_pos = np.concatenate(encoded_data_pos_list, axis=0)
encoded_data_neg = np.concatenate(encoded_data_neg_list, axis=0)

print("Shape of encoded data (positive):", encoded_data_pos.shape)
print("Shape of encoded data (negative):", encoded_data_neg.shape)


## Loading test data

In [None]:
import torch
import numpy as np
import os
import h5py

def load_encoded_test_data_from_h5(folder_path):
    encoded_data_list = []
    for filename in os.listdir(folder_path):
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            encoded_data_list.append(data)
    return encoded_data_list

encoded_test_folder_pos = "encoded_test_data/pos"
encoded_test_folder_neg = "encoded_test_data/neg"

encoded_test_data_pos = load_encoded_test_data_from_h5(encoded_test_folder_pos)
encoded_test_data_neg = load_encoded_test_data_from_h5(encoded_test_folder_neg)

inputs_test_pos = torch.tensor(encoded_test_data_pos, dtype=torch.float32)
inputs_test_neg = torch.tensor(encoded_test_data_neg, dtype=torch.float32)

total_elements_pos = inputs_test_pos.numel()
total_elements_neg = inputs_test_neg.numel()

print("Total elements in inputs_test_pos:", total_elements_pos)
print("Total elements in inputs_test_neg:", total_elements_neg)


In [None]:
encoded_train_folder_pos = "encoded_test_data/pos"
encoded_train_folder_neg = "encoded_test_data/neg"
num_files_to_load = 50 
max_length = 263  

encoded_data_pos = load_encoded_data_from_h5(encoded_train_folder_pos, num_files_to_load, max_length)
encoded_data_neg = load_encoded_data_from_h5(encoded_train_folder_neg, num_files_to_load, max_length)

print("Shape of encoded_data_pos before reshaping:", encoded_data_pos.shape)
print("Shape of encoded_data_neg before reshaping:", encoded_data_neg.shape)


expected_size = num_files_to_load * 2 * max_length * input_size
print("Expected size after reshaping:", expected_size)

inputs_pos = encoded_data_pos.reshape(-1, 2 * 263 * 2283)
inputs_neg = encoded_data_neg.reshape(-1, 2 * 263 * 2283)

print("Shape of inputs_pos after reshaping:", inputs_pos.shape)
print("Shape of inputs_neg after reshaping:", inputs_neg.shape)



## Testing the model on the test data

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import os
import h5py

class TwoHiddenLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(TwoHiddenLayerNN, self).__init__()
        self.hidden_layer1 = nn.Linear(input_size, hidden_size1)
        self.hidden_layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.output_layer = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = torch.relu(self.hidden_layer1(x))
        x = torch.relu(self.hidden_layer2(x))
        x = self.output_layer(x)
        return x
input_size = 600429 
hidden_size1 = 256
hidden_size2 = 128
output_size = 2 
learning_rate = 0.001
batch_size = 32
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TwoHiddenLayerNN(input_size, hidden_size1, hidden_size2, output_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def load_encoded_data_from_h5(folder_path, num_files, max_length):
    encoded_data_list = []
    filenames = os.listdir(folder_path)[:num_files]
    for filename in filenames:
        with h5py.File(os.path.join(folder_path, filename), 'r') as hf:
            data = hf['data'][:]
            if data.shape[1] < max_length:
                padding = np.zeros((data.shape[0], max_length - data.shape[1], data.shape[2]))
                data = np.concatenate([data, padding], axis=1)
            encoded_data_list.append(data)
    return np.array(encoded_data_list)

encoded_train_folder_pos = "encoded_test_data/pos"
encoded_train_folder_neg = "encoded_test_data/neg"
num_files_to_load = 50 
max_length = 263 

encoded_data_pos = load_encoded_data_from_h5(encoded_train_folder_pos, num_files_to_load, max_length)
encoded_data_neg = load_encoded_data_from_h5(encoded_train_folder_neg, num_files_to_load, max_length)

assert encoded_data_pos.shape[0] == num_files_to_load
assert encoded_data_neg.shape[0] == num_files_to_load

inputs_pos = encoded_data_pos.reshape(-1, 263 * 2283)
inputs_neg = encoded_data_neg.reshape(-1, 263 * 2283)  # Corrected reshaping

labels_pos = np.ones((inputs_pos.shape[0],), dtype=np.int64)
labels_neg = np.zeros((inputs_neg.shape[0],), dtype=np.int64)

inputs = np.concatenate((inputs_pos, inputs_neg), axis=0)
labels = np.concatenate((labels_pos, labels_neg), axis=0)

inputs = torch.tensor(inputs, dtype=torch.float32).to(device)
labels = torch.tensor(labels, dtype=torch.long).to(device)

train_dataset = TensorDataset(inputs, labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad() 
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training finished!")


## EXtra


In [None]:
import torch
import torch.nn as nn

class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.bilstm1 = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size // 2, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size * 2, 1024)
        self.dropout1 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(512, 256)
        self.dropout3 = nn.Dropout(0.25)
        self.fc4 = nn.Linear(256, 128)
        self.dropout4 = nn.Dropout(0.25)
        self.fc5 = nn.Linear(128, 64)
        self.dropout5 = nn.Dropout(0.25)
        self.fc6 = nn.Linear(64, 4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm1_out, _ = self.bilstm1(embedded)
        lstm2_out, _ = self.bilstm2(lstm1_out)
        avg_pooled = torch.mean(lstm2_out, dim=1)
        fc1_out = self.dropout1(torch.relu(self.fc1(avg_pooled)))
        fc2_out = self.dropout2(torch.relu(self.fc2(fc1_out)))
        fc3_out = self.dropout3(torch.relu(self.fc3(fc2_out)))
        fc4_out = self.dropout4(torch.relu(self.fc4(fc3_out)))
        fc5_out = self.dropout5(torch.relu(self.fc5(fc4_out)))
        output = self.fc6(fc5_out)
        return self.softmax(output)


vocab_size =12000
embed_size = 100
hidden_size = 256 
model = BiLSTMModel(vocab_size, embed_size, hidden_size)
print(model)
