## Word Embeddings

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Seed the NumPy and PyTorch global random number generators.
np.random.seed(42)
torch.manual_seed(42)

# Training texts and their labels come from two separate CSV files.
x_train = pd.read_csv('x_train.csv')
y_train = pd.read_csv('y_train.csv')['is_positive_sentiment']

# Lower-case the raw text column in place.
x_train['text'] = x_train['text'].str.lower()

# Unigram + bigram TF-IDF features with English stop words removed and terms
# that occur in more than 95% of documents dropped.
# NOTE(review): the vocabulary and IDF weights are fitted on every row of
# x_train, so any later hold-out split of these features has already been
# seen by the vectorizer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
tfidf_sparse = vectorizer.fit_transform(x_train['text'])
X_train_tfidf = tfidf_sparse.toarray()

# float32 tensors for the features and the labels.
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

class NeuralNetworkPA3(nn.Module):
    """Feed-forward binary classifier with two hidden layers.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout
    -> Linear(1) -> Sigmoid.

    Args:
        input_dim: Number of input features per example.
        first_layer_count: Width of the first hidden layer.
        second_layer_count: Width of the second hidden layer.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim, *, first_layer_count=50,
                 second_layer_count=10, dropout_rate=0.2):
        super().__init__()

        # Layers are created in the same order as before so that parameter
        # initialisation consumes the random stream identically.
        self.layer1 = nn.Linear(input_dim, first_layer_count)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(first_layer_count, second_layer_count)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(second_layer_count, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for ``x``.

        The last dimension of the result has size 1 (one value per example).
        """
        x = self.dropout1(self.relu1(self.layer1(x)))
        x = self.dropout2(self.relu2(self.layer2(x)))
        x = self.sigmoid(self.output(x))
        return x

# 5-fold cross-validation: a fresh network is trained for each fold and its
# validation accuracy is reported after every epoch.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
num_epochs = 10
criterion = nn.BCELoss()  # holds no per-fold state, so one instance is reused
fold = 0
accuracy_scores = []        # validation accuracy after every epoch of every fold
final_fold_accuracies = []  # validation accuracy after the last epoch of each fold

for fold, (train_index, val_index) in enumerate(kf.split(X_train_tensor), start=1):
    # NOTE: this rebinds the module-level names X_train / y_train to the
    # tensors of the current fold.
    X_train, X_val = X_train_tensor[train_index], X_train_tensor[val_index]
    y_train, y_val = y_train_tensor[train_index], y_train_tensor[val_index]

    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=10)

    # New model and optimizer per fold so no weights carry over between folds.
    model = NeuralNetworkPA3(X_train_tensor.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # squeeze(1) removes only the feature dimension. A bare squeeze()
            # would also drop the batch dimension of a size-1 batch and make
            # the output shape disagree with the labels.
            loss = criterion(outputs.squeeze(1), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in val_loader:
                outputs = model(inputs)
                predicted = (outputs.squeeze(1) >= 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total
            accuracy_scores.append(accuracy)
            print(f'Fold {fold}, Epoch {epoch + 1}, Val Accuracy: {accuracy:.2f}%')

    # Keep one score per fold: the accuracy reached after the final epoch.
    final_fold_accuracies.append(accuracy)

# This figure averages over every epoch of every fold.
print(f'Average Val Accuracy: {np.mean(accuracy_scores):.2f}%')
# This figure averages one score per fold (the final-epoch accuracy).
print(f'Average Final-Epoch Val Accuracy: {np.mean(final_fold_accuracies):.2f}%')

# Predict labels for the test set and write them to y_prediction.txt.
# NOTE(review): `model` is whatever the name is bound to at this point; in the
# code above that is the network trained during the last cross-validation
# fold, which was fitted on only the training part of that fold.
model.eval()

x_test = pd.read_csv('x_test.csv')
x_test['text'] = x_test['text'].str.lower()
# Reuse the already-fitted vectorizer so test features share the training vocabulary.
X_test_tfidf = vectorizer.transform(x_test['text']).toarray()
X_test_tensor = torch.tensor(X_test_tfidf).float()

with torch.no_grad():
    outputs = model(X_test_tensor)
    # squeeze(1) keeps the result one-dimensional even for a single test row;
    # a bare squeeze() would give a 0-d array, which np.savetxt rejects.
    predicted = (outputs.squeeze(1) >= 0.5).float()

np.savetxt('y_prediction.txt', predicted.numpy(), fmt='%d')


Fold 1, Epoch 1, Val Accuracy: 82.08%
Fold 1, Epoch 2, Val Accuracy: 81.46%
Fold 1, Epoch 3, Val Accuracy: 81.04%
Fold 1, Epoch 4, Val Accuracy: 81.04%
Fold 1, Epoch 5, Val Accuracy: 79.38%
Fold 1, Epoch 6, Val Accuracy: 76.88%
Fold 1, Epoch 7, Val Accuracy: 81.46%
Fold 1, Epoch 8, Val Accuracy: 80.21%
Fold 1, Epoch 9, Val Accuracy: 80.00%
Fold 1, Epoch 10, Val Accuracy: 80.21%
Fold 2, Epoch 1, Val Accuracy: 81.46%
Fold 2, Epoch 2, Val Accuracy: 81.04%
Fold 2, Epoch 3, Val Accuracy: 82.50%
Fold 2, Epoch 4, Val Accuracy: 81.25%
Fold 2, Epoch 5, Val Accuracy: 81.04%
Fold 2, Epoch 6, Val Accuracy: 78.96%
Fold 2, Epoch 7, Val Accuracy: 80.62%
Fold 2, Epoch 8, Val Accuracy: 79.17%
Fold 2, Epoch 9, Val Accuracy: 80.21%
Fold 2, Epoch 10, Val Accuracy: 79.79%
Fold 3, Epoch 1, Val Accuracy: 78.75%
Fold 3, Epoch 2, Val Accuracy: 76.25%
Fold 3, Epoch 3, Val Accuracy: 81.67%
Fold 3, Epoch 4, Val Accuracy: 81.67%
Fold 3, Epoch 5, Val Accuracy: 81.04%
Fold 3, Epoch 6, Val Accuracy: 80.83%
Fold 3, Ep