## TF-IDF

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Pin the NumPy and PyTorch global RNGs so repeated runs of this cell agree.
np.random.seed(42)
torch.manual_seed(42)

# Review texts (lower-cased up front) and their binary sentiment labels.
x_train = pd.read_csv('x_train.csv').assign(
    text=lambda frame: frame['text'].str.lower()
)
y_train = pd.read_csv('y_train.csv')['is_positive_sentiment']

# Dense TF-IDF matrix; terms appearing in more than 90% of documents are dropped.
# NOTE(review): the vectorizer is fitted on every row before the K-fold split
# further down, so vocabulary/IDF statistics also see each validation fold.
vectorizer = TfidfVectorizer(min_df=1, max_df=0.9)
X_train_tfidf = vectorizer.fit_transform(x_train['text']).toarray()

# float32 tensors: features for nn.Linear, targets for nn.BCELoss.
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

class NeuralNetworkPA1(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer. Defaults to 10, the value that
            was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, input_dim, hidden_dim=10):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid probabilities with a trailing axis of size 1.

        Args:
            x: Float tensor whose last axis has ``input_dim`` entries.
        """
        x = self.relu(self.layer1(x))
        x = self.sigmoid(self.layer2(x))
        return x

# 5-fold cross-validation of NeuralNetworkPA1 on the TF-IDF features.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 0
accuracy_scores = []     # validation accuracy after every epoch of every fold
final_epoch_scores = []  # validation accuracy after the last epoch of each fold

for train_index, val_index in kf.split(X_train_tensor):
    fold += 1
    X_train, X_val = X_train_tensor[train_index], X_train_tensor[val_index]
    # Note: this rebinds `y_train` (previously the label Series) to a tensor slice.
    y_train, y_val = y_train_tensor[train_index], y_train_tensor[val_index]

    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=10)

    # A fresh model and optimizer per fold so no weights carry over between folds.
    model = NeuralNetworkPA1(X_train_tensor.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.BCELoss()

    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # squeeze(1) removes only the output-feature axis. A bare squeeze()
            # would also drop the batch axis of a final one-sample batch, and
            # BCELoss rejects an input whose size differs from the target's.
            loss = criterion(outputs.squeeze(1), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in val_loader:
                outputs = model(inputs)
                predicted = (outputs.squeeze(1) >= 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total
            accuracy_scores.append(accuracy)
            print(f'Fold {fold}, Epoch {epoch + 1}, Val Accuracy: {accuracy:.2f}%')

    # `accuracy` now holds the last epoch's value: the score of the model this fold ends with.
    final_epoch_scores.append(accuracy)

# The first figure averages over all epochs of all folds (partially trained
# models included); the second averages only each fold's final-epoch score.
print(f'Average Val Accuracy: {np.mean(accuracy_scores):.2f}%')
print(f'Average Final-Epoch Val Accuracy: {np.mean(final_epoch_scores):.2f}%')


Fold 1, Epoch 1, Val Accuracy: 83.75%
Fold 1, Epoch 2, Val Accuracy: 83.75%
Fold 1, Epoch 3, Val Accuracy: 83.96%
Fold 1, Epoch 4, Val Accuracy: 82.92%
Fold 1, Epoch 5, Val Accuracy: 82.29%
Fold 1, Epoch 6, Val Accuracy: 81.88%
Fold 1, Epoch 7, Val Accuracy: 83.12%
Fold 1, Epoch 8, Val Accuracy: 81.25%
Fold 1, Epoch 9, Val Accuracy: 81.88%
Fold 1, Epoch 10, Val Accuracy: 81.88%
Fold 2, Epoch 1, Val Accuracy: 84.38%
Fold 2, Epoch 2, Val Accuracy: 83.12%
Fold 2, Epoch 3, Val Accuracy: 83.75%
Fold 2, Epoch 4, Val Accuracy: 82.71%
Fold 2, Epoch 5, Val Accuracy: 83.12%
Fold 2, Epoch 6, Val Accuracy: 83.12%
Fold 2, Epoch 7, Val Accuracy: 83.33%
Fold 2, Epoch 8, Val Accuracy: 83.75%
Fold 2, Epoch 9, Val Accuracy: 83.54%
Fold 2, Epoch 10, Val Accuracy: 83.75%
Fold 3, Epoch 1, Val Accuracy: 80.21%
Fold 3, Epoch 2, Val Accuracy: 81.25%
Fold 3, Epoch 3, Val Accuracy: 80.00%
Fold 3, Epoch 4, Val Accuracy: 80.62%
Fold 3, Epoch 5, Val Accuracy: 81.04%
Fold 3, Epoch 6, Val Accuracy: 78.75%
Fold 3, Ep

## Word Embeddings

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from collections import OrderedDict

# Load the first 100,000 GloVe rows: column 0 is the token (used as the index),
# the remaining columns are the embedding components. quoting=3 (QUOTE_NONE)
# keeps quote characters as literal tokens.
# keep_default_na=False stops pandas from converting tokens that spell a
# missing-value marker (e.g. "nan", "null", "n/a") into float NaN index labels,
# which would otherwise become non-string keys in word2vec.
word_embeddings = pd.read_csv('glove.6B.50d.txt.zip',
                               header=None, sep=' ', index_col=0,
                               nrows=100000, compression='zip', encoding='utf-8', quoting=3,
                               keep_default_na=False)
word_list = word_embeddings.index.values.tolist()
# Token -> embedding row, in file order.
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

class NeuralNetworkPA2(nn.Module):
    """Single-hidden-layer binary classifier: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer. Defaults to 20, the value that
            was previously fixed inside the constructor.
    """

    def __init__(self, input_dim, hidden_dim=20):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid probabilities with a trailing axis of size 1.

        Args:
            x: Float tensor whose last axis has ``input_dim`` entries.
        """
        x = self.relu(self.layer1(x))
        x = self.sigmoid(self.layer2(x))
        return x

def vectorize_sentence(sentence, embeddings=None):
    """Average the embedding vectors of the known words in ``sentence``.

    The sentence is lower-cased and split on whitespace; tokens missing from
    the lookup table are skipped. Punctuation attached to a word stays part of
    the token, so such tokens only match if the table has that exact string.

    Args:
        sentence: Text to embed.
        embeddings: Mapping from token to vector. Defaults to the module-level
            ``word2vec`` table.

    Returns:
        The element-wise mean of the matched vectors, or a zero vector when no
        token matches. The zero vector has the table's vector length (50 if
        the table is empty).
    """
    if embeddings is None:
        embeddings = word2vec

    vectors = [embeddings[token] for token in sentence.lower().split() if token in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)

    # Derive the fallback length from the table rather than assuming 50-d
    # vectors, so a different embedding file still yields consistent shapes.
    dim = len(next(iter(embeddings.values()))) if embeddings else 50
    return np.zeros(dim)

# Raw review texts and their binary sentiment labels (as a NumPy array).
df = pd.read_csv('./x_train.csv')
y = pd.read_csv('./y_train.csv')['is_positive_sentiment'].values

# One averaged word-embedding vector per review, stacked row-wise.
X = np.array(list(map(vectorize_sentence, df['text'])))

results = []

# float32 tensors: features for nn.Linear, targets for nn.BCELoss.
X_train_tensor = torch.tensor(X, dtype=torch.float32)
y_train_tensor = torch.tensor(y, dtype=torch.float32)

# 5-fold cross-validation of NeuralNetworkPA2 on the averaged word embeddings.
# Seed PyTorch here: weight initialisation and DataLoader shuffling draw from
# the global RNG, so without this the results depend on whatever ran earlier.
torch.manual_seed(42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 0
accuracy_scores = []     # validation accuracy after every epoch of every fold
final_epoch_scores = []  # validation accuracy after the last epoch of each fold

for train_index, val_index in kf.split(X_train_tensor):
    fold += 1
    X_train, X_val = X_train_tensor[train_index], X_train_tensor[val_index]
    y_train, y_val = y_train_tensor[train_index], y_train_tensor[val_index]

    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=10)

    # A fresh model and optimizer per fold so no weights carry over between folds.
    model = NeuralNetworkPA2(X_train_tensor.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.BCELoss()

    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            # squeeze(1) removes only the output-feature axis. A bare squeeze()
            # would also drop the batch axis of a final one-sample batch, and
            # BCELoss rejects an input whose size differs from the target's.
            loss = criterion(outputs.squeeze(1), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in val_loader:
                outputs = model(inputs)
                predicted = (outputs.squeeze(1) >= 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total
            accuracy_scores.append(accuracy)
            print(f'Fold {fold}, Epoch {epoch + 1}, Val Accuracy: {accuracy:.2f}%')

    # `accuracy` now holds the last epoch's value: the score of the model this fold ends with.
    final_epoch_scores.append(accuracy)

# The first figure averages over all epochs of all folds (partially trained
# models included); the second averages only each fold's final-epoch score.
print(f'Average Val Accuracy: {np.mean(accuracy_scores):.2f}%')
print(f'Average Final-Epoch Val Accuracy: {np.mean(final_epoch_scores):.2f}%')


Fold 1, Epoch 1, Val Accuracy: 69.58%
Fold 1, Epoch 2, Val Accuracy: 65.62%
Fold 1, Epoch 3, Val Accuracy: 71.25%
Fold 1, Epoch 4, Val Accuracy: 69.58%
Fold 1, Epoch 5, Val Accuracy: 70.00%
Fold 1, Epoch 6, Val Accuracy: 71.67%
Fold 1, Epoch 7, Val Accuracy: 72.08%
Fold 1, Epoch 8, Val Accuracy: 69.38%
Fold 1, Epoch 9, Val Accuracy: 70.83%
Fold 1, Epoch 10, Val Accuracy: 72.92%
Fold 2, Epoch 1, Val Accuracy: 70.21%
Fold 2, Epoch 2, Val Accuracy: 69.79%
Fold 2, Epoch 3, Val Accuracy: 70.00%
Fold 2, Epoch 4, Val Accuracy: 65.62%
Fold 2, Epoch 5, Val Accuracy: 71.04%
Fold 2, Epoch 6, Val Accuracy: 72.92%
Fold 2, Epoch 7, Val Accuracy: 70.42%
Fold 2, Epoch 8, Val Accuracy: 71.04%
Fold 2, Epoch 9, Val Accuracy: 65.21%
Fold 2, Epoch 10, Val Accuracy: 70.21%
Fold 3, Epoch 1, Val Accuracy: 66.04%
Fold 3, Epoch 2, Val Accuracy: 74.17%
Fold 3, Epoch 3, Val Accuracy: 74.38%
Fold 3, Epoch 4, Val Accuracy: 76.46%
Fold 3, Epoch 5, Val Accuracy: 69.17%
Fold 3, Epoch 6, Val Accuracy: 74.17%
Fold 3, Ep