# Lab 03

This project is a continuation of what we started in the second lab. You will train a logistic regression classifier on manually extracted features.

# Features (6 points)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
from torch import nn

## Loading the dataset

We load the dataset from the previous notebook.

In [2]:
from datasets import Dataset
from datasets import load_dataset
from datasets import load_dataset_builder

# Builder object for the IMDB dataset (metadata only, no examples loaded).
dataset = load_dataset_builder("imdb")

# Load the three IMDB splits in a fixed order: train, test, unsupervised.
dataset_train, dataset_test, dataset_unsupervised = (
    load_dataset("imdb", split=split_name)
    for split_name in ('train', 'test', 'unsupervised')
)

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Found cached dataset imdb (/home/timothee/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/timothee/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Found cached dataset imdb (/home/timothee/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [3]:
# Characters that get replaced by a space during preprocessing.
# '!' and '-' are deliberately absent from this list.
punctuation_filter = list('"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')

def to_lower_case(row: dict) -> dict:
    """
    Convert row['text'] to lower case.
    The row is modified in place.
    return: the same row, updated
    """
    lowered = row['text'].lower()
    row['text'] = lowered
    return row

def remove_punctuation(row: dict) -> dict:
    """
    Replace every character listed in punctuation_filter by a space in
    row['text'], then surround each '!' with spaces so it stands apart
    from the neighbouring words.
    The row is modified in place.
    return: the same row, updated
    """
    # Single-pass translation table: each filtered character maps to one space.
    to_space = {ord(symbol): ' ' for symbol in punctuation_filter}
    cleaned = row['text'].translate(to_space)
    row['text'] = cleaned.replace('!', ' ! ')
    return row

def preprocessing(row: dict) -> dict:
    """
    Full text clean-up for one row: punctuation from punctuation_filter
    is replaced by spaces first, then the text is lower-cased.
    return: updated row
    """
    without_punctuation = remove_punctuation(row)
    return to_lower_case(without_punctuation)

In [4]:
# Each raw split feeds its own preprocessed counterpart
# (train -> train, test -> test); the two were previously swapped.
preprocess_train = dataset_train.map(preprocessing)
preprocess_test = dataset_test.map(preprocessing)
preprocess_unsupervised = dataset_unsupervised.map(preprocessing)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Features (6 points)

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import math

## Get positive and negative lexicons

In [7]:
# Read the lexicon file. The delimiter is written as the '\t' escape rather
# than a literal tab so it stays visible and survives tab-to-space conversion.
data = pd.read_csv('vader_lexicon.txt', delimiter='\t', names=['token', 'sentiment', 'A', 'B'])
# Only the token and its sentiment score are used below.
data = data.drop(columns=['A', 'B'])
# Tokens scoring at least +1 form the positive lexicon, at most -1 the negative one.
positive = data.loc[data.sentiment >= 1, 'token'].tolist()
negative = data.loc[data.sentiment <= -1, 'token'].tolist()

## Vectorize text
For every given text, we want to generate a vector with the features seen in class.

(6 points) Code the following features:

- 1 if "no" appears in the document, 0 otherwise.
- The count of first- and second-person pronouns in the document.
- 1 if "!" is in the document, 0 otherwise.
- Log(word count in the document).
- Number of words in the document which are in the positive lexicon.
- Number of words in the document which are in the negative lexicon.

In [None]:
# First- and second-person pronouns. They are written in lower case because
# the preprocessing step lower-cases every document before vectorization
# (an upper-case 'I' could never match).
pronouns = ['i', 'me', 'my', 'mine', 'myself',
            'we', 'us', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves']


def Vectorizer(documents: list[str], positive_lexicon=None, negative_lexicon=None) -> list[list[float]]:
    """
    Turn each document into a vector of six hand-crafted features.

    Feature order:
        0. 1 if the token "no" appears in the document, 0 otherwise.
        1. Number of tokens that are first- or second-person pronouns.
        2. 1 if the character "!" appears in the document, 0 otherwise.
        3. log10 of the number of tokens (0.0 for a document with no tokens).
        4. Number of tokens found in the positive lexicon.
        5. Number of tokens found in the negative lexicon
           (a token found in both lexicons is counted as positive only).

    Args:
        documents: preprocessed texts; tokens are obtained with str.split().
        positive_lexicon: iterable of positive tokens. Defaults to the
            module-level ``positive`` list.
        negative_lexicon: iterable of negative tokens. Defaults to the
            module-level ``negative`` list.
    Returns:
        One six-element feature list per document, in input order.
    """
    # Resolve the defaults at call time so the lexicons loaded earlier in the
    # notebook are used unless the caller supplies its own.
    if positive_lexicon is None:
        positive_lexicon = positive
    if negative_lexicon is None:
        negative_lexicon = negative

    # Sets make each membership test constant-time instead of a list scan.
    pronoun_set = set(pronouns)
    positive_set = set(positive_lexicon)
    negative_set = set(negative_lexicon)

    X = []
    for doc in documents:
        words = doc.split()

        positive_count = 0
        negative_count = 0
        pronoun_count = 0
        for word in words:
            if word in pronoun_set:
                pronoun_count += 1
            if word in positive_set:
                positive_count += 1
            elif word in negative_set:
                negative_count += 1

        X.append([
            # 1 if "no" appears in the document, 0 otherwise.
            1 if "no" in words else 0,
            # The count of first- and second-person pronouns in the document.
            pronoun_count,
            # 1 if "!" is in the document, 0 otherwise (checked on the raw string).
            1 if "!" in doc else 0,
            # Log(word count in the document); log10(0) is undefined, so a
            # document without tokens gets 0.0 instead of raising.
            math.log10(len(words)) if words else 0.0,
            # Positive and negative lexicon hits.
            positive_count,
            negative_count,
        ])
    return X

In [None]:
%%time
dataset = Vectorizer(preprocess_train['text'])
dataset[:10]

In [None]:
# Feature vectors as a float32 tensor, one row per document.
all_points = torch.tensor(dataset, dtype=torch.float32)
# Labels are used as stored (no rescaling to [-1, 1]) and reshaped into a
# single column.
pre_labels = np.array(preprocess_train['label'])
labels = torch.tensor(pre_labels, dtype=torch.float32).reshape(-1, 1)
print('Train dataset:', all_points.shape)

# Both splits hold out 20% of their input and share the same seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# First split: carve a held-out test part out of the full set.
X_train, X_test, y_train, y_test = train_test_split(
    all_points, labels, stratify=labels, **split_options
)
# Second split: carve a validation part out of the remaining training part.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

print('X_train len:', X_train.shape)
print('X_test len:', X_test.shape)
print('X_valid len:', X_valid.shape)


# Logistic regression classifier (6 points)

In [None]:
class LinearRegression(nn.Module):
    """A single fully connected layer that returns its raw, un-activated output."""

    def __init__(self, input_dim: int, nb_classes: int) -> None:
        """
        Build the layer.

        Args:
            input_dim: number of features in each input row.
            nb_classes: number of output values produced per row.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=nb_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply the linear layer to the input.

        Args:
            x: the input tensor.
        Returns:
            The layer's output; no activation function is applied here.
        """
        scores = self.linear(x)
        return scores

# Training the model

In [None]:
# Seed PyTorch before building the model so the randomly initialised weights
# (and therefore the whole training run) are reproducible; 42 matches the
# random_state used for the data splits.
torch.manual_seed(42)
# Six input features, one output value per document.
model = LinearRegression(6, 1)
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
# Stochastic gradient descent with L2 regularisation through weight_decay.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.5)

In [None]:
%%time

n_epochs = 10000

# Keeping an eye on the losses
train_losses = []
test_losses = []

# Training loop
for epoch in range(n_epochs):
    # Setting all gradients to zero.
    optimizer.zero_grad()

    # Sending the whole training set through the model.
    predictions = model(X_train)
    # Computing the loss.
    loss = criterion(predictions, y_train)
    train_losses.append(loss.item())
    if epoch % 100 == 0:
        print(loss)
    # Computing the gradients and gradient descent.
    loss.backward()
    optimizer.step()

    # When computing the validation loss, we do not want to update the weights.
    # torch.no_grad tells PyTorch to not save the necessary data used for
    # gradient descent.
    with torch.no_grad():
        predictions = model(X_valid)
        loss = criterion(predictions, y_valid)
        test_losses.append(loss)

In [None]:
# Checking the losses
plt.plot(np.arange(len(train_losses)), train_losses, label="Training loss")
# test_losses is filled from X_valid / y_valid in the training loop, so this
# curve is the validation loss, not a test loss.
plt.plot(np.arange(len(test_losses)), test_losses, label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend();

In [None]:
def predict_labels(features: torch.Tensor) -> np.ndarray:
    """Run the model on `features`, apply a sigmoid and round the result."""
    # The model returns logits, so a sigmoid is needed before rounding.
    with torch.no_grad():
        probabilities = torch.sigmoid(model(features))
    return np.round(probabilities.numpy())


# Accuracy = share of rows where the rounded prediction equals the label.
p_train = predict_labels(X_train)
training_accuracy = np.mean(p_train == y_train.numpy())

p_valid = predict_labels(X_valid)
valid_accuracy = np.mean(p_valid == y_valid.numpy())

p_test = predict_labels(X_test)
test_accuracy = np.mean(p_test == y_test.numpy())

print(training_accuracy, valid_accuracy, test_accuracy)

In [None]:
# Score a hand-written review: clean it, extract its features, then pass the
# features through the model and a sigmoid.
sample = 'This film was a very good one! I really like it.'
pre_sample = preprocessing({'text': sample})
print(pre_sample)

vec_sample = Vectorizer([pre_sample['text']])
print(vec_sample)

sample_tensor = torch.tensor(vec_sample, dtype=torch.float32)
torch.sigmoid(model(sample_tensor))

In [None]:
# Show the learned values of every model parameter, one per line.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.data)