In [1]:
import pandas as pd
import numpy as np


In [5]:
df_reduced=pd.read_csv('/content/imdb_reduced.csv')

In [6]:
df_reduced.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,"Petter Mattei's ""Love in the Time of Money"" is...",positive
4,"Probably my all-time favorite movie, a story o...",positive


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
vocab = set()
max_seq_len = 0
for text in df_reduced['review']:
    vocab.update(text)
    max_seq_len = max(max_seq_len, len(text))
char_to_index = {char: i+1 for i, char in enumerate(vocab)}
char_to_index['<PAD>'] = 0
sequences = []
for text in df['review']:
    seq = [char_to_index[char] for char in text]
    seq += [char_to_index['<PAD>']] * (max_seq_len - len(seq))
    sequences.append(seq)



In [11]:
len(sequences)

100

In [13]:
len(vocab)

80

In [14]:
max_seq_len

390

In [27]:
from sklearn.preprocessing import LabelEncoder

# Create label encoder
label_encoder = LabelEncoder()

# Encode the sentiment labels
encoded_labels = label_encoder.fit_transform(df_reduced['sentiment'])

# Convert the sequences to a PyTorch tensor
data = torch.tensor(sequences, dtype=torch.long)
labels = torch.tensor(encoded_labels, dtype=torch.float)

In [28]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=42)


In [29]:
# Define the CNN model
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_filters, filter_sizes, output_size):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_size, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_size)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(0, 2, 1)
        conved = [nn.functional.relu(conv(embedded)) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [30]:
# Set the hyperparameters
vocab_size = len(vocab) + 1
embed_size = 100
num_filters = 100
filter_sizes = [3, 4, 5]
output_size = 1
learning_rate = 0.001
num_epochs = 10


In [31]:
model = CNN(vocab_size, embed_size, num_filters, filter_sizes, output_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()


In [35]:
# Train the model
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, inputs in enumerate(train_data):
        inputs = inputs.unsqueeze(0)  # Add an extra dimension
        targets = train_labels[i].unsqueeze(0).unsqueeze(0)  # Ensure targets have the same shape as outputs
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_data)}')


Epoch 1/10, Loss: 0.9921326019510162
Epoch 2/10, Loss: 0.77435418679961
Epoch 3/10, Loss: 0.8027823110343888
Epoch 4/10, Loss: 0.7060855770163471
Epoch 5/10, Loss: 0.6375900574610569
Epoch 6/10, Loss: 0.5690484489143273
Epoch 7/10, Loss: 0.34818496799907733
Epoch 8/10, Loss: 0.6165876677310962
Epoch 9/10, Loss: 0.512820607995127
Epoch 10/10, Loss: 0.4536109174431601


In [36]:
test_predictions = []
with torch.no_grad():
    for inputs in test_data:
        outputs = model(inputs.unsqueeze(0))
        predictions = torch.round(torch.sigmoid(outputs))
        test_predictions.extend(predictions.tolist())

In [38]:
from sklearn.metrics import classification_report
report = classification_report(test_labels, test_predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.31      0.50      0.38         8
         1.0       0.43      0.25      0.32        12

    accuracy                           0.35        20
   macro avg       0.37      0.38      0.35        20
weighted avg       0.38      0.35      0.34        20

