<a href="https://colab.research.google.com/github/sioulruble/movie_rater_IMDB/blob/main/movie_rater.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Movie Rater project using different deep learning architectures

1. Data Preprocessing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the IMDB review dataset from a CSV file in the working directory.
df = pd.read_csv('IMDB_Dataset.csv')

print(f"Nombre total de données : {len(df)}")

# Class balance: count how many reviews carry each sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print(f"Nombre de critiques positives : {sentiment_counts['positive']}")
print(f"Nombre de critiques négatives : {sentiment_counts['negative']}")

# Features are the raw review texts; labels are encoded as
# positive -> 0 and negative -> 1 (so class index 1 means "negative").
x = df['review']
y = df['sentiment'].map({'positive': 0, 'negative': 1})

# Series.map returns NaN for any value missing from the mapping; fail here with
# a clear message instead of later during the integer cast of the labels.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment label(s): {unexpected}")

# 80/20 train/test split with a fixed seed so the split is reproducible.
# (train_test_split is already imported at the top of this cell.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

Nombre total de données : 50000
Nombre de critiques positives : 25000
Nombre de critiques négatives : 25000
(40000,) (10000,) (40000,) (10000,)


2. Tokenizer

In [None]:
from transformers import AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Small demo of plain tokenization, truncation and padding. A dedicated name is
# used so the review Series `x` from the preprocessing cell is not overwritten.
sample_text = 'Salut ! Comment ca va chef ?'
print("tokenization", tokenizer(sample_text)['input_ids'])
print("5 tokens max per sequence",tokenizer(sample_text, truncation=True, max_length=5)['input_ids'])
print("15 tokens max per sequence",tokenizer(sample_text, padding='max_length', truncation=True, max_length=15)['input_ids'])

#tokenize the training and test dataset
# Every review is padded/truncated to exactly max_len token ids.
max_len=100
# One batched call per split instead of one tokenizer call per review: the
# result is the same list of id lists, without the per-call Python overhead.
x_train_tokenized = tokenizer(list(x_train), padding='max_length', truncation=True, max_length=max_len)['input_ids']
x_test_tokenized = tokenizer(list(x_test), padding='max_length', truncation=True, max_length=max_len)['input_ids']



tokenization [19221, 315, 5145, 18957, 1275, 46935, 21221, 5633]
5 tokens max per sequence [19221, 315, 5145, 18957, 1275]
15 tokens max per sequence [19221, 315, 5145, 18957, 1275, 46935, 21221, 5633, 50256, 50256, 50256, 50256, 50256, 50256, 50256]


In [None]:
import numpy as np
# Wrap the tokenized reviews and their integer labels in a TensorDataset.
trainset = torch.utils.data.TensorDataset(
    torch.tensor(x_train_tokenized),
    torch.tensor(np.array(y_train, dtype=np.int64)),
)

# Inspect the first training example: token ids, label, and decoded text.
# Dedicated names are used so the label Series `y` is not overwritten.
sample_ids, sample_label = trainset[0]
print(sample_ids, sample_label)
print( tokenizer.decode(sample_ids) )

# shuffle=True re-orders the samples every epoch; without it the model sees
# the exact same batches in the exact same order on every pass.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)

# Tokenizer size including added tokens vs. the base vocabulary size.
print( len( tokenizer ))
print( tokenizer.vocab_size )



tensor([ 2504,   338,   644,   314,  4030,  4737,  3589,  1141,   262,   867,
        11418,    11, 14788,  7466,    11, 38372,   290,  2276, 43744,   326,
        29298,   378,   262,  9508,  2431,    13,   383, 17909,   635,  1302,
          510,   618,   345,   892,   286,   262,   530,    12, 19577,  3435,
           11,   508,   423,   523,  1310,  6795,   326,   340,   318,  9826,
         5340,   284,  1337,   644,  4325,   284,   606,    13,  1119,   389,
          655, 11234,  3194,  3075,    79,  7084,   329,   262,  3437,   284,
         8181,   465, 34641,  9056,   319,    11,   257,  7243,   326,   468,
          587,  1760,   881,  1365,   287,   584, 43972,  1111,   319,  3195,
          290,   262, 22041, 29847,  1671,  1220,  6927,  1671, 11037,    40]) tensor(1)
That's what I kept asking myself during the many fights, screaming matches, swearing and general mayhem that permeate the 84 minutes. The comparisons also stand up when you think of the one-dimensional charact

In [None]:
# Shape walkthrough: push a random batch of token ids through an embedding
# layer followed by a unidirectional, batch-first LSTM.
emb1 = torch.nn.Embedding(num_embeddings=50257, embedding_dim=128)
emb_example = torch.nn.Embedding(num_embeddings=50257, embedding_dim=300)
lstm_example = torch.nn.LSTM(
    input_size=300,
    hidden_size=100,
    batch_first=True,
    bidirectional=False,
)

# Fake batch: 32 sequences of 100 token ids drawn from [0, 50257).
x = torch.randint(low=0, high=50257, size=(32, 100))
print(x.shape, x.dtype)

# The LSTM returns the per-step outputs plus a pair of state tensors.
x, state = lstm_example(emb_example(x))
print(x.shape, *(part.shape for part in state))

torch.Size([32, 100]) torch.int64
torch.Size([32, 100, 100]) torch.Size([1, 32, 100]) torch.Size([1, 32, 100])


LSTM-based Sequence Classification Model




In [None]:
from torchsummary import summary

class Net1(torch.nn.Module):
  """LSTM-based sequence classifier.

  Pipeline: token embedding -> single-layer unidirectional LSTM -> mean over
  the sequence dimension -> dropout -> linear layer producing class logits.

  Args:
    vocab_size: number of rows in the embedding table (default 50257).
    embed_dim: size of each token embedding (default 128).
    hidden_dim: LSTM hidden size, also the input size of the linear head
      (default 128).
    num_classes: number of output logits (default 2).
    dropout: dropout probability applied before the linear head (default 0.5).

  The defaults reproduce the original hard-coded architecture, and the
  sub-module names are unchanged so existing state dicts still load.
  """

  def __init__(self, vocab_size=50257, embed_dim=128, hidden_dim=128, num_classes=2, dropout=0.5):
    super().__init__()
    self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
    self.LSTM = torch.nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=False, num_layers=1)
    self.drop = torch.nn.Dropout(dropout)
    self.linear1 = torch.nn.Linear(hidden_dim, num_classes)

  def forward(self, x):
    """Return raw class logits for a batch of token-id sequences."""
    x = self.embedding(x)
    x, _ = self.LSTM(x)
    # Average the LSTM outputs over the sequence dimension (dim=1).
    # NOTE(review): padded positions are included in this mean; confirm this
    # is intended for heavily padded (short) inputs.
    x = torch.mean(x, dim=1)
    x = self.drop(x)
    x = self.linear1(x)
    return x

  def _inference_logits(self, x):
    """Run forward without gradients and with dropout disabled.

    The module is switched to eval mode for the call and its previous
    train/eval mode is restored afterwards, so calling this in the middle of
    training does not silently change the training state.
    """
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        return self.forward(x)
    finally:
      self.train(was_training)

  def predict(self, x):
    """Return the index of the highest-scoring class for each sequence."""
    return torch.argmax(self._inference_logits(x), dim=1)

  def predict_proba(self, x):
    """Return softmax class probabilities for each sequence."""
    return torch.softmax(self._inference_logits(x), dim=1)

  # Instantiate the model
# Place the model on the GPU when one is available and fall back to the CPU
# otherwise, instead of crashing on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net1().to(device)


In [None]:
import torch.optim as optim
from tqdm import tqdm

# Train on whatever device the model already lives on, so the loop works on
# both GPU and CPU-only machines.
device = next(model.parameters()).device

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 10

for epoch in range(num_epochs):
    # Enable training behaviour (dropout active) at the start of each epoch.
    model.train()
    total_loss = 0.0
    for data, target in tqdm(trainloader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(trainloader)}")

# Switch to eval mode once training is done so that later inference cells do
# not run with dropout still active.
model.eval()

torch.save(model.state_dict(), "movie_rater_model.pth")

100%|██████████| 1250/1250 [00:13<00:00, 89.61it/s] 


Epoch 1/10, Loss: 0.5088964509606362


100%|██████████| 1250/1250 [00:05<00:00, 214.10it/s]


Epoch 2/10, Loss: 0.33413186519145965


100%|██████████| 1250/1250 [00:06<00:00, 194.83it/s]


Epoch 3/10, Loss: 0.2360781230777502


100%|██████████| 1250/1250 [00:05<00:00, 213.32it/s]


Epoch 4/10, Loss: 0.14568036005795001


100%|██████████| 1250/1250 [00:06<00:00, 195.01it/s]


Epoch 5/10, Loss: 0.08437187575995922


100%|██████████| 1250/1250 [00:05<00:00, 213.29it/s]


Epoch 6/10, Loss: 0.058915777200507


100%|██████████| 1250/1250 [00:06<00:00, 193.63it/s]


Epoch 7/10, Loss: 0.044435661974782124


100%|██████████| 1250/1250 [00:05<00:00, 213.75it/s]


Epoch 8/10, Loss: 0.024284927038384193


100%|██████████| 1250/1250 [00:06<00:00, 195.77it/s]


Epoch 9/10, Loss: 0.01627169949197978


100%|██████████| 1250/1250 [00:05<00:00, 214.88it/s]

Epoch 10/10, Loss: 0.011200788322390872





In [None]:

# Send inputs to the device the model is actually on.
device = next(model.parameters()).device

# Inference must run with dropout disabled.
model.eval()


def encode_review(text):
    """Tokenize one review like the training data and return a (1, max_len) id tensor on `device`."""
    ids = tokenizer(text, padding='max_length', truncation=True, max_length=max_len)['input_ids']
    return torch.tensor(ids).unsqueeze(0).to(device)


sample_reviews = [
    'The movie was horrible!, the worst movie I have seen',
    'The movie was incredible!, the best movie I have seen',
    'The movie was very good',
    'The movie was not very good',
]

# Each printed row is [P(class 0), P(class 1)]; with the label encoding used in
# preprocessing, class 0 is "positive" and class 1 is "negative".
for text in sample_reviews:
    x = encode_review(text)
    print(model.predict_proba(x))

tensor([[1.0786e-20, 1.0000e+00]], device='cuda:0')
tensor([[9.9999e-01, 7.3054e-06]], device='cuda:0')
tensor([[0.0111, 0.9889]], device='cuda:0')
tensor([[5.1797e-04, 9.9948e-01]], device='cuda:0')
