* Sentiment analysis
* Baseline: Logistic Regression
* IMDB dataset

* From v0.
* The main change is using ngram of 2 and max_features=20000

# Import libraries

In [1]:
# for reloading any changed module
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch

In [3]:
from pathlib import Path
import sys
import os

In [4]:
from torch.utils.data import DataLoader

In [5]:
Path.cwd().parent.parent.parent

PosixPath('/media/skesava/D/Training/MachineLearning')

In [6]:
data_directory = Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb")

In [7]:
if not data_directory.exists():
    raise FileNotFoundError()

In [8]:
processed_data_directory = data_directory.joinpath("processed_data")

In [9]:
processed_data_directory.exists()

True

In [10]:
df_train = pd.read_csv(processed_data_directory.joinpath("train.csv"))
df_train.head(5)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
1,"The Railway Children, at least this 1970 movie...",1
2,Anatomie was a German made Movie and subtitled...,0
3,This film makes Clooney. All his films combine...,1
4,This is easily the best cinematic version of W...,1


In [11]:
df_train.tail(5)

Unnamed: 0,text,label
34995,This movie is well done. It really attempts to...,1
34996,Ever wonder where the ideas for romance novels...,1
34997,"How truly friendly, charming and cordial is th...",1
34998,Very good 1939 film where John Garfield plays ...,1
34999,I couldn't disagree more with those who says t...,1


In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    35000 non-null  object
 1   label   35000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 547.0+ KB


In [13]:
df_test = pd.read_csv(processed_data_directory.joinpath("test.csv"))
df_test.head(5)

Unnamed: 0,text,label
0,"I have to say, when ""Pushing Daisies"" came out...",1
1,"Bah. Another tired, desultory reworking of an ...",0
2,It's a shame that Asterix and his buddy Obelix...,1
3,this is the worst movie ive ever seen. And i h...,0
4,The comments for Commune make it sound like a ...,0


In [14]:
df_val = pd.read_csv(processed_data_directory.joinpath("val.csv"))
df_val.head(5)

Unnamed: 0,text,label
0,This is one of the funniest movies I've ever s...,1
1,One of my favorite westerns and one of John Fo...,1
2,I felt I had to add a comment after seeing the...,0
3,Not to be mistaken as the highly touted Samuel...,0
4,I watched the entire movie recognizing the par...,0


# Bag of Words model

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
max_input_features = 20000

In [17]:
cv = CountVectorizer(encoding='utf-8', 
                     lowercase=True, 
                     stop_words='english',
                     max_features=max_input_features,
                     ngram_range=(2,2)                     
                    )

### FIT

In [18]:
cv.fit(df_train['text'])

In [19]:
cv.vocabulary_

{'robert duvall': 15437,
 'according imdb': 215,
 'imdb com': 8860,
 'seeing film': 16051,
 'film think': 6688,
 'favorite films': 5941,
 'true life': 18377,
 'moment time': 11963,
 'time life': 18026,
 'moves slowly': 12044,
 'railway children': 14758,
 'movie version': 12796,
 'written directed': 19807,
 'long time': 10896,
 'character actor': 2981,
 'story great': 17058,
 'production br': 14613,
 'br br': 1879,
 'small village': 16609,
 'train station': 18321,
 'important story': 8881,
 'story br': 17011,
 'br time': 2483,
 'time passes': 18057,
 'earn money': 5094,
 'story children': 17016,
 'want share': 18980,
 'poor man': 14344,
 'speak english': 16762,
 'wife child': 19437,
 'come home': 3434,
 'takes long': 17456,
 'young man': 19957,
 'birthday party': 1599,
 'happy endings': 8289,
 'br problems': 2310,
 'mother father': 12026,
 'taken away': 17440,
 'happy ending': 8288,
 'ending br': 5310,
 '14 year': 51,
 'year old': 19860,
 'heart story': 8430,
 'make movie': 11341,
 'mov

In [20]:
cv.vocabulary_.items()



In [21]:
sorted(cv.vocabulary_.items(), key=lambda x: x[1], reverse=True)

[('zombies br', 19999),
 ('zombie movies', 19998),
 ('zombie movie', 19997),
 ('zombie like', 19996),
 ('zombie flicks', 19995),
 ('zombie flick', 19994),
 ('zombie films', 19993),
 ('zombie film', 19992),
 ('zombie bloodbath', 19991),
 ('zodiac killer', 19990),
 ('zhang yimou', 19989),
 ('zeta jones', 19988),
 ('zero stars', 19987),
 ('zero mostel', 19986),
 ('zero day', 19985),
 ('zero budget', 19984),
 ('zelah clarke', 19983),
 ('zabriskie point', 19982),
 ('yul brynner', 19981),
 ('yr old', 19980),
 ('youngest son', 19979),
 ('younger woman', 19978),
 ('younger viewers', 19977),
 ('younger sister', 19976),
 ('younger man', 19975),
 ('younger generation', 19974),
 ('younger days', 19973),
 ('younger children', 19972),
 ('younger brother', 19971),
 ('younger audience', 19970),
 ('young women', 19969),
 ('young woman', 19968),
 ('young viewers', 19967),
 ('young victoria', 19966),
 ('young stars', 19965),
 ('young son', 19964),
 ('young person', 19963),
 ('young people', 19962),
 ('yo

#### NOTE

* The value of a key is the word's position in the list of selected words within the `max_threshold` by CountVectorizer, and not the number of times it occurs.

### Transform

In [22]:
X_train = cv.transform(df_train['text'])

In [23]:
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (1, 20000)>

In [24]:
X_train[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [25]:
X_train[0].todense()[0]

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [26]:
np.unique(X_train[0].todense(), return_counts=True)

(matrix([[0, 0, 0, ..., 1, 1, 1]]), array([0, 0, 0, ..., 1, 1, 1]))

In [27]:
np.nonzero(X_train[0].todense())
# first array: row index; # second array: column index

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([  215,  5941,  6688,  8860, 11963, 12044, 15437, 16051, 18026,
        18377]))

In [28]:
# number of words
np.nonzero(X_train[0].todense())[0].size

10

In [29]:
cv.inverse_transform(X_train[0])

[array(['according imdb', 'favorite films', 'film think', 'imdb com',
        'moment time', 'moves slowly', 'robert duvall', 'seeing film',
        'time life', 'true life'], dtype='<U25')]

In [30]:
# Test
X_test = cv.transform(df_test['text'])
X_test[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [31]:
# ValidationY
X_val = cv.transform(df_val['text'])
X_val[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [32]:
type(X_train)

scipy.sparse._csr.csr_matrix

# torch Dataset and Dataloader

In [33]:
from local_dataset_utilities import IMDBDataset

#### Train dataset

In [34]:
train_dataset = IMDBDataset(X_train.todense(), df_train['label'].to_numpy())

#### Test dataset

In [35]:
test_dataset = IMDBDataset(X_test.todense(), df_test['label'].to_numpy())

#### Validation dataset

In [36]:
val_dataset = IMDBDataset(X_val.todense(), df_val['label'].to_numpy())

### Dataloaders

#### Batch size

In [37]:
batch_size = 32

In [38]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

##### Testing

In [39]:
for test_features, test_labels in train_dataloader:
    break

In [40]:
test_features.shape

torch.Size([32, 20000])

In [41]:
test_labels.shape

torch.Size([32])

In [42]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [43]:
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model definitions

## Pytorch model

In [44]:
from pytorch_model import PytorchLogReg

In [45]:
logreg_model = PytorchLogReg(num_features=max_input_features, num_classes=2)

## Lightning model

In [46]:
from pytorch_model import LightningModel

In [47]:
learning_rate = 0.01

In [48]:
Logreg_L_Model = LightningModel(pytorch_model=logreg_model, learning_rate=learning_rate)

## Callbacks

### Model checkpoint

In [49]:
from lightning.pytorch.callbacks import ModelCheckpoint

In [50]:
Path.cwd()

PosixPath('/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis')

In [51]:
modelcheckpoints_dir = Path.cwd().joinpath("model_checkpoints")

In [52]:
if not modelcheckpoints_dir.exists():
    modelcheckpoints_dir.mkdir()

#### Set model checkpoint folder

In [53]:
import ipynbname

In [54]:
ipynbname.name()

'SA_LogReg_v2'

In [55]:
model_checkpoint_folder = modelcheckpoints_dir.joinpath(f"{ipynbname.name()}")
model_checkpoint_folder

PosixPath('/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v2')

In [56]:
if not model_checkpoint_folder.exists():
    model_checkpoint_folder.mkdir()

In [57]:
model_checkpoint = ModelCheckpoint(dirpath=model_checkpoint_folder,
                                  filename="{epoch}-{val_acc:.2f}",
                                  save_top_k=1,
                                  save_last=True,
                                  monitor="val_acc",
                                  mode="max"
                                  )

In [58]:
callbacks = [model_checkpoint]

### ProgressBar

In [59]:
from pytorch_model import MyProgressBar

In [60]:
callbacks.append(MyProgressBar())

In [61]:
callbacks

[<lightning.pytorch.callbacks.model_checkpoint.ModelCheckpoint at 0x7fcdec8dcb90>,
 <pytorch_model.MyProgressBar at 0x7fcdec94ced0>]

# Model Training

## Logger

In [62]:
from lightning.pytorch.loggers import CSVLogger

In [63]:
logs_dir = Path.cwd().joinpath("logs")

In [64]:
if not logs_dir.exists():
    logs_dir.mkdir()

In [65]:
logging_folder = logs_dir.joinpath(ipynbname.name())

In [66]:
if not logging_folder.exists():
    logging_folder.mkdir()

## Lightning Trainer

In [67]:
from lightning import Trainer

In [68]:
max_epochs = 20

### Quick bug test

In [69]:
trainer = Trainer(logger=CSVLogger(save_dir=logging_folder, name="bow_model_v0"),
                 callbacks=callbacks,
                 accelerator="cpu",
                  devices="auto",
                 deterministic=True,
                 max_epochs=max_epochs,
                 fast_dev_run=True # testing for any bugs
                 )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [70]:
trainer.fit(model=Logreg_L_Model, 
            train_dataloaders=train_dataloader, 
            val_dataloaders=val_dataloader
           )


  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | PytorchLogReg      | 40.0 K | train
1 | train_acc | MulticlassAccuracy | 0      | train
2 | val_acc   | MulticlassAccuracy | 0      | train
3 | test_acc  | MulticlassAccuracy | 0      | train
---------------------------------------------------------
40.0 K    Trainable params
0         Non-trainable params
40.0 K    Total params
0.160     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.ven

Epoch 0: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 38.76it/s]
Epoch 0: 100%|█| 1/1 [00:00<00:00, 25.76it/s, val_loss=0.694, val_acc=0.344, tra[A

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|█| 1/1 [00:00<00:00, 24.80it/s, val_loss=0.694, val_acc=0.344, tra


## Training

In [71]:
trainer = Trainer(logger=CSVLogger(save_dir=logging_folder, name="bow_model_v0"),
                 callbacks=callbacks,
                 accelerator="cpu",
                  devices="auto",
                 deterministic=True,
                 max_epochs=max_epochs,
                 fast_dev_run=False
                 )

Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [72]:
trainer.fit(model=Logreg_L_Model, 
            train_dataloaders=train_dataloader, 
            val_dataloaders=val_dataloader
           )


  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | PytorchLogReg      | 40.0 K | train
1 | train_acc | MulticlassAccuracy | 0      | train
2 | val_acc   | MulticlassAccuracy | 0      | train
3 | test_acc  | MulticlassAccuracy | 0      | train
---------------------------------------------------------
40.0 K    Trainable params
0         Non-trainable params
40.0 K    Total params
0.160     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|████████████████████| 1094/1094 [00:08<00:00, 133.56it/s, v_num=0]
Epoch 1: 100%|█| 1094/1094 [00:09<00:00, 109.50it/s, v_num=0, val_loss=0.650, va[A
Epoch 2: 100%|█| 1094/1094 [00:09<00:00, 118.00it/s, v_num=0, val_loss=0.622, va[A
Epoch 3: 100%|█| 1094/1094 [00:11<00:00, 95.73it/s, v_num=0, val_loss=0.599, val[A
Epoch 4: 100%|█| 1094/1094 [00:08<00:00, 123.60it/s, v_num=0, val_loss=0.581, va[A
Epoch 5: 100%|█| 1094/1094 [00:07<00:00, 146.62it/s, v_num=0, val_loss=0.567, va[A
Epoch 6: 100%|█| 1094/1094 [00:07<00:00, 144.17it/s, v_num=0, val_loss=0.555, va[A
Epoch 7: 100%|█| 1094/1094 [00:07<00:00, 138.24it/s, v_num=0, val_loss=0.544, va[A
Epoch 8: 100%|█| 1094/1094 [00:08<00:00, 128.27it/s, v_num=0, val_loss=0.535, va[A
Epoch 9: 100%|█| 1094/1094 [00:08<00:00, 128.99it/s, v_num=0, val_loss=0.527, va[A
Epoch 10: 100%|█| 1094/1094 [00:08<00:00, 136.30it/s, v_num=0, val_loss=0.519, v[A
Epoch 11: 100%|█| 1094/1094 [00:08<00:00, 129.11it/s, v_num=0, val_loss=0.513, 

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|█| 1094/1094 [00:08<00:00, 122.00it/s, v_num=0, val_loss=0.474, v


### Test set evaluation

In [73]:
trainer.test(model=Logreg_L_Model, dataloaders=test_dataloader, ckpt_path="best")

Restoring states from the checkpoint path at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v2/epoch=18-val_acc=0.81.ckpt
Loaded model weights from the checkpoint at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v2/epoch=18-val_acc=0.81.ckpt
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing: |                                                | 0/? [00:00<?, ?it/s]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.8097000122070312
        test_loss           0.47405463457107544
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.47405463457107544, 'test_acc': 0.8097000122070312}]

In [74]:
trainer.test(model=Logreg_L_Model, dataloaders=test_dataloader, ckpt_path="last")

Restoring states from the checkpoint path at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v2/last.ckpt
Loaded model weights from the checkpoint at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v2/last.ckpt


Testing: |                                                | 0/? [00:00<?, ?it/s]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.8037999868392944
        test_loss           0.4721769392490387
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4721769392490387, 'test_acc': 0.8037999868392944}]