* Sentiment analysis
* Baseline: Logistic Regression
* IMDB dataset

* From v0.
* The main change is using ngram of 4 instead of 1

# Import libraries

In [1]:
# for reloading any changed module
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import torch

In [3]:
from pathlib import Path
import sys
import os

In [4]:
from torch.utils.data import DataLoader

In [5]:
Path.cwd().parent.parent.parent

PosixPath('/media/skesava/D/Training/MachineLearning')

In [6]:
data_directory = Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb")

In [7]:
if not data_directory.exists():
    raise FileNotFoundError()

In [9]:
processed_data_directory = data_directory.joinpath("processed_data")

In [10]:
processed_data_directory.exists()

True

In [11]:
df_train = pd.read_csv(processed_data_directory.joinpath("train.csv"))
df_train.head(5)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
1,"The Railway Children, at least this 1970 movie...",1
2,Anatomie was a German made Movie and subtitled...,0
3,This film makes Clooney. All his films combine...,1
4,This is easily the best cinematic version of W...,1


In [12]:
df_train.tail(5)

Unnamed: 0,text,label
34995,This movie is well done. It really attempts to...,1
34996,Ever wonder where the ideas for romance novels...,1
34997,"How truly friendly, charming and cordial is th...",1
34998,Very good 1939 film where John Garfield plays ...,1
34999,I couldn't disagree more with those who says t...,1


In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    35000 non-null  object
 1   label   35000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 547.0+ KB


In [14]:
df_test = pd.read_csv(processed_data_directory.joinpath("test.csv"))
df_test.head(5)

Unnamed: 0,text,label
0,"I have to say, when ""Pushing Daisies"" came out...",1
1,"Bah. Another tired, desultory reworking of an ...",0
2,It's a shame that Asterix and his buddy Obelix...,1
3,this is the worst movie ive ever seen. And i h...,0
4,The comments for Commune make it sound like a ...,0


In [15]:
df_val = pd.read_csv(processed_data_directory.joinpath("val.csv"))
df_val.head(5)

Unnamed: 0,text,label
0,This is one of the funniest movies I've ever s...,1
1,One of my favorite westerns and one of John Fo...,1
2,I felt I had to add a comment after seeing the...,0
3,Not to be mistaken as the highly touted Samuel...,0
4,I watched the entire movie recognizing the par...,0


# Bag of Words model

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
max_input_features = 10000

In [18]:
cv = CountVectorizer(encoding='utf-8', 
                     lowercase=True, 
                     stop_words='english',
                     max_features=max_input_features,
                     ngram_range=(4,4)                     
                    )

### FIT

In [19]:
cv.fit(df_train['text'])

In [20]:
cv.vocabulary_

{'important story br br': 5923,
 'happy ending br br': 5663,
 've seen ve seen': 9464,
 'br br far know': 1411,
 'br br actors great': 912,
 'entire movie br br': 4741,
 'br br given film': 1669,
 'br br downside movie': 1294,
 'br br opening scene': 2470,
 'different br br movie': 4346,
 'br br movie does': 2277,
 'just don make movies': 6108,
 'don make movies like': 4490,
 'make movies like anymore': 6784,
 'br br fact actually': 1371,
 'love smell napalm morning': 6675,
 'br br lot people': 2133,
 'br br movie based': 2249,
 'possible way br br': 7854,
 'br br true classic': 3211,
 'br br rating 10': 2676,
 'br br ll rooting': 2108,
 'br br great film': 1741,
 'br br film funny': 1503,
 'watching old black white': 9690,
 'br br bad things': 992,
 'br br ve seen': 3255,
 've seen half dozen': 9435,
 'seen half dozen times': 8507,
 'half dozen times time': 5640,
 'br br does film': 1244,
 'br br movie funny': 2297,
 'film extremely low budget': 5085,
 'br br probably best': 2634,
 'n

In [21]:
cv.vocabulary_.items()



In [22]:
sorted(cv.vocabulary_.items(), key=lambda x: x[1], reverse=True)

[('zu warriors magic mountain', 9999),
 ('zombie movie br br', 9998),
 ('zero day br br', 9997),
 ('zero 10 br br', 9996),
 ('young woman just met', 9995),
 ('young woman br br', 9994),
 ('young randolph scott dean', 9993),
 ('young man br br', 9992),
 ('young adults br br', 9991),
 ('young actor br br', 9990),
 ('yes br br movie', 9989),
 ('yeh dil na hota', 9988),
 ('years right understand initial', 9987),
 ('years passed br br', 9986),
 ('years old time saw', 9985),
 ('years old br br', 9984),
 ('years later saw movie', 9983),
 ('years later br br', 9982),
 ('years earlier br br', 9981),
 ('years come br br', 9980),
 ('years br br seen', 9979),
 ('years br br real', 9978),
 ('years br br movie', 9977),
 ('years br br film', 9976),
 ('years ago ve watched', 9975),
 ('years ago saw movie', 9974),
 ('years ago saw film', 9973),
 ('years ago haven seen', 9972),
 ('years ago br br', 9971),
 ('years 100 stars men', 9970),
 ('years 100 laughs good', 9969),
 ('year olds br br', 9968),
 ('ye

#### NOTE

* The value of a key is the word's position in the list of selected words within the `max_threshold` by CountVectorizer, and not the number of times it occurs.

### Transform

In [23]:
X_train = cv.transform(df_train['text'])

In [24]:
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 0 stored elements and shape (1, 10000)>

In [25]:
X_train[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [26]:
X_train[0].todense()[0]

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [27]:
np.unique(X_train[0].todense(), return_counts=True)

(matrix([[0, 0, 0, ..., 0, 0, 0]]), array([0, 0, 0, ..., 1, 1, 1]))

In [28]:
np.nonzero(X_train[0].todense())
# first array: row index; # second array: column index

(array([], dtype=int64), array([], dtype=int64))

In [29]:
# number of words
np.nonzero(X_train[0].todense())[0].size

0

In [30]:
cv.inverse_transform(X_train[0])

[array([], dtype='<U291')]

In [31]:
# Test
X_test = cv.transform(df_test['text'])
X_test[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [32]:
# ValidationY
X_val = cv.transform(df_val['text'])
X_val[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [33]:
type(X_train)

scipy.sparse._csr.csr_matrix

# torch Dataset and Dataloader

In [34]:
from local_dataset_utilities import IMDBDataset

#### Train dataset

In [35]:
train_dataset = IMDBDataset(X_train.todense(), df_train['label'].to_numpy())

#### Test dataset

In [36]:
test_dataset = IMDBDataset(X_test.todense(), df_test['label'].to_numpy())

#### Validation dataset

In [37]:
val_dataset = IMDBDataset(X_val.todense(), df_val['label'].to_numpy())

### Dataloaders

#### Batch size

In [38]:
batch_size = 32

In [39]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

##### Testing

In [40]:
for test_features, test_labels in train_dataloader:
    break

In [41]:
test_features.shape

torch.Size([32, 10000])

In [42]:
test_labels.shape

torch.Size([32])

In [43]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model definitions

## Pytorch model

In [45]:
from pytorch_model import PytorchLogReg

In [46]:
logreg_model = PytorchLogReg(num_features=max_input_features, num_classes=2)

## Lightning model

In [47]:
from pytorch_model import LightningModel

In [48]:
learning_rate = 0.01

In [49]:
Logreg_L_Model = LightningModel(pytorch_model=logreg_model, learning_rate=learning_rate)

## Callbacks

### Model checkpoint

In [50]:
from lightning.pytorch.callbacks import ModelCheckpoint

In [51]:
Path.cwd()

PosixPath('/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis')

In [52]:
modelcheckpoints_dir = Path.cwd().joinpath("model_checkpoints")

In [53]:
if not modelcheckpoints_dir.exists():
    modelcheckpoints_dir.mkdir()

#### Set model checkpoint folder

In [54]:
import ipynbname

In [55]:
ipynbname.name()

'SA_LogReg_v1'

In [56]:
model_checkpoint_folder = modelcheckpoints_dir.joinpath(f"{ipynbname.name()}")
model_checkpoint_folder

PosixPath('/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v1')

In [57]:
if not model_checkpoint_folder.exists():
    model_checkpoint_folder.mkdir()

In [58]:
model_checkpoint = ModelCheckpoint(dirpath=model_checkpoint_folder,
                                  filename="{epoch}-{val_acc:.2f}",
                                  save_top_k=1,
                                  save_last=True,
                                  monitor="val_acc",
                                  mode="max"
                                  )

In [59]:
callbacks = [model_checkpoint]

### ProgressBar

In [60]:
from pytorch_model import MyProgressBar

In [61]:
callbacks.append(MyProgressBar())

In [62]:
callbacks

[<lightning.pytorch.callbacks.model_checkpoint.ModelCheckpoint at 0x7f5582ca5950>,
 <pytorch_model.MyProgressBar at 0x7f5582ca7790>]

# Model Training

## Logger

In [63]:
from lightning.pytorch.loggers import CSVLogger

In [64]:
logs_dir = Path.cwd().joinpath("logs")

In [65]:
if not logs_dir.exists():
    logs_dir.mkdir()

In [66]:
logging_folder = logs_dir.joinpath(ipynbname.name())

In [67]:
if not logging_folder.exists():
    logging_folder.mkdir()

## Lightning Trainer

In [68]:
from lightning import Trainer

In [69]:
max_epochs = 20

### Quick bug test

In [70]:
trainer = Trainer(logger=CSVLogger(save_dir=logging_folder, name="bow_model_v0"),
                 callbacks=callbacks,
                 accelerator="cpu",
                  devices="auto",
                 deterministic=True,
                 max_epochs=max_epochs,
                 fast_dev_run=True # testing for any bugs
                 )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [71]:
trainer.fit(model=Logreg_L_Model, 
            train_dataloaders=train_dataloader, 
            val_dataloaders=val_dataloader
           )


  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | PytorchLogReg      | 20.0 K | train
1 | train_acc | MulticlassAccuracy | 0      | train
2 | val_acc   | MulticlassAccuracy | 0      | train
3 | test_acc  | MulticlassAccuracy | 0      | train
---------------------------------------------------------
20.0 K    Trainable params
0         Non-trainable params
20.0 K    Total params
0.080     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.ven

Epoch 0: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 41.74it/s]
Epoch 0: 100%|█| 1/1 [00:00<00:00, 27.89it/s, val_loss=0.693, val_acc=0.469, tra[A

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|█| 1/1 [00:00<00:00, 26.60it/s, val_loss=0.693, val_acc=0.469, tra


## Training

In [72]:
trainer = Trainer(logger=CSVLogger(save_dir=logging_folder, name="bow_model_v0"),
                 callbacks=callbacks,
                 accelerator="cpu",
                  devices="auto",
                 deterministic=True,
                 max_epochs=max_epochs,
                 fast_dev_run=False
                 )

Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [73]:
trainer.fit(model=Logreg_L_Model, 
            train_dataloaders=train_dataloader, 
            val_dataloaders=val_dataloader
           )


  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | PytorchLogReg      | 20.0 K | train
1 | train_acc | MulticlassAccuracy | 0      | train
2 | val_acc   | MulticlassAccuracy | 0      | train
3 | test_acc  | MulticlassAccuracy | 0      | train
---------------------------------------------------------
20.0 K    Trainable params
0         Non-trainable params
20.0 K    Total params
0.080     Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|████████████████████| 1094/1094 [00:06<00:00, 163.57it/s, v_num=0]
Epoch 1: 100%|█| 1094/1094 [00:07<00:00, 151.08it/s, v_num=0, val_loss=0.693, va[A
Epoch 2: 100%|█| 1094/1094 [00:07<00:00, 151.57it/s, v_num=0, val_loss=0.692, va[A
Epoch 3: 100%|█| 1094/1094 [00:07<00:00, 145.69it/s, v_num=0, val_loss=0.692, va[A
Epoch 4: 100%|█| 1094/1094 [00:07<00:00, 146.43it/s, v_num=0, val_loss=0.691, va[A
Epoch 5: 100%|█| 1094/1094 [00:07<00:00, 139.83it/s, v_num=0, val_loss=0.691, va[A
Epoch 6: 100%|█| 1094/1094 [00:07<00:00, 145.42it/s, v_num=0, val_loss=0.691, va[A
Epoch 7: 100%|█| 1094/1094 [00:07<00:00, 145.10it/s, v_num=0, val_loss=0.690, va[A
Epoch 8: 100%|█| 1094/1094 [00:07<00:00, 146.56it/s, v_num=0, val_loss=0.690, va[A
Epoch 9: 100%|█| 1094/1094 [00:07<00:00, 138.27it/s, v_num=0, val_loss=0.689, va[A
Epoch 10: 100%|█| 1094/1094 [00:07<00:00, 139.00it/s, v_num=0, val_loss=0.689, v[A
Epoch 11: 100%|█| 1094/1094 [00:07<00:00, 145.56it/s, v_num=0, val_loss=0.689, 

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|█| 1094/1094 [00:07<00:00, 138.62it/s, v_num=0, val_loss=0.686, v


### Test set evaluation

In [74]:
trainer.test(model=Logreg_L_Model, dataloaders=test_dataloader, ckpt_path="best")

Restoring states from the checkpoint path at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v1/epoch=17-val_acc=0.59.ckpt
Loaded model weights from the checkpoint at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v1/epoch=17-val_acc=0.59.ckpt
/media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing: |                                                | 0/? [00:00<?, ?it/s]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.5907999873161316
        test_loss           0.6859671473503113
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.6859671473503113, 'test_acc': 0.5907999873161316}]

In [75]:
trainer.test(model=Logreg_L_Model, dataloaders=test_dataloader, ckpt_path="last")

Restoring states from the checkpoint path at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v1/last.ckpt
Loaded model weights from the checkpoint at /media/skesava/D/Training/MachineLearning/Pytorch/NLP_using_LLMs/SentimentAnalysis/model_checkpoints/SA_LogReg_v1/last.ckpt


Testing: |                                                | 0/? [00:00<?, ?it/s]────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.5669999718666077
        test_loss           0.6852495074272156
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.6852495074272156, 'test_acc': 0.5669999718666077}]