* Sentiment analysis
* Baseline: Logistic Regression
* IMDB dataset

# Import libraries

In [1]:
# for reloading any changed module
%load_ext autoreload
%autoreload 2

In [73]:
import numpy as np
import pandas as pd
import torch

In [3]:
from pathlib import Path
import sys
import os

In [65]:
from torch.utils.data import DataLoader

In [4]:
os.path.curdir

'.'

In [5]:
Path.cwd().parent.parent.parent

PosixPath('/media/skesava/D/Training/MachineLearning')

# Download IMDB dataset

In [6]:
from local_dataset_utilities import download_dataset

In [7]:
download_dataset(download_location=Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData"))

In [8]:
data_directory = Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb")

In [9]:
if not data_directory.exists():
    raise FileNotFoundError()

# Process Data

In [10]:
for item in data_directory.glob(pattern='*'):
    print(item)

/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/processed_data
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/train
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/README
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/test
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/imdb.vocab
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/imdbEr.txt


In [11]:
train_dir = data_directory.joinpath('train')
test_dir = data_directory.joinpath('test')

### README information

* There is also a dataset of 50000 reviews for unsupervised learning

In [12]:
from local_dataset_utilities import load_dataset_into_to_dataframe

In [13]:
# from importlib import reload
# only works if the module is in sys.modules

In [14]:
np.random.seed(0)

In [15]:
df = load_dataset_into_to_dataframe(data_directory)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:49<00:00, 1009.71it/s]


In [16]:
df = df.reindex(np.random.permutation(df.index))

In [17]:
df.head(10)

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
0,Actor turned director Bill Paxton follows up h...,1
0,As a recreational golfer with some knowledge o...,1
0,"I saw this film in a sneak preview, and it is ...",1
0,Bill Paxton has taken the true story of the 19...,1
0,"I saw this film on September 1st, 2005 in Indi...",1
0,"Maybe I'm reading into this too much, but I wo...",1
0,I felt this film did have many good qualities....,1
0,This movie is amazing because the fact that th...,1
0,"""Quitting"" may be as much about exiting a pre-...",1


In [18]:
df.tail(10)

Unnamed: 0,text,label
0,Yeti: Curse of the Snow Demon starts aboard a ...,0
0,"Hmmm, a sports team is in a plane crash, gets ...",0
0,"I saw this piece of garbage on AMC last night,...",0
0,Although the production and Jerry Jameson's di...,0
0,Capt. Gallagher (Lemmon) and flight attendant ...,0
0,"Towards the end of the movie, I felt it was to...",0
0,This is the kind of movie that my enemies cont...,0
0,I saw 'Descent' last night at the Stockholm Fi...,0
0,Some films that you pick up for a pound turn o...,0
0,"This is one of the dumbest films, I've ever se...",0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


# Examine data

In [20]:
np.bincount(df.label)

array([25000, 25000])

In [21]:
np.unique(df.label, return_counts=True)

(array([0, 1]), array([25000, 25000]))

#### NOTE

* Equal number of 0 and 1 labels

# Partition data and save

In [22]:
df = df.sample(frac=1)
df.head(10)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
0,"The Railway Children, at least this 1970 movie...",1
0,Anatomie was a German made Movie and subtitled...,0
0,This film makes Clooney. All his films combine...,1
0,This is easily the best cinematic version of W...,1
0,We taped this when it aired on TV back in 1995...,1
0,This movie got extremely silly when things sta...,0
0,Its No wonder this was free with the Mail on S...,0
0,"Seriously, I don´t really get why people here ...",1
0,A great suspense movie with terrific slow came...,1


In [23]:
df.reset_index(inplace=True)
df.head(10)

Unnamed: 0,index,text,label
0,0,Robert Duvall is a direct descendent of Confed...,1
1,0,"The Railway Children, at least this 1970 movie...",1
2,0,Anatomie was a German made Movie and subtitled...,0
3,0,This film makes Clooney. All his films combine...,1
4,0,This is easily the best cinematic version of W...,1
5,0,We taped this when it aired on TV back in 1995...,1
6,0,This movie got extremely silly when things sta...,0
7,0,Its No wonder this was free with the Mail on S...,0
8,0,"Seriously, I don´t really get why people here ...",1
9,0,A great suspense movie with terrific slow came...,1


In [24]:
df.drop(columns = ['index'], inplace=True)
df.head(5)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
1,"The Railway Children, at least this 1970 movie...",1
2,Anatomie was a German made Movie and subtitled...,0
3,This film makes Clooney. All his films combine...,1
4,This is easily the best cinematic version of W...,1


In [25]:
df.tail(5)

Unnamed: 0,text,label
49995,"This film, for what it was set out to be, succ...",1
49996,I love this movie so much. It always makes me ...,1
49997,I was on a mission to watch Uwe Boll movies to...,0
49998,I haven't laughed so much in a theater in year...,0
49999,The story for Hare Rama Hare Krishna actually ...,1


In [26]:
data_directory

PosixPath('/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb')

In [27]:
processed_data_directory = data_directory.joinpath("processed_data")

In [28]:
processed_data_directory.exists()

True

In [29]:
if not processed_data_directory.exists():
    Path.mkdir(processed_data_directory)
    df.iloc[:35000].to_csv(processed_data_directory.joinpath("train.csv"), index=False, encoding="utf-8")
    df.iloc[35000:40000].to_csv(processed_data_directory.joinpath("val.csv"), index=False, encoding="utf-8")
    df.iloc[40000:].to_csv(processed_data_directory.joinpath("test.csv"), index=False, encoding="utf-8")

In [30]:
sys.getsizeof(df)/(1024 * 1024) # MB

65.73923110961914

In [31]:
del df

In [32]:
df_train = pd.read_csv(processed_data_directory.joinpath("train.csv"))
df_train.head(5)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
1,"The Railway Children, at least this 1970 movie...",1
2,Anatomie was a German made Movie and subtitled...,0
3,This film makes Clooney. All his films combine...,1
4,This is easily the best cinematic version of W...,1


In [33]:
df_train.tail(5)

Unnamed: 0,text,label
34995,This movie is well done. It really attempts to...,1
34996,Ever wonder where the ideas for romance novels...,1
34997,"How truly friendly, charming and cordial is th...",1
34998,Very good 1939 film where John Garfield plays ...,1
34999,I couldn't disagree more with those who says t...,1


In [34]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    35000 non-null  object
 1   label   35000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 547.0+ KB


In [35]:
df_test = pd.read_csv(processed_data_directory.joinpath("test.csv"))
df_test.head(5)

Unnamed: 0,text,label
0,"I have to say, when ""Pushing Daisies"" came out...",1
1,"Bah. Another tired, desultory reworking of an ...",0
2,It's a shame that Asterix and his buddy Obelix...,1
3,this is the worst movie ive ever seen. And i h...,0
4,The comments for Commune make it sound like a ...,0


In [36]:
df_val = pd.read_csv(processed_data_directory.joinpath("val.csv"))
df_val.head(5)

Unnamed: 0,text,label
0,This is one of the funniest movies I've ever s...,1
1,One of my favorite westerns and one of John Fo...,1
2,I felt I had to add a comment after seeing the...,0
3,Not to be mistaken as the highly touted Samuel...,0
4,I watched the entire movie recognizing the par...,0


# Bag of Words model

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
max_input_features = 10000

In [38]:
cv = CountVectorizer(encoding='utf-8', 
                     lowercase=True, 
                     stop_words='english',
                     max_features=max_input_features
                    )

### FIT

In [39]:
cv.fit(df_train['text'])

In [40]:
cv.vocabulary_

{'robert': 7581,
 'duvall': 2882,
 'direct': 2619,
 'general': 3809,
 'lee': 5175,
 'according': 218,
 'imdb': 4505,
 'com': 1816,
 'movie': 5913,
 'seeing': 7891,
 'film': 3487,
 'think': 9028,
 'appearance': 536,
 'best': 943,
 'favorite': 3408,
 'films': 3495,
 'wish': 9858,
 'composer': 1902,
 'peter': 6566,
 'cd': 1468,
 'soundtrack': 8357,
 'available': 732,
 'wonderful': 9886,
 'scenery': 7798,
 'music': 5946,
 'true': 9288,
 'life': 5239,
 'especially': 3142,
 'live': 5307,
 'moved': 5909,
 'south': 8363,
 'real': 7203,
 'moment': 5833,
 'time': 9093,
 'moves': 5912,
 'slowly': 8255,
 'strangers': 8612,
 'remain': 7351,
 'railway': 7115,
 'children': 1600,
 '1970': 63,
 'version': 9553,
 'written': 9945,
 'directed': 2620,
 'long': 5350,
 'british': 1196,
 'character': 1530,
 'actor': 244,
 'lionel': 5286,
 'classic': 1694,
 'tells': 8958,
 'childhood': 1598,
 'story': 8601,
 'great': 3992,
 'simplicity': 8153,
 'charm': 1548,
 'sentimentality': 7939,
 'muted': 5959,
 'adventur

In [41]:
cv.vocabulary_.items()



In [42]:
sorted(cv.vocabulary_.items(), key=lambda x: x[1], reverse=True)

[('zucco', 9999),
 ('zorro', 9998),
 ('zoom', 9997),
 ('zoo', 9996),
 ('zone', 9995),
 ('zombies', 9994),
 ('zombie', 9993),
 ('zombi', 9992),
 ('zoey', 9991),
 ('zizek', 9990),
 ('zhang', 9989),
 ('zeta', 9988),
 ('zero', 9987),
 ('zellweger', 9986),
 ('zealand', 9985),
 ('zany', 9984),
 ('zane', 9983),
 ('zach', 9982),
 ('yul', 9981),
 ('youtube', 9980),
 ('youthful', 9979),
 ('youth', 9978),
 ('youngsters', 9977),
 ('youngest', 9976),
 ('younger', 9975),
 ('young', 9974),
 ('york', 9973),
 ('yokai', 9972),
 ('yesterday', 9971),
 ('yes', 9970),
 ('yep', 9969),
 ('yells', 9968),
 ('yellow', 9967),
 ('yelling', 9966),
 ('yell', 9965),
 ('years', 9964),
 ('yearning', 9963),
 ('year', 9962),
 ('yeah', 9961),
 ('yawn', 9960),
 ('yarn', 9959),
 ('yards', 9958),
 ('yard', 9957),
 ('ya', 9956),
 ('wynorski', 9955),
 ('www', 9954),
 ('wwii', 9953),
 ('wwf', 9952),
 ('wwe', 9951),
 ('ww2', 9950),
 ('wtf', 9949),
 ('wrote', 9948),
 ('wrongly', 9947),
 ('wrong', 9946),
 ('written', 9945),
 ('wri

#### NOTE

* The value of a key is the word's position in the list of selected words within the `max_threshold` by CountVectorizer, and not the number of times it occurs.

### Transform

In [43]:
X_train = cv.transform(df_train['text'])

In [44]:
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 38 stored elements and shape (1, 10000)>

In [45]:
X_train[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [46]:
X_train[0].todense()[0]

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [47]:
np.unique(X_train[0].todense(), return_counts=True)

(matrix([[0, 0, 0, ..., 2, 2, 2]]), array([0, 0, 0, ..., 1, 1, 1]))

In [48]:
np.nonzero(X_train[0].todense())

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 218,  536,  732,  943, 1468, 1816, 1902, 2619, 2882, 3142, 3408,
        3487, 3495, 3809, 4505, 5175, 5239, 5307, 5833, 5909, 5912, 5913,
        5946, 6566, 7203, 7351, 7581, 7798, 7891, 8255, 8357, 8363, 8612,
        9028, 9093, 9288, 9858, 9886]))

In [49]:
# number of words
np.nonzero(X_train[0].todense())[0].size

38

In [50]:
cv.inverse_transform(X_train[0])

[array(['according', 'appearance', 'available', 'best', 'cd', 'com',
        'composer', 'direct', 'duvall', 'especially', 'favorite', 'film',
        'films', 'general', 'imdb', 'lee', 'life', 'live', 'moment',
        'moved', 'moves', 'movie', 'music', 'peter', 'real', 'remain',
        'robert', 'scenery', 'seeing', 'slowly', 'soundtrack', 'south',
        'strangers', 'think', 'time', 'true', 'wish', 'wonderful'],
       dtype='<U17')]

In [51]:
# Test
X_test = cv.transform(df_test['text'])
X_test[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [52]:
# ValidationY
X_val = cv.transform(df_val['text'])
X_val[0].todense()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [53]:
type(X_train)

scipy.sparse._csr.csr_matrix

# torch Dataset and Dataloader

In [60]:
from local_dataset_utilities import IMDBDataset

#### Train dataset

In [61]:
train_dataset = IMDBDataset(X_train.todense(), df_train['label'].to_numpy())

#### Test dataset

In [62]:
test_dataset = IMDBDataset(X_test.todense(), df_test['label'].to_numpy())

#### Validation dataset

In [63]:
val_dataset = IMDBDataset(X_val.todense(), df_val['label'].to_numpy())

### Dataloaders

#### Batch size

In [64]:
batch_size = 32

In [66]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

##### Testing

In [70]:
for test_features, test_labels in train_dataloader:
    break

In [68]:
test_features.shape

torch.Size([32, 10000])

In [69]:
test_labels.shape

torch.Size([32])

In [71]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [72]:
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model definition

## Pytorch model

In [74]:
from pytorch_model import PytorchLogReg

In [77]:
logreg_model = PytorchLogReg(num_features=max_input_features, num_classes=2)

## Lightning model