* Sentiment analysis
* Baseline: Logistic Regression
* IMDB dataset

In [1]:
# for reloading any changed module
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

In [3]:
import os

In [4]:
os.path.curdir

'.'

In [5]:
Path.cwd().parent.parent.parent

PosixPath('/media/skesava/D/Training/MachineLearning')

# Download IMDB dataset

In [6]:
from local_dataset_utilities import download_dataset

In [7]:
download_dataset(download_location=Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData"))

In [8]:
data_directory = Path.cwd().parent.parent.parent.joinpath("ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb")

In [9]:
if not data_directory.exists():
    raise FileNotFoundError()

# Process Data

In [10]:
for item in data_directory.glob(pattern='*'):
    print(item)

/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/train
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/README
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/test
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/imdb.vocab
/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/imdbEr.txt


In [11]:
train_dir = data_directory.joinpath('train')
test_dir = data_directory.joinpath('test')

### README information

* There is also a dataset of 50000 reviews for unsupervised learning

In [12]:
from local_dataset_utilities import load_dataset_into_to_dataframe

In [13]:
# from importlib import reload
# only works if the module is in sys.modules

In [14]:
np.random.seed(0)

In [15]:
df = load_dataset_into_to_dataframe(data_directory)

100%|█████████████████████████████████████████████| 50000/50000 [00:50<00:00, 991.12it/s]

Class distribution: [25000 25000]





In [16]:
df = df.reindex(np.random.permutation(df.index))

In [17]:
df.head(10)

Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
0,Actor turned director Bill Paxton follows up h...,1
0,As a recreational golfer with some knowledge o...,1
0,"I saw this film in a sneak preview, and it is ...",1
0,Bill Paxton has taken the true story of the 19...,1
0,"I saw this film on September 1st, 2005 in Indi...",1
0,"Maybe I'm reading into this too much, but I wo...",1
0,I felt this film did have many good qualities....,1
0,This movie is amazing because the fact that th...,1
0,"""Quitting"" may be as much about exiting a pre-...",1


In [19]:
df.tail(10)

Unnamed: 0,text,label
0,Yeti: Curse of the Snow Demon starts aboard a ...,0
0,"Hmmm, a sports team is in a plane crash, gets ...",0
0,"I saw this piece of garbage on AMC last night,...",0
0,Although the production and Jerry Jameson's di...,0
0,Capt. Gallagher (Lemmon) and flight attendant ...,0
0,"Towards the end of the movie, I felt it was to...",0
0,This is the kind of movie that my enemies cont...,0
0,I saw 'Descent' last night at the Stockholm Fi...,0
0,Some films that you pick up for a pound turn o...,0
0,"This is one of the dumbest films, I've ever se...",0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 0 to 0
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


# Examine data

In [20]:
np.bincount(df.label)

array([25000, 25000])

In [21]:
np.unique(df.label, return_counts=True)

(array([0, 1]), array([25000, 25000]))

#### Note
* Equal number of 0 and 1 labels

# Partition data and save

In [22]:
df = df.sample(frac=1)
df.head(10)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
0,"The Railway Children, at least this 1970 movie...",1
0,Anatomie was a German made Movie and subtitled...,0
0,This film makes Clooney. All his films combine...,1
0,This is easily the best cinematic version of W...,1
0,We taped this when it aired on TV back in 1995...,1
0,This movie got extremely silly when things sta...,0
0,Its No wonder this was free with the Mail on S...,0
0,"Seriously, I don´t really get why people here ...",1
0,A great suspense movie with terrific slow came...,1


In [23]:
df.reset_index(inplace=True)
df.head(10)

Unnamed: 0,index,text,label
0,0,Robert Duvall is a direct descendent of Confed...,1
1,0,"The Railway Children, at least this 1970 movie...",1
2,0,Anatomie was a German made Movie and subtitled...,0
3,0,This film makes Clooney. All his films combine...,1
4,0,This is easily the best cinematic version of W...,1
5,0,We taped this when it aired on TV back in 1995...,1
6,0,This movie got extremely silly when things sta...,0
7,0,Its No wonder this was free with the Mail on S...,0
8,0,"Seriously, I don´t really get why people here ...",1
9,0,A great suspense movie with terrific slow came...,1


In [25]:
df.drop(columns = ['index'], inplace=True)
df.head(5)

Unnamed: 0,text,label
0,Robert Duvall is a direct descendent of Confed...,1
1,"The Railway Children, at least this 1970 movie...",1
2,Anatomie was a German made Movie and subtitled...,0
3,This film makes Clooney. All his films combine...,1
4,This is easily the best cinematic version of W...,1


In [27]:
df.tail(5)

Unnamed: 0,text,label
49995,"This film, for what it was set out to be, succ...",1
49996,I love this movie so much. It always makes me ...,1
49997,I was on a mission to watch Uwe Boll movies to...,0
49998,I haven't laughed so much in a theater in year...,0
49999,The story for Hare Rama Hare Krishna actually ...,1


In [28]:
data_directory

PosixPath('/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb')

In [29]:
processed_data_directory = data_directory.joinpath("processed_data")

In [30]:
processed_data_directory.exists()

False

In [31]:
if not processed_data_directory.exists():
    Path.mkdir(processed_data_directory)

In [32]:
df.iloc[:35000].to_csv(processed_data_directory.joinpath("train.csv"), index=False, encoding="utf-8")
df.iloc[35000:40000].to_csv(processed_data_directory.joinpath("val.csv"), index=False, encoding="utf-8")
df.iloc[40000:].to_csv(processed_data_directory.joinpath("test.csv"), index=False, encoding="utf-8")

# torch Dataset and Dataloader

In [33]:
from local_dataset_utilities import IMDBDataset

# TODO: create datasets

In [34]:
train_dataset = IMDBDataset(processed_data_directory.joinpath("train.csv"))

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [38]:
import torch

In [39]:
torch.tensor(df['text'].to_numpy())

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.