# Tutorial - Sentiment Analysis - imdb

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie-review dataset from the working directory.
# The preview in the next cell shows a free-text `text` column and a 0/1 `label` column.
imdb = pd.read_csv('movie.csv')

In [3]:
imdb.head(5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Change the target variable to ordinal

This is a binary classification problem. There are two categories we will predict:<br>
Whether a movie review has label 0 or label 1 (by the usual IMDb convention, 0 = negative sentiment and 1 = positive sentiment)

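#### Keras doesn't accept text-based target values, so text labels have to be changed to "ordinal" values. Here, this is only needed to convert each category to a numeric value; because `label` is already 0/1, the encoder simply stores the same values as floats (0.0/1.0).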

In [4]:
# Map each distinct label to a numeric code so that no text-valued targets remain.
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

# Learn the label categories first, then encode them.
# The double brackets keep `label` as a one-column DataFrame, which the encoder requires.
enc.fit(imdb[['label']])
imdb['target'] = enc.transform(imdb[['label']])



In [5]:
# Confirm the new `target` column mirrors `label` (same values, stored as floats).
imdb.head()

Unnamed: 0,text,label,target
0,I grew up (b. 1965) watching and loving the Th...,0,0.0
1,"When I put this movie in my DVD player, and sa...",0,0.0
2,Why do people who do not know what a particula...,0,0.0
3,Even though I have great interest in Biblical ...,0,0.0
4,Im a die hard Dads Army fan and nothing will e...,1,1.0


In [6]:
# Keep the encoded labels as a standalone Series for the train/test split.
target = imdb['target']

## Assign the "text" (input) variable

In [7]:
# Check for missing values: count the NaN entries in the `text` column.
# A count of 0 means no imputation is needed before tokenizing.

imdb[['text']].isna().sum()

text    0
dtype: int64

In [8]:
# If there were missing values, replace them with a placeholder string.
# Assign the result back instead of calling fillna(..., inplace=True) on a
# selected column, which recent pandas versions warn about:
# imdb['text'] = imdb['text'].fillna('missing')

In [9]:
# The raw review text is the only input feature used in this tutorial.
input_data = imdb['text']

## Split the data

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; the fixed seed makes the split repeatable.
train_set, test_set, train_y, test_y = train_test_split(
    input_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Sanity check: training texts and training labels must have the same length.
train_set.shape, train_y.shape

((28000,), (28000,))

In [12]:
# Sanity check: test texts and test labels must have the same length.
test_set.shape, test_y.shape

((12000,), (12000,))

## Keras: Tokenizer

In order to use Keras, you first need to install tensorflow. You can start the Anaconda Prompt and enter the following to do so: `pip install tensorflow`

The Keras Tokenizer works a little differently from scikit-learn's vectorizers (but the idea is the same)

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Lower-case the text, strip the listed punctuation/whitespace characters, and
# cap the vocabulary used for the matrix at num_words=2000.
keras_tokenizer = Tokenizer(num_words=2000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
# Only the 2,000 most frequent "words" become columns
# (NOTE: index 0 appears to be reserved -- column 0 is all zeros in the matrix shown later).

# Learn the vocabulary from the training texts only, so nothing leaks from the test set.
keras_tokenizer.fit_on_texts(train_set)  # similar to fit(): builds the vocabulary but does not transform anything

2023-10-22 15:31:19.657747: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-22 15:31:19.683994: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# After identifying the terms to keep, build the term-by-document matrix for
# the training texts. mode='tfidf' fills it with TF-IDF weights, so despite the
# variable name the values are NOT binary 0/1 indicators.

train_binary_matrix = keras_tokenizer.texts_to_matrix(train_set, mode='tfidf')
train_binary_matrix.shape

(28000, 2000)

In [15]:
# Apply the same transformation to the test set, reusing the tokenizer that was
# fitted on the training texts (it is not refitted here).

test_binary_matrix = keras_tokenizer.texts_to_matrix(test_set, mode='tfidf')
test_binary_matrix.shape

(12000, 2000)

In [16]:
# Peek at the training matrix; NumPy abbreviates the display of large arrays.
train_binary_matrix

array([[0.        , 1.94633466, 0.7100851 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.18041367, 1.85292298, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.22901331, 1.85292298, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 2.88314801, 1.85292298, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.17585807, 2.4127948 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.46309232, 1.49019332, ..., 0.        , 0.        ,
        0.        ]])

In [17]:
# Tokenizer's attributes:
#
# word_index maps every token seen by fit_on_texts to its frequency rank
# (1 = most frequent). It covers the whole training vocabulary: the
# num_words=2000 cap is applied only when texts are converted (e.g. by
# texts_to_matrix), not to this dictionary. Printing all of it would flood the
# notebook, so show its size and only the 20 top-ranked entries.

print('Vocabulary size: {}'.format(len(keras_tokenizer.word_index)))
print(list(keras_tokenizer.word_index.items())[:20])




## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier 

from sklearn.metrics import accuracy_score

In [19]:
# Random forest of 300 trees, trained on all CPU cores (n_jobs=-1).
# random_state pins the bootstrap sampling and feature selection so the
# accuracy figures below are reproducible from run to run; 42 matches the seed
# used for the train/test split.
rnd_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)

rnd_clf.fit(train_binary_matrix, train_y)



## Accuracy

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
# Random forest: accuracy on the data it was trained on.
# This mostly measures how well the model memorised the training set, so compare
# it with the test accuracy rather than reading it on its own.
train_y_pred = rnd_clf.predict(train_binary_matrix)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print('Train acc: {}'.format(train_acc))

Train acc: 1.0


In [22]:
# Random forest: accuracy on the held-out test reviews, which the model never
# saw during training.
test_y_pred = rnd_clf.predict(test_binary_matrix)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print('Test acc: {}'.format(test_acc))

Test acc: 0.849


## Confusion Matrix

In [23]:
from sklearn.metrics import confusion_matrix

# Usually created on the test set.
# In scikit-learn's layout, rows are the true labels and columns the predicted
# labels, so the off-diagonal cells count the misclassified reviews.
confusion_matrix(test_y, test_y_pred)

array([[5022,  942],
       [ 870, 5166]])

## Stochastic Gradient Descent Classifier

In [24]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent: at most 150
# epochs, stopping early once the loss improves by less than tol.
# SGD shuffles the training data each epoch, so random_state is pinned to keep
# the reported accuracies reproducible; 42 matches the train/test split seed.
sgd_clf = SGDClassifier(max_iter=150, tol=1e-3, random_state=42)


In [25]:
# Train the SGD classifier on the same TF-IDF training matrix used for the random forest.
sgd_clf.fit(train_binary_matrix, train_y)

## Accuracy

In [26]:
# SGD classifier: accuracy on the training data.
# NOTE: this reuses the names train_y_pred / train_acc from the random-forest
# cells, so the earlier values are overwritten from here on.
train_y_pred = sgd_clf.predict(train_binary_matrix)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print('Train acc: {}'.format(train_acc))

Train acc: 0.8956785714285714


In [27]:
# SGD classifier: accuracy on the held-out test reviews.
# NOTE: this reuses the names test_y_pred / test_acc from the random-forest
# cells, so the earlier values are overwritten from here on.
test_y_pred = sgd_clf.predict(test_binary_matrix)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print('Test acc: {}'.format(test_acc))

Test acc: 0.8611666666666666


## Confusion Matrix

In [28]:
from sklearn.metrics import confusion_matrix

# Usually created on the test set.
# test_y_pred now holds the SGD classifier's predictions (it was reassigned in
# the preceding accuracy cell), so this matrix describes the SGD model.
confusion_matrix(test_y, test_y_pred)

array([[4902, 1062],
       [ 604, 5432]])