In [None]:
# Extracting the data from tar.gz file
import tarfile

# Open the archive in a context manager so its file handle is closed as soon
# as extraction finishes, even if extractall() raises part-way through.
# NOTE(review): depending on the Python version, extractall() may not filter
# unsafe member paths; only run this on an archive from a trusted source.
with tarfile.open("aclImdb_v1.tar.gz") as tf:
    tf.extractall()

In [1]:
# Importing the required libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense

In [2]:
# Collecting/organizing the data


def _read_reviews(directory):
    """Return the text of every file in ``directory`` as a list of strings.

    File names are visited in sorted order so the resulting list is the same
    on every run (``os.listdir`` returns entries in arbitrary order).
    """
    reviews = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # ``with`` closes each handle immediately instead of leaving one open
        # file per review until the garbage collector gets to it.
        with open(path, "r", encoding='UTF8') as review_file:
            reviews.append(review_file.read())
    return reviews


# One list of raw review texts per (split, label) folder.
pos_reviews_train = _read_reviews("aclImdb/train/pos/")
pos_reviews_test = _read_reviews("aclImdb/test/pos/")
neg_reviews_train = _read_reviews("aclImdb/train/neg/")
neg_reviews_test = _read_reviews("aclImdb/test/neg/")


In [3]:
# Creating dfs for train and test data
# Each frame lists the positive reviews first (sentiment 1), followed by the
# negative reviews (sentiment 0).
train_df = pd.DataFrame({
    "text": pos_reviews_train + neg_reviews_train,
    "sentiment": [1] * len(pos_reviews_train) + [0] * len(neg_reviews_train),
})
test_df = pd.DataFrame({
    "text": pos_reviews_test + neg_reviews_test,
    "sentiment": [1] * len(pos_reviews_test) + [0] * len(neg_reviews_test),
})


In [4]:
# Creating bag of words vectorizer with only unigrams
v = CountVectorizer()

# Labels come straight from the sentiment column of each split.
y_train_uni, y_test_uni = train_df["sentiment"], test_df["sentiment"]

# The vocabulary is learned from the training texts only; the test texts are
# encoded with that same fitted vectorizer.
X_train_uni = v.fit_transform(train_df["text"])
X_test_uni = v.transform(test_df["text"])

In [5]:
# Creating a model using Naive Bayes to test unigram bag of words
nb = MultinomialNB()
nb.fit(X_train_uni, y_train_uni)

# Predict once and build the report from those predictions. The previous
# nb.score(...) call was dropped: its return value was neither stored nor
# displayed, and the report below already includes the accuracy.
y_pred_uni = nb.predict(X_test_uni)
print(classification_report(y_test_uni, y_pred_uni))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83     12500
           1       0.86      0.75      0.80     12500

    accuracy                           0.81     25000
   macro avg       0.82      0.81      0.81     25000
weighted avg       0.82      0.81      0.81     25000



In [6]:
# TF-IDF vectorizer and bigrams
# ngram_range=(2,2) sets both the minimum and maximum n-gram size to 2.
tfidf = TfidfVectorizer(ngram_range=(2,2))

# Labels come straight from the sentiment column of each split.
y_train_bi, y_test_bi = train_df["sentiment"], test_df["sentiment"]

# Fit on the training texts only, then reuse the fitted vectorizer to encode
# the test texts.
X_train_bi = tfidf.fit_transform(train_df["text"])
X_test_bi = tfidf.transform(test_df["text"])


In [7]:
# Creating a model using Naive Bayes to test the TF-IDF bigram features
# NOTE: this calls fit() again on the existing `nb` object, so after this cell
# `nb` is trained on the bigram features rather than the unigram ones.
nb.fit(X_train_bi, y_train_bi)

# Predict once and build the report from those predictions. The previous
# nb.score(...) call was dropped: its return value was neither stored nor
# displayed, and the report below already includes the accuracy.
y_pred_bi = nb.predict(X_test_bi)
print(classification_report(y_test_bi, y_pred_bi))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88     12500
           1       0.91      0.84      0.87     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [8]:
# Word tokenizer for LSTM model
y_train_LSTM = train_df["sentiment"]
y_test_LSTM = test_df["sentiment"]

# The tokenizer is fitted on the training reviews only; both splits are then
# converted to sequences with that fitted tokenizer.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_df["text"])
train_sequences = word_tokenizer.texts_to_sequences(train_df["text"])
test_sequences = word_tokenizer.texts_to_sequences(test_df["text"])

# One more than the number of entries in the tokenizer's word index
# (presumably so index 0 stays free for padding — confirm against Keras docs).
vocab_size = len(word_tokenizer.word_index) + 1

# Padding the reviews to make them of same length
maxsize = 100
X_train_LSTM = sequence.pad_sequences(train_sequences, maxlen=maxsize, padding='post')
X_test_LSTM = sequence.pad_sequences(test_sequences, maxlen=maxsize, padding='post')

In [9]:
# Creating a sequential model using LSTM
embedding_size = 32

# Three stacked layers: an embedding over the tokenizer vocabulary, an LSTM
# with 100 units, and a single sigmoid output unit.
# NOTE(review): mask_zero is not set on the Embedding, so the padded positions
# appear to be fed to the LSTM like ordinary tokens — confirm this is intended.
model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=maxsize),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

In [10]:
# Train and evaluate the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split is passed as validation_data here and is also
# the split evaluated below, so there is no separate held-out validation set.
model.fit(
    X_train_LSTM,
    y_train_LSTM,
    validation_data=(X_test_LSTM, y_test_LSTM),
    epochs=3,
    verbose=1,
)

# evaluate() returns the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test_LSTM, y_test_LSTM, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 82.572001
