In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
# !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip glove.6B.zip

In [None]:
from IPython.display import HTML, display

display_width = 100  # width of the notebook container, in percent

# Inject a CSS rule that overrides the width of the `.container` element.
display(HTML(f"<style>.container {{ width:{display_width}% !important; }}</style>"))

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# project modules imported below are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# import sys
# sys.path.append("/Users/sudhanshugupta/Library/Python/3.9/lib/python/site-packages")

In [None]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from EDA import EDA_movie_reviews
from preprocessing import preprocMovieReview
from feature_extraction import textVectorizer, gloveVectorizer
from utils import train_val_test_split
from classification import LogisticRegressionClf, BiLSTMClf

# Read Data

In [None]:
# Load the tab-separated training reviews into a DataFrame.
df_movie_reviews = pd.read_csv("./data/movie_review_train.tsv", delimiter="\t")

# Preview ten random rows. The seed is fixed so that the preview is identical
# on every Restart Kernel -> Run All.
df_movie_reviews.sample(10, random_state=42)

## Basic Text Preprocessing

In [None]:
# Build the project's preprocMovieReview helper over the raw 'review' column and
# store the output of its basic sanitization pipeline in a new 'review_clean'
# column (this adds a column to df_movie_reviews in place).
text_preprocessor = preprocMovieReview(df_movie_reviews['review'])
df_movie_reviews['review_clean'] = text_preprocessor.basic_text_sanitization_pipeline()

# Exploratory Data Analysis

In [None]:
# Run the project's EDA helper on the cleaned review text, with 'sentiment' as
# the target column.
eda = EDA_movie_reviews(df_movie_reviews, reviews_col='review_clean', target_col='sentiment')
eda.visualize_wordcloud()
# NOTE(review): 'visulaize' looks like a typo, but the call must match the name
# defined in the EDA module (not shown here) -- rename in both places together.
eda.visulaize_class_distribution()
eda.statistics_on_review_text()
eda.show_common_n_grams(n=5, show_count=10)

In [None]:
# Split the data into train / test / validation parts using the project helper,
# requesting 20 percent for test and 20 percent for validation.
# NOTE(review): the unpacking order below is (train, test, val) while the helper
# is named train_val_test_split -- confirm it matches the helper's return order.
# NOTE(review): features come from the raw 'review' column, not the
# 'review_clean' column created earlier -- confirm this is intended.
(
    X_train, 
    y_train, 
    X_test, 
    y_test, 
    X_val, 
    y_val
) = train_val_test_split(
    df_movie_reviews, 
    feature_cols='review', 
    target_col='sentiment',
    test_percent=20, 
    val_percent=20,
)
# Optional debug print of all split shapes (left disabled).
# print(f"{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}, {X_val.shape=}, {y_val.shape=}")

# Establish Baseline: TF-IDF Vectorizer + Logistic Regression

In [None]:
# Baseline features: a project textVectorizer in 'tfidf' mode.
# Presumably apply_transform_train fits on the training text and
# apply_transform_test reuses that fit -- TODO confirm in feature_extraction.
tfidf_vectorizer = textVectorizer(vectorizer_type='tfidf')
X_train_tfidf = tfidf_vectorizer.apply_transform_train(X_train)
X_test_tfidf = tfidf_vectorizer.apply_transform_test(X_test)

# Sanity check: print the shapes of the train and test feature matrices.
print(X_train_tfidf.shape, X_test_tfidf.shape)

In [None]:
# Baseline model: fit the project's logistic-regression wrapper on the TF-IDF
# training features, predict on the test features, and evaluate the predictions
# against y_test.
clf_logistic_regression = LogisticRegressionClf()
clf_logistic_regression.fit_classifier(X_train_tfidf, y_train)
y_pred = clf_logistic_regression.predict_classifier(X_test_tfidf)
clf_logistic_regression.evaluate_classifier(y_test, y_pred)

# Improved Classifier: TF-IDF Vectorizer + BiLSTM

In [None]:
# Rebuild the TF-IDF vectorizer and features (this rebinds tfidf_vectorizer,
# X_train_tfidf and X_test_tfidf from the baseline section) and additionally
# transform the validation split, which the BiLSTM fit below takes as input.
tfidf_vectorizer = textVectorizer(vectorizer_type='tfidf')
X_train_tfidf = tfidf_vectorizer.apply_transform_train(X_train)
X_test_tfidf = tfidf_vectorizer.apply_transform_test(X_test)
X_val_tfidf = tfidf_vectorizer.apply_transform_test(X_val)

# Sanity check: print the shapes of the train, test and validation matrices.
print(X_train_tfidf.shape, X_test_tfidf.shape, X_val_tfidf.shape)

In [None]:
# BiLSTM on TF-IDF features: configuration and a small smoke-test fit.

# Lower-case settings from the "example usage" template. They are not passed to
# BiLSTMClf below; the upper-case constants are what the classifier receives.
hidden_size = 16
output_size = 2
num_epochs = 10
batch_size = 8
lr_init = 1e-3

# No embedding dimension is supplied for TF-IDF input.
embedding_dim = np.nan
# Both sizes are set to the number of TF-IDF columns.
vocab_size = max_sequence_length = X_train_tfidf.shape[1]

HIDDEN_ACTIVATION = 'relu'
MAX_EPOCHS = 3
LR_INIT = 1e-3
BATCH_SIZE = 16
L2_REG_PENALTY = 1e-3
CALLBACKS = ["es", "rlrop", "chkpt", "tensorboard"]
VERBOSITY_LEVEL = 2
SAVE_DIR = "./data/"

# Arguments are positional, so their order must follow BiLSTMClf's signature
# (defined in the classification module, not shown here).
clf_bilstm = BiLSTMClf(
    embedding_dim,
    vocab_size,
    max_sequence_length,
    HIDDEN_ACTIVATION,
    MAX_EPOCHS,
    LR_INIT,
    BATCH_SIZE,
    L2_REG_PENALTY,
    CALLBACKS,
    VERBOSITY_LEVEL,
    SAVE_DIR,
)

# Fit on only the first 32 training and 32 validation rows.
clf_bilstm.fit_classifier(
    X_train_tfidf[:32],
    y_train[:32],
    X_val_tfidf[:32],
    y_val[:32],
)
# y_pred = clf_bilstm.predict_classifier(X_test)
# clf_bilstm.evaluate_classifier(y_test, y_pred)

In [None]:
# Predict and evaluate on only the first 16*3 = 48 test rows; the same slice is
# applied to the features and to the labels so they stay aligned.
y_pred = clf_bilstm.predict_classifier(X_test_tfidf[:16*3])
clf_bilstm.evaluate_classifier(y_test[:16*3], y_pred)

# Improved Embeddings: GloVe + BiLSTM

In [None]:
# Build the project's GloVe vectorizer from the training text, requesting
# 50-dimensional embeddings, a 10,000-word vocabulary and sequences of
# length 150.
glove_vectorizer = gloveVectorizer(
    X_train, 
    embedding_dim=50, 
    vocab_size=10_000, 
    max_sequence_length=150,
)

In [None]:
# Inspect the shape of the embedding matrix held by the vectorizer.
glove_vectorizer.embedding_matrix.shape

In [None]:
# Display the vectorizer object itself (its repr).
glove_vectorizer

In [None]:
from tensorflow.keras.layers import Embedding

In [None]:
# Build a frozen Keras Embedding layer initialised from the GloVe matrix.
# The layer is not assigned; it is only shown as the cell's output.
Embedding(
    # input_dim and output_dim are read from the weight matrix itself, so the
    # layer's dimensions always agree with the weights it is given.
    *glove_vectorizer.embedding_matrix.shape,
    weights=[glove_vectorizer.embedding_matrix],
    # Must equal the max_sequence_length=150 passed to gloveVectorizer.
    input_length=150,
    trainable=False,
)

In [None]:
# Look up the word 'sequence' in the tokenizer's word_index (presumably its
# integer id -- TODO confirm). Fails if the word is not present in the mapping.
glove_vectorizer.tokenizer.word_index['sequence']

In [None]:
# Inspect the shape of the vectorizer's stored text_sequence array.
glove_vectorizer.text_sequence.shape

In [None]:
# Show the item at key/index 0 of X_train.
# NOTE(review): if X_train is a pandas Series, [0] is a label lookup rather than
# "first row" and can fail after a shuffled split -- confirm X_train's type.
X_train[0]

In [None]:
# Show the first entry of the vectorizer's stored text_sequence.
glove_vectorizer.text_sequence[0]

In [None]:
# Number of entries in the vectorizer's word_index.
# NOTE(review): other cells read word_index via glove_vectorizer.tokenizer --
# confirm the vectorizer also exposes it directly.
len(glove_vectorizer.word_index)

In [None]:
from itertools import islice

In [None]:
# Collect the first ten keys yielded by iterating the tokenizer's word_index
# into a set.
set(islice(glove_vectorizer.tokenizer.word_index, 10))

In [None]:
# Preview the first five items returned by tokenizer.texts_to_sequences for the
# training text. `take` is never defined or imported in this notebook, so the
# preview uses islice, which the preceding import cell brings into scope.
list(islice(glove_vectorizer.tokenizer.texts_to_sequences(X_train), 5))

In [None]:
# Run the vectorizer's training transform on X_train; the result is not
# assigned, only shown as the cell's output.
glove_vectorizer.apply_transform_train(X_train)

In [None]:
# Store the GloVe-transformed training data under its own name instead of
# rebinding X_train. X_train is passed to DistilBERTClassifier.fit_classifier
# further down, and rebinding it here would also make this cell non-idempotent
# (a second run would transform already-transformed data).
# NOTE(review): the previous cell calls apply_transform_train on this same
# object -- confirm that apply_transform exists on gloveVectorizer.
X_train_glove = glove_vectorizer.apply_transform(X_train)

In [None]:
# Display whatever X_train is currently bound to.
X_train

In [None]:
# Settings for the GloVe + BiLSTM model: learning rate, batch size, epoch count
# and the checkpoint path. BATCH_SIZE was 16 in the TF-IDF BiLSTM section and is
# rebound here.
MODEL_PATH = './data/best_model_glove_BiLSTM.hdf5'
LR, BATCH_SIZE, EPOCHS = 1e-3, 1024, 10

# Improved Embeddings and Classifier: DistilBERT Classifier

In [None]:
from classification import DistilBERTClassifier

In [None]:
# Fine-tune the project's DistilBERT wrapper on the training split, with the
# validation split supplied alongside it, for 2 epochs at batch size 16.
# NOTE(review): X_train should hold the raw review text at this point -- verify
# that no earlier cell has rebound it to vectorized data.
clf_distilbert = DistilBERTClassifier()
clf_distilbert.fit_classifier(
    X_train,
    y_train,
    X_val,
    y_val,
    batch_size=16,
    n_epochs=2,
)

In [None]:
# Predict on the test split with the DistilBERT classifier and evaluate the
# predictions against y_test.
# NOTE(review): unlike the other classifiers in this notebook, predict_classifier
# is given y_test as well -- confirm the labels are not used to form predictions.
y_pred = clf_distilbert.predict_classifier(X_test, y_test)
clf_distilbert.evaluate_classifier(y_test, y_pred)

# Error Analysis