## To-Do:

1. Save Models
2. Add comments and markdown
3. 

In [None]:
# One-time setup: uncomment and run once to download the NLTK 'punkt' and
# 'stopwords' resources (presumably required by the preprocessing module — confirm).
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [None]:
# One-time setup: uncomment to download and unzip the GloVe 6B embeddings.
# NOTE(review): unzip extracts into the current working directory, whereas the
# GloVe section of this notebook reads ./assets/glove.6B.<dim>d.txt — move the
# extracted files there (or adjust the path) after downloading.
# !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip glove.6B.zip

In [None]:
from IPython.display import HTML, display

# Width of the notebook's `.container` element, as a percentage.
display_width = 100

# Inject a CSS override so the notebook uses the requested width.
display(
    HTML(f"<style>.container {{ width:{display_width}% !important; }}</style>")
)

In [None]:
# (Re)load IPython's autoreload extension and select mode 2, so edits to the
# project modules imported below are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import sys

# Extra site-packages directory made importable for this kernel.
# NOTE(review): this is a machine-specific absolute path (one user's Python 3.9
# user site on macOS); on any other machine it will not exist and the entry is
# simply ignored by the import system. Prefer installing the packages into the
# kernel's own environment and dropping this cell.
USER_SITE_PACKAGES = "/Users/sudhanshugupta/Library/Python/3.9/lib/python/site-packages"

# Append only once, so re-running this cell does not keep growing sys.path
# with duplicate entries.
if USER_SITE_PACKAGES not in sys.path:
    sys.path.append(USER_SITE_PACKAGES)

In [None]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from EDA import EDA_movie_reviews
from preprocessing import preprocMovieReview
from feature_extraction import textVectorizer, GloveVectorizer
from utils import train_val_test_split, compare_performances_across_classifiers
from classification import LogisticRegressionClf, BiLSTMClf, DistilBERTClassifier

# Read Data

In [None]:
# Load the tab-separated training reviews, then preview ten randomly drawn rows.
df_movie_reviews = pd.read_csv("./data/movie_review_train.tsv", sep="\t")
df_movie_reviews.sample(n=10)

## Basic Text Preprocessing

In [None]:
# Wrap the raw 'review' column in the project's preprocessor and store the
# output of its basic sanitization pipeline as a new 'review_clean' column.
# (The individual cleaning steps live in the preprocessing module, not here.)
text_preprocessor = preprocMovieReview(df_movie_reviews['review'])
df_movie_reviews['review_clean'] = text_preprocessor.basic_text_sanitization_pipeline()

In [None]:
# Spot-check: show the values of three randomly sampled cleaned reviews.
df_movie_reviews['review_clean'].sample(3).values

# Exploratory Data Analysis

In [None]:
# Build the EDA helper over the whole frame, pointing it at the cleaned text
# column ('review_clean') and the label column ('sentiment').
eda = EDA_movie_reviews(df_movie_reviews, reviews_col='review_clean', target_col='sentiment')

In [None]:
# Word-cloud view of the reviews (rendering is implemented in the EDA module).
eda.visualize_wordcloud()

In [None]:
# Class distribution of the target column.
# NOTE(review): the method name is spelled 'visulaize' — presumably this matches
# the definition in the EDA module; if the typo is fixed, rename both together.
eda.visulaize_class_distribution()

In [None]:
# Summary statistics on the review text; the max_sequence_length conclusions
# noted right after this cell are presumably read off this output.
eda.statistics_on_review_text()

***Conclusions***

1. **A `max_sequence_length` of 150 covers 75% of the reviews.**
2. **A `max_sequence_length` of 300 covers 95% of the reviews.**

In [None]:
# Most common n-grams with n=3; show_count=20 presumably caps the listing at 20 entries.
eda.show_common_n_grams(n=3, show_count=20)

# Establish Baseline: TFIDF Vectorizer + Basic Classifier

In [None]:
# Split the cleaned reviews for the baseline model, requesting a 25% test share.
# No val_percent is passed, so the validation share is whatever
# train_val_test_split (imported from utils) uses by default.
# Note the return order: train, test, then validation.
X_train, y_train, X_test, y_test, X_val, y_val = train_val_test_split(
    df_movie_reviews,
    feature_cols="review_clean",
    target_col="sentiment",
    test_percent=25,
)

# Uncomment to inspect the shape of every split:
# print(f"{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}, {X_val.shape=}, {y_val.shape=}")

In [None]:
# TF-IDF features for the baseline: the training split goes through
# apply_transform_train and the test split through apply_transform_test
# (presumably fit+transform vs. transform-only — see feature_extraction).
tfidf_vectorizer = textVectorizer(vectorizer_type='tfidf')
X_train_tfidf = tfidf_vectorizer.apply_transform_train(X_train)
X_test_tfidf = tfidf_vectorizer.apply_transform_test(X_test)

# Sanity check on the resulting matrix shapes.
print(X_train_tfidf.shape, X_test_tfidf.shape)

In [None]:
# Baseline model: fit on the TF-IDF training matrix, predict on the TF-IDF
# test matrix, then evaluate the predictions against y_test.
clf_logistic_regression = LogisticRegressionClf()
clf_logistic_regression.fit_classifier(X_train_tfidf, y_train)
y_pred = clf_logistic_regression.predict_classifier(X_test_tfidf)
clf_logistic_regression.evaluate_classifier(y_test, y_pred)

# Improved Classifier: TFIDF Vectorizer + BiLSTM

In [None]:
# Fresh split for the BiLSTM, requesting 15% test and 15% validation shares.
# This rebinds the X_*/y_* names produced by the baseline section.
# Note the return order: train, test, then validation.
X_train, y_train, X_test, y_test, X_val, y_val = train_val_test_split(
    df_movie_reviews,
    feature_cols="review_clean",
    target_col="sentiment",
    test_percent=15,
    val_percent=15,
)

# Uncomment to inspect the shape of every split:
# print(f"{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}, {X_val.shape=}, {y_val.shape=}")

In [None]:
# Input geometry for the TF-IDF-fed BiLSTM. Each review is later reshaped to
# (vocab_size, 1): a sequence with one step per vocabulary term, each step
# carrying a single TF-IDF weight. Hence the sequence length equals the
# vocabulary size and the per-step "embedding" has dimension 1.
vocab_size = 10_000
max_sequence_length = vocab_size
embedding_dim = 1

In [None]:
tfidf_vectorizer = textVectorizer(vectorizer_type="tfidf", vocab_size=vocab_size)


def _tfidf_to_sequences(tfidf_matrix, n_samples):
    """Densify a TF-IDF matrix and move it to shape (n_samples, vocab_size, 1).

    The reshape to (1, n_samples, vocab_size) followed by the axis permutation
    (1, 2, 0) appends a trailing axis of length 1, so every TF-IDF weight
    becomes one single-feature time step. Note that .toarray() materialises the
    full dense n_samples x vocab_size array in memory.
    """
    dense = tfidf_matrix.toarray()
    return dense.reshape(-1, n_samples, vocab_size).transpose(1, 2, 0)


# Training split goes through apply_transform_train; test and validation go
# through apply_transform_test on the same vectorizer instance.
X_train_tfidf = _tfidf_to_sequences(tfidf_vectorizer.apply_transform_train(X_train), len(X_train))
X_test_tfidf = _tfidf_to_sequences(tfidf_vectorizer.apply_transform_test(X_test), len(X_test))
X_val_tfidf = _tfidf_to_sequences(tfidf_vectorizer.apply_transform_test(X_val), len(X_val))

print(X_train_tfidf.shape, X_test_tfidf.shape, X_val_tfidf.shape)

In [None]:
# Training configuration for the BiLSTM fed with TF-IDF sequences.
HIDDEN_ACTIVATION = "relu"
LR_INIT = 1e-3
L2_REG_PENALTY = 1e-3
MAX_EPOCHS = 5
BATCH_SIZE = 16
VERBOSITY_LEVEL = 2
# Callback identifiers are interpreted inside BiLSTMClf (see classification module).
CALLBACKS = ["es", "rlrop", "tensorboard"]
SAVE_DIR = "./assets/"

# All arguments are positional, so their order must match the BiLSTMClf
# constructor signature exactly.
clf_bilstm = BiLSTMClf(
    embedding_dim, vocab_size, max_sequence_length,
    HIDDEN_ACTIVATION,
    MAX_EPOCHS, LR_INIT, BATCH_SIZE, L2_REG_PENALTY,
    CALLBACKS, VERBOSITY_LEVEL, SAVE_DIR,
)

# Fit on the first 32 rows of the training and validation splits only.
# NOTE(review): looks like a smoke-test setting — the following cell evaluates
# on the full test split, so confirm before reporting these metrics.
_first_rows = slice(32)
clf_bilstm.fit_classifier(
    X_train_tfidf[_first_rows],
    y_train[_first_rows],
    X_val_tfidf[_first_rows],
    y_val[_first_rows],
)

In [None]:
# Predict on the complete TF-IDF test split and evaluate against y_test.
# Keep in mind the model above was fitted on only the first 32 training rows.
y_pred = clf_bilstm.predict_classifier(X_test_tfidf)
clf_bilstm.evaluate_classifier(y_test, y_pred)

# Improved Embeddings: GloVe + BiLSTM

In [None]:
# Re-clean the raw 'review' text for the GloVe model using only
# remove_special_characters(), instead of the basic_text_sanitization_pipeline()
# applied at the start of the notebook.
# NOTE(review): this overwrites the existing 'review_clean' column in place, so
# every later split (including the DistilBERT section) reads this version of
# the text — confirm that is intended, or write to a separate column.
text_preprocessor = preprocMovieReview(df_movie_reviews['review'])
df_movie_reviews['review_clean'] = text_preprocessor.remove_special_characters()

In [None]:
# Re-split after re-cleaning, requesting 15% test and 15% validation shares.
# Note the return order: train, test, then validation.
X_train, y_train, X_test, y_test, X_val, y_val = train_val_test_split(
    df_movie_reviews,
    feature_cols="review_clean",
    target_col="sentiment",
    test_percent=15,
    val_percent=15,
)

# Uncomment to inspect the shape of every split:
# print(f"{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}, {X_val.shape=}, {y_val.shape=}")

In [None]:
# GloVe + BiLSTM input geometry.
# max_sequence_length of 500 is above the 300 that the EDA conclusions earlier
# in this notebook report as covering 95% of the data.
max_sequence_length = 500
vocab_size = 50_000
embedding_dim = 50

# The embedding file name encodes the vector dimension chosen above.
glove_emb_path = f"./assets/glove.6B.{embedding_dim}d.txt"

In [None]:
glove_vectorizer = GloveVectorizer(
    embedding_dim,
    vocab_size,
    max_sequence_length,
    glove_emb_path,
)

# Vectorize the three splits with the same vectorizer, in train -> test -> val order.
# NOTE(review): vectorize_text is called identically for every split; if it
# (re)fits a tokenizer on each call the splits would not share a vocabulary —
# confirm against the GloveVectorizer implementation.
X_train_glove, X_test_glove, X_val_glove = (
    glove_vectorizer.vectorize_text(split) for split in (X_train, X_test, X_val)
)

print(X_train_glove.shape, X_test_glove.shape, X_val_glove.shape)

In [None]:
# Training configuration for the BiLSTM fed with GloVe-vectorized reviews.
# Same values as the TF-IDF run except for the larger batch size.
HIDDEN_ACTIVATION = "relu"
LR_INIT = 1e-3
L2_REG_PENALTY = 1e-3
MAX_EPOCHS = 5
BATCH_SIZE = 256
VERBOSITY_LEVEL = 2
# Callback identifiers are interpreted inside BiLSTMClf (see classification module).
CALLBACKS = ["es", "rlrop", "tensorboard"]
SAVE_DIR = "./assets/"

# All arguments are positional, so their order must match the BiLSTMClf
# constructor signature exactly.
clf_bilstm = BiLSTMClf(
    embedding_dim, vocab_size, max_sequence_length,
    HIDDEN_ACTIVATION,
    MAX_EPOCHS, LR_INIT, BATCH_SIZE, L2_REG_PENALTY,
    CALLBACKS, VERBOSITY_LEVEL, SAVE_DIR,
)

# Fit on the full training split, passing the validation split alongside it.
clf_bilstm.fit_classifier(X_train_glove, y_train, X_val_glove, y_val)

In [None]:
# Predict on the GloVe-vectorized test split and evaluate against y_test.
y_pred = clf_bilstm.predict_classifier(X_test_glove)
clf_bilstm.evaluate_classifier(y_test, y_pred)

# Improved Embeddings and Classifier: DistilBERT Classifier

**TODO:** Add transfer learning using the Rotten Tomatoes dataset.

In [None]:
# Split for DistilBERT, requesting 15% test and 15% validation shares.
# 'review_clean' here is whatever the most recently run cleaning cell wrote.
# Note the return order: train, test, then validation.
X_train, y_train, X_test, y_test, X_val, y_val = train_val_test_split(
    df_movie_reviews,
    feature_cols="review_clean",
    target_col="sentiment",
    test_percent=15,
    val_percent=15,
)

# Report the shape of every split.
print(f"{X_train.shape=}, {y_train.shape=}, {X_test.shape=}, {y_test.shape=}, {X_val.shape=}, {y_val.shape=}")

In [None]:
clf_distilbert = DistilBERTClassifier()

# Fit on the first 16 rows of the training and validation splits only,
# with batches of 4 for 2 epochs.
# NOTE(review): looks like a smoke-test setting — confirm before reporting metrics.
clf_distilbert.fit_classifier(
    X_train[:16],
    y_train[:16],
    X_val[:16],
    y_val[:16],
    batch_size=4,
    n_epochs=2,
)

In [None]:
# Predict and evaluate on the first 160 rows of the test split only.
y_pred = clf_distilbert.predict_classifier(X_test[:160])
clf_distilbert.evaluate_classifier(y_test[:160], y_pred)

In [None]:
# Release GPU memory that PyTorch's caching allocator is holding but not
# currently using, e.g. after the DistilBERT run above.
# NOTE(review): torch is imported here rather than in the import cells at the top.
import torch
torch.cuda.empty_cache()

In [None]:
# Print PyTorch's CUDA allocator report for the current device.
# torch.cuda.memory_summary() has to initialise CUDA and raises on machines
# without a usable GPU, so guard it to keep "Run All" working on CPU-only setups.
# Relies on `torch` having been imported by the preceding cell.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to report.")

# Compare Classifiers

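**TODO:** Collect each classifier's metrics into `df_metrics` (it is not defined anywhere in this notebook yet), then run `compare_performances_across_classifiers(df_metrics)`.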