In [1]:
import os

from pathlib import Path

import tensorflow as tf

import pandas as pd 
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses

In [2]:
print(tf.__version__)

2.13.1


In [3]:
def _get_imdb_data_from_web(path="train"):
    _LOCAL_FNAME = f"./imdb_full_dataset_{path}.csv"
    local_file = Path(_LOCAL_FNAME)
    
    if local_file.is_file():
        return pd.read_csv(_LOCAL_FNAME)
    
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                        untar=True, cache_dir='.',
                                        cache_subdir='')

    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    
    _, _, pos_files = next(os.walk(f"./aclImdb/{path}/pos/"))
    _, _, neg_files = next(os.walk(f"./aclImdb/{path}/neg/"))
    
    pos_corpus = []
    for fname in pos_files:
        with open(f"./aclImdb/{path}/pos/" + fname) as f:
            text = f.read()
            pos_corpus.append(text)

    pos_labels = [1] * len(pos_corpus)
    
    neg_corpus = []
    for fname in neg_files:
        with open(f"./aclImdb/{path}/neg/" + fname) as f:
            text = f.read()
            neg_corpus.append(text)

    neg_labels = [0] * len(neg_corpus)
    
    df = pd.DataFrame({"text": pos_corpus + neg_corpus, "label": pos_labels + neg_labels})
    
    df = df.sample(frac=1.0, random_state=0)
    
    df.to_csv(_LOCAL_FNAME)
    
    return df 

In [4]:
df = _get_imdb_data_from_web()

In [5]:
df.head()

Unnamed: 0,text,label
14149,I had two reasons for watching this swashbuckl...,0
8946,"This is, in my opinion, a very good film, espe...",1
22378,I knew this film was supposed to be so bad it ...,0
12162,"When the US entered World War I, the governmen...",1
4879,Few movies can be viewed almost 60 years later...,1


In [6]:
print("class one average", np.mean(df["label"]))

class one average 0.5


In [7]:
len(df)

25000

In [8]:
def preprocess_text(input_text: str) -> str:
    s = input_text.lower()
    s = s.replace('<br />', ' ')
    return s
    # return token_pattern.findall(s)

In [9]:
df["text"] = df["text"].map(preprocess_text)

In [10]:
tok = tf.keras.preprocessing.text.Tokenizer()

In [11]:
from sklearn.model_selection import train_test_split


In [12]:
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], 
    df["label"], 
    test_size=0.2,
    random_state=1,
)

In [13]:
tok.fit_on_texts(text_train)

In [14]:
len(tok.word_index)


80476

In [15]:
X_train = tok.texts_to_sequences(text_train)
X_test = tok.texts_to_sequences(text_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=512, padding="post")
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=512, padding="post")

In [21]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(tok.index_word) + 1, output_dim=16, input_length=512),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid")]
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 16)           1287632   
                                                                 
 dropout_2 (Dropout)         (None, 512, 16)           0         
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1287649 (4.91 MB)
Trainable params: 1287649 (4.91 MB)
Non-trainable params: 0 (0.00 Byte)
________________

In [24]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=3,
    verbose=0,
    restore_best_weights=True,
)

In [25]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.5))


In [26]:
epochs = 20
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
df_test = _get_imdb_data_from_web(path="test")

In [28]:
df_test["text"] = df_test["text"].map(preprocess_text)
X = tok.texts_to_sequences(df_test["text"])
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=512, padding="post")

In [29]:
preds = model.predict(X)



In [32]:
df_test["preds"] = preds.ravel()

In [33]:
df_test["pred_discrete"] = (df_test["preds"] > 0.5).map(int)

In [35]:
print("accuracy on test data is", np.mean(df_test["pred_discrete"] == df_test["label"]))

accuracy on test data is 0.88856


In [36]:
model.save('random_word_averaging_model.keras')

In [44]:
df_finance = pd.read_csv("financial_data_kaggle.csv",
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

filter_ = df_finance["sentiment"].map(lambda x: x in {"positive", "negative"})

df_finance = df_finance[filter_].copy()

mapping_ = {
    "positive": 1,
    "negative": 0,
}

df_finance["sentiment"] = df_finance["sentiment"].map(mapping_)

df_finance["text"] = df_finance["text"].map(preprocess_text)

In [45]:
X = tok.texts_to_sequences(df_finance["text"])
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=512, padding="post")

In [46]:
df_finance["preds"] = model.predict(X).ravel()



In [47]:
df_finance["pred_discrete"] = (df_finance["preds"] > 0.5).map(int)
print("accuracy on test data is", np.mean(df_finance["pred_discrete"] == df_finance["sentiment"]))

accuracy on test data is 0.614133197763091


# LSTM Model

In [48]:
X_train = tok.texts_to_sequences(text_train)
X_test = tok.texts_to_sequences(text_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=128, padding="post")
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=128, padding="post")

In [49]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=len(tok.index_word) + 1, output_dim=64, input_length=128),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid")]
)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 128, 64)           5150528   
                                                                 
 dropout_4 (Dropout)         (None, 128, 64)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5183617 (19.77 MB)
Trainable params: 5183617 (19.77 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.5))


In [52]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=3,
    verbose=0,
    restore_best_weights=True,
)

In [53]:
epochs = 20
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    callbacks=[callback]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [54]:
df_test = _get_imdb_data_from_web(path="test")

df_test["text"] = df_test["text"].map(preprocess_text)
X = tok.texts_to_sequences(df_test["text"])
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=128, padding="post")

preds = model.predict(X)

df_test["preds"] = preds.ravel()

df_test["pred_discrete"] = (df_test["preds"] > 0.5).map(int)

print("accuracy on test data is", np.mean(df_test["pred_discrete"] == df_test["label"]))

accuracy on test data is 0.85316


In [55]:
df_finance = pd.read_csv("financial_data_kaggle.csv",
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

filter_ = df_finance["sentiment"].map(lambda x: x in {"positive", "negative"})

df_finance = df_finance[filter_].copy()

mapping_ = {
    "positive": 1,
    "negative": 0,
}

df_finance["sentiment"] = df_finance["sentiment"].map(mapping_)

df_finance["text"] = df_finance["text"].map(preprocess_text)

X = tok.texts_to_sequences(df_finance["text"])
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=512, padding="post")

df_finance["preds"] = model.predict(X).ravel()

df_finance["pred_discrete"] = (df_finance["preds"] > 0.5).map(int)
print("accuracy on test data is", np.mean(df_finance["pred_discrete"] == df_finance["sentiment"]))

accuracy on test data is 0.594306049822064


In [56]:
model.save('random_word_lstm_model.keras')