<a href="https://colab.research.google.com/github/stemlock/w266_final_project/blob/master/Base_Model_Colab_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This workbook follows the example here: https://huggingface.co/transformers/custom_datasets.html?highlight=sequence#seq-imdb

The data can be downloaded directly from the Stanford website with the following two commands:
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz

In [None]:
# Mount Google Drive into the Colab session so the project files are reachable
from google.colab import drive
drive.mount('/content/drive')

# Work from the project's code directory; the relative paths used by later
# cells (e.g. 'data/processed_train.csv') are resolved against it
%cd '/content/drive/My Drive/W266 Final Project/Code'

In [None]:
# This only needs to be run once to install the transformers library
# import os, sys
# nb_path = '/content/notebooks'
# os.symlink('/content/drive/My Drive/', nb_path)
# sys.path.insert(0,nb_path)

# !pip install --target=$nb_path transformers
# !pip install transformers

In [None]:
# Imports
import os
import random

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

In [None]:
# Set random seed
# `seed` is passed explicitly to train_test_split further down. Seeding the
# Python and TensorFlow global generators here as well makes the other sources
# of randomness in the notebook repeatable between runs.
seed = 42
random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# # # Preprocess data in script to create neutral and gendered versions
# !python3 preprocess.py -d 'data/aclImdb/train/' -v 'wordlist/' -o 'data/processed_train.csv'
# !python3 preprocess.py -d 'data/aclImdb/test/' -v 'wordlist/' -o 'data/processed_test.csv'

In [None]:
# Load the preprocessed train and test CSVs.
# Rows with NA in neutral_review_text are reviews in which no tokens were replaced.
df_train, df_test = map(pd.read_csv, ('data/processed_train.csv', 'data/processed_test.csv'))
df_train.head()

In [None]:
# Count the reviews that had no replacement tokens, i.e. the rows whose
# neutral_review_text is NA, in each split
print("Nongendered reviews in train:", df_train['neutral_review_text'].isna().sum())
print("Nongendered reviews in test:", df_test['neutral_review_text'].isna().sum())

In [None]:
# Keep only the train rows whose neutral_review_text is NA (the nongendered reviews)
is_nongendered = df_train['neutral_review_text'].isna()
df_nongendered = df_train.loc[is_nongendered]

In [None]:
# Share of the original train set taken by each review score
train_score_counts = df_train['review_score'].value_counts()
train_score_counts / len(df_train)

In [None]:
# Share of the nongendered subset taken by each review score
nongendered_score_counts = df_nongendered['review_score'].value_counts()
nongendered_score_counts / len(df_nongendered)

In [None]:
# Check the distribution of the review scores in train
# DataFrame.hist draws the histogram of the review_score column; plt.title then
# sets the title on pyplot's current axes before the figure is shown, so the
# order of these three statements matters.
df_train.hist(column='review_score')
plt.title("Histogram of review scores in train dataset")
plt.show()

In [None]:
# Check the distribution of the review scores in nongendered reviews
# DataFrame.hist draws the histogram of the review_score column; plt.title then
# sets the title on pyplot's current axes before the figure is shown, so the
# order of these three statements matters.
df_nongendered.hist(column='review_score')
plt.title("Histogram of review scores in nongendered subset")
plt.show()

In [None]:
# Remove the nongendered rows from train.
# NOTE: dropna() is called without `subset`, so a row is dropped when ANY of
# its columns is NA, not only when neutral_review_text is NA.
df_train = df_train.dropna()
print("Number of rows left in train:", len(df_train))
print("Class balance:")
print(df_train['label'].value_counts())

In [None]:
# Remove the nongendered rows from test.
# NOTE: dropna() is called without `subset`, so a row is dropped when ANY of
# its columns is NA, not only when neutral_review_text is NA.
df_test = df_test.dropna()
print("Number of rows left in test:", len(df_test))
print("Class balance:")
print(df_test['label'].value_counts())

In [None]:
# ## TO DO: Try to fix this function. For some reason, this causes the model.fit() to fail
# def encode_datasets(X_train, y_train, X_test, y_test, tokenizer, split_size=0.5, seed=42):

#   '''
#   Takes in train and test data and encodes them into train, dev, and test
#   TF datasets using the provided tokenizer.
#   '''

#   # Split test set into dev and test
#   X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=split_size, random_state=seed)

#   # Apply tokenizer to each dataset
#   train_encodings = tokenizer(X_train, truncation=True, padding=True)
#   dev_encodings = tokenizer(X_dev, truncation=True, padding=True)
#   test_encodings = tokenizer(X_test, truncation=True, padding=True)

#   # Turn encodings into datasets for easy batching
#   train_dataset = tf.data.Dataset.from_tensor_slices((
#       dict(train_encodings),
#       y_train
#   ))
#   dev_dataset = tf.data.Dataset.from_tensor_slices((
#       dict(dev_encodings),
#       y_dev
#   ))
#   test_dataset = tf.data.Dataset.from_tensor_slices((
#       dict(test_encodings),
#       y_test
#   ))

#   return train_dataset, dev_dataset, test_dataset

In [None]:
# # Load data
# train_texts = df_train['review_text'].values.tolist()
# n_train_texts = df_train['neutral_review_text'].values.tolist()
# f_train_texts = df_train['female_review_text'].values.tolist()
# m_train_texts = df_train['male_review_text'].values.tolist()
# train_labels = df_train['label'].values.tolist()

# test_texts = df_test['review_text'].values.tolist()
# n_test_texts = df_test['neutral_review_text'].values.tolist()
# f_test_texts = df_test['female_review_text'].values.tolist()
# m_test_texts = df_test['male_review_text'].values.tolist()
# test_labels = df_test['label'].values.tolist()

In [None]:
# # Specify tokenizer and encode each dataset
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# train_dataset, dev_dataset, test_dataset = encode_datasets(train_texts, train_labels, 
#                                                            test_texts, test_labels,
#                                                            tokenizer)
# n_train_dataset, n_dev_dataset, n_test_dataset = encode_datasets(n_train_texts, train_labels, 
#                                                                  n_test_texts, test_labels,
#                                                                  tokenizer)

In [None]:
# Pull every text variant (original, neutral, female, male) and the labels out
# of the cleaned frames as plain Python lists
text_columns = ['review_text', 'neutral_review_text', 'female_review_text', 'male_review_text']

train_texts, n_train_texts, f_train_texts, m_train_texts = (
    df_train[column].values.tolist() for column in text_columns
)
train_labels = df_train['label'].values.tolist()

test_texts, n_test_texts, f_test_texts, m_test_texts = (
    df_test[column].values.tolist() for column in text_columns
)
test_labels = df_test['label'].values.tolist()

In [None]:
# Create the dev set from half of the held-out TEST data (the train set is untouched).
#
# All four text variants and the labels go through ONE train_test_split call,
# so every output is shuffled identically and the variants stay row-aligned
# with dev_labels / test_labels by construction.
#
# The inputs are read from df_test rather than from test_texts / test_labels,
# which this cell overwrites. Re-running the cell therefore reproduces the same
# split instead of halving the already-split test set again.
split_columns = ['review_text', 'neutral_review_text', 'female_review_text',
                 'male_review_text', 'label']
(dev_texts, test_texts,
 n_dev_texts, n_test_texts,
 f_dev_texts, f_test_texts,
 m_dev_texts, m_test_texts,
 dev_labels, test_labels) = train_test_split(
    *(df_test[column].values.tolist() for column in split_columns),
    test_size=.5, random_state=seed)

In [None]:
# Load the pretrained DistilBERT tokenizer and encode the original-text splits
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Every split is encoded with the same options: truncation and padding enabled
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **encode_options)
dev_encodings = tokenizer(dev_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
# Wrap each (encodings, labels) pair in a tf.data.Dataset for easy batching
train_dataset, dev_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Encode the neutral-text splits.
# This reuses the `tokenizer` already loaded for the original-text splits
# instead of loading a second, identical copy of 'distilbert-base-uncased'.
n_train_encodings = tokenizer(n_train_texts, truncation=True, padding=True)
n_dev_encodings = tokenizer(n_dev_texts, truncation=True, padding=True)
n_test_encodings = tokenizer(n_test_texts, truncation=True, padding=True)

In [None]:
# Wrap each neutral (encodings, labels) pair in a tf.data.Dataset for easy batching.
# The labels are the same lists used for the original-text datasets.
n_train_dataset, n_dev_dataset, n_test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (n_train_encodings, train_labels),
        (n_dev_encodings, dev_labels),
        (n_test_encodings, test_labels),
    )
)

In [None]:
# Initialize the TPU devices
# The resolver below is reused for connecting, initializing and building the strategy
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()

# Connect to the cluster and initialize the TPU system, then list the TPU
# logical devices that TensorFlow can see
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices:")
print(tf.config.list_logical_devices('TPU'))

# Create the distribution strategy for training on TPU cluster
tpu_strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# One minus the mean of the dev labels. With 0/1 labels (assumed - TODO confirm)
# this is the share of label-0 reviews in the dev set.
dev_label_mean = sum(dev_labels) / len(dev_labels)
1 - dev_label_mean

In [None]:
# Fine-tune one DistilBERT classifier per dataset variant (original text first,
# then neutral text) and collect the Keras training histories in `histories1`.
# NOTE: `model` is rebound on every iteration, so after the loop it refers to
# the model trained on the neutral dataset.
histories1 = []
for train, dev in [(train_dataset, dev_dataset), (n_train_dataset, n_dev_dataset)]:
  # Create and compile the model inside the TPU strategy scope
  with tpu_strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    # Sparse categorical loss on the raw model outputs (from_logits=True)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

  # For tf.data.Dataset inputs, Keras ignores fit(shuffle=...) and documents
  # that batch_size must not be passed, so shuffling and batching are done on
  # the dataset itself. The shuffle buffer covers the whole train set (both
  # train datasets are built from train_labels, so they share its length) and
  # is seeded for repeatability.
  train_batches = train.shuffle(len(train_labels), seed=seed).batch(16).prefetch(1)
  dev_batches = dev.batch(16).prefetch(1)

  history = model.fit(train_batches, validation_data=dev_batches, epochs=10)
  histories1.append(history)

In [None]:
# Persist the fine-tuned model and the tokenizer to disk.
# The transformers model class is saved with save_pretrained(); it has no
# save_model() method, so the previous call raised AttributeError.
# NOTE(review): `model` here is the last model built by the training loop (the
# one fitted on the neutral dataset) - confirm that "original_base_model" is
# the intended directory name.
model.save_pretrained("original_base_model")
tokenizer.save_pretrained("original_tokenizer")