In [None]:
# Use GPU option

# In Colab, go to Edit/Notebook Settings and choose the 'GPU' option before running this script

In [None]:
# Load data

# there are several ways to load data into Colab

# 1. Host your data to GitHub (up to 25MB) and use the url to the GitHub page
# e.g. df = pd.read_csv('https://raw.githubusercontent.com/junwang4/causal-language-use-in-science/master/data/pubmed_causal_language_use.csv') 

# 2. Host your data in your Google drive and then mount to your Google drive. You will be given an authorization code to finish the process
# e.g. the following code
# from google.colab import drive
# drive.mount('/drive')
# df = pd.read_csv('/drive/My Drive/train.tsv', sep='\t')

# 3. Upload your data to Colab Files. The uploaded file will be deleted when the session is disconnected. You will need to upload again after re-connect.
# e.g.   df = pd.read_csv('train.tsv', sep='\t')

# In this script we use method # 2

# We will use the Kaggle sentiment classification data
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

In [1]:
import pandas as pd

In [2]:
# just need to run this code once
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [3]:
df = pd.read_csv('/drive/My Drive/Colab Notebooks/data/train.tsv', sep='\t')

In [None]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [37]:
# prepare train and test data
# since fine tuning a BERT model still requires a significant amount of time, 
# only 1000 training examples and 1000 test examples will be used for demo purpose.
# prior experiment shows LinearSVC's best accuracy (3-fold CV) is about 62-65% depending on vectorization options
# BERT should be able to outperform LinearSVC with far fewer training examples.

dff = df.sample(frac=1)
#train_size = int(0.01 * len(dff))
#df_train = dff[:train_size]
#df_test = dff[train_size:]
df_train_svm = dff[:100000]
df_train_bert = dff[:5000]
df_test = dff[-1000:]
print(df_train_svm.shape)
print(df_train_bert.shape)
print(df_test.shape)

X_train_svm, y_train_svm = df_train_svm['Phrase'].values, df_train_svm['Sentiment'].values

X_train_bert, y_train_bert = df_train_bert['Phrase'].values, df_train_bert['Sentiment'].values

X_test, y_test = df_test['Phrase'].values, df_test['Sentiment'].values

(100000, 4)
(5000, 4)
(1000, 4)


In [32]:
# build a LinearSVC model as a baseline comparison to the BERT model
# since LinearSVC is a linear model, we can print out its top features in each category to see whether the model learned something meaningful
# in this example, the top 10 features for the "very negative" category (category 0) is printed out
# you can see that some top features are not so negative
# with only 1000 training examples, LinearSVC achieved 0.529 accuracy on the 1000 test examples.

from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=2, stop_words='english')
X_train_svm_vec = unigram_count_vectorizer.fit_transform(X_train_svm)
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1, max_iter=2000)
svm_clf.fit(X_train_svm_vec,y_train_svm)
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names_out()))
very_negative_10 = feature_ranks[-10:]
print("Very negative words")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

Very negative words
(1.6970208793596249, 'disappointment')
(1.7011257540875322, 'loser')
(1.775843755405945, 'unbearable')
(1.8897275714258845, 'zzzzzzzzz')
(1.9628388341038916, 'cesspool')
(1.9652217877837286, 'repugnant')
(1.979214158944231, 'baaaaaaaaad')
(2.0213949016308703, 'snoozer')
(2.025041597506754, 'unwatchable')
(2.071373151241767, 'unappealing')



In [33]:
# LinearSVC test performance
from sklearn.metrics import f1_score

X_test_vec = unigram_count_vectorizer.transform(X_test)
y_pred = svm_clf.predict(X_test_vec)

# accuracy
svm_acc = svm_clf.score(X_test_vec,y_test)
print(svm_acc)

# f1_score
svm_f1 = f1_score(y_test, y_pred, average = None)
svm_f1_avg = f1_score(y_test, y_pred, average = 'macro')
print(svm_f1)
print(svm_f1_avg)

0.605
[0.34782609 0.41114983 0.73859649 0.46829268 0.36170213]
0.46551344291099356


In [9]:
# install BERT sklearn wrapper written by charles9n
# check out the github page for fine tuning options and usage
# https://github.com/charles9n/bert-sklearn

!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 259, done.[K
remote: Total 259 (delta 0), reused 0 (delta 0), pack-reused 259[K
Receiving objects: 100% (259/259), 516.15 KiB | 1.15 MiB/s, done.
Resolving deltas: 100% (131/131), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing /content/bert-sklearn
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Collecting boto3
  Downloading boto3-1.24.89-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 26.2 MB/s 
Collecting botocore<1.28.0,>=1.27.89
  Downloading botocore-1.27.89-py3-none-any.wh

In [38]:
# fine tune a BERT base uncased model
# since this wrapper has included vectorization using word embedding, no need to vectorize like in LinearSVC
# first the pre-trained BERT model will be loaded in
# then the training starts. 90% examples will be used as training examples and the other 10% as validation (parameter tuning)
# default setting is 3 epoch. Each epoch takes in some training data
from bert_sklearn import BertClassifier
model = BertClassifier()         # text/text pair classification
print(model)
model.fit(X_train_bert, y_train_bert)

Building sklearn text classifier...
BertClassifier()
Loading bert-base-uncased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 4500, validation data size: 500



Training  :   0%|          | 0/141 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 1, Train loss: 1.0831, Val loss: 0.8373, Val accy: 64.60%



Training  :   0%|          | 0/141 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 2, Train loss: 0.6947, Val loss: 0.8374, Val accy: 66.00%



Training  :   0%|          | 0/141 [00:00<?, ?it/s]

Validating:   0%|          | 0/63 [00:00<?, ?it/s]


Epoch 3, Train loss: 0.4585, Val loss: 0.9059, Val accy: 66.40%



BertClassifier(do_lower_case=True, label_list=array([0, 1, 2, 3, 4]))

In [24]:
model.save('bert-sentiment.model')

In [39]:
# test accuracy
bert_acc = model.score(X_test, y_test)
print(bert_acc)

# test f1_score
y_pred = model.predict(X_test)
bert_f1 = f1_score(y_test, y_pred, average = None)
bert_f1_macro = f1_score(y_test, y_pred, average = 'macro')

print(bert_f1)
print(bert_f1_macro)

Testing:   0%|          | 0/125 [00:00<?, ?it/s]


Loss: 0.9581, Accuracy: 66.50%
66.5


Predicting:   0%|          | 0/125 [00:00<?, ?it/s]

[0.39344262 0.56111111 0.77255639 0.54892601 0.54166667]
0.56354056120517


In [40]:
# BERT error analysis
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

errors: 0
