In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 3.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 17.9MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 19.1MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |███

In [0]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/JULIELab/EmoBank/master/corpus/emobank.csv')

In [4]:
df.head()

Unnamed: 0,id,split,V,A,D,text
0,110CYL068_1036_1079,train,3.0,3.0,3.2,"Remember what she said in my last letter? """
1,110CYL068_1079_1110,test,2.8,3.1,2.8,If I wasn't working here.
2,110CYL068_1127_1130,train,3.0,3.0,3.0,".."""
3,110CYL068_1137_1188,train,3.44,3.0,3.22,Goodwill helps people get off of public assist...
4,110CYL068_1189_1328,train,3.55,3.27,3.46,Sherry learned through our Future Works class ...


In [5]:
df.describe()

Unnamed: 0,V,A,D
count,10062.0,10062.0,10062.0
mean,2.97669,3.041785,3.062763
std,0.348715,0.259541,0.209675
min,1.2,1.8,1.78
25%,2.8,2.89,3.0
50%,3.0,3.0,3.09
75%,3.12,3.2,3.2
max,4.6,4.4,4.2


In [0]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
tokenized = df["text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [8]:
tokenized.head()

0    [101, 3342, 2054, 2016, 2056, 1999, 2026, 2197...
1    [101, 2065, 1045, 2347, 1005, 1056, 2551, 2182...
2                         [101, 1012, 1012, 1000, 102]
3    [101, 22875, 7126, 2111, 2131, 2125, 1997, 227...
4    [101, 22268, 4342, 2083, 2256, 2925, 2573, 246...
Name: text, dtype: object

In [9]:
# maxiumum length of the sentence, for padding
tokenized_max_len = max([len(x) for x in tokenized])
print(tokenized_max_len)

153


In [0]:
padded = np.array([i + [0]*(tokenized_max_len-len(i)) for i in tokenized.values])

In [11]:
np.array(padded).shape

(10062, 153)

In [0]:
# Following the tutorial - https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
# And - https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=4K_iGRNa_Ozc 

From the tutorial - 

If we directly send padded to BERT, that would slightly confuse it. We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what attention_mask is:

In [13]:
attention_mask = np.where((padded != 0), 1, 0)
attention_mask.shape

(10062, 153)

In [32]:
bert_states = list()

before = -1
for r in range(100, 10001, 100):
  input_ids = torch.tensor(padded[before+1:r])  
  attention_mask_updated = torch.tensor(attention_mask[before+1:r])

  with torch.no_grad():
      last_hidden_states = model(input_ids, attention_mask=attention_mask_updated)
      bert_states.append(last_hidden_states)
      print("Done: {}".format(r))
  before = r

Done: 100
Done: 200
Done: 300
Done: 400
Done: 500
Done: 600
Done: 700
Done: 800
Done: 900
Done: 1000
Done: 1100
Done: 1200
Done: 1300
Done: 1400
Done: 1500
Done: 1600
Done: 1700
Done: 1800
Done: 1900
Done: 2000
Done: 2100
Done: 2200
Done: 2300
Done: 2400
Done: 2500
Done: 2600
Done: 2700
Done: 2800
Done: 2900
Done: 3000
Done: 3100
Done: 3200
Done: 3300
Done: 3400
Done: 3500
Done: 3600
Done: 3700
Done: 3800
Done: 3900
Done: 4000
Done: 4100
Done: 4200
Done: 4300
Done: 4400
Done: 4500
Done: 4600
Done: 4700
Done: 4800
Done: 4900
Done: 5000
Done: 5100
Done: 5200
Done: 5300
Done: 5400
Done: 5500
Done: 5600
Done: 5700
Done: 5800
Done: 5900
Done: 6000
Done: 6100
Done: 6200
Done: 6300
Done: 6400
Done: 6500
Done: 6600
Done: 6700
Done: 6800
Done: 6900
Done: 7000
Done: 7100
Done: 7200
Done: 7300
Done: 7400
Done: 7500
Done: 7600
Done: 7700
Done: 7800
Done: 7900
Done: 8000
Done: 8100
Done: 8200
Done: 8300
Done: 8400
Done: 8500
Done: 8600
Done: 8700
Done: 8800
Done: 8900
Done: 9000
Done: 9100
Done: 92

In [4]:
features = [i[0][:,0,:].numpy() for i in bert_states]

NameError: ignored

In [10]:
import pickle
with open('bert_states.pkl', 'rb') as f:
  features = pickle.load(f)
all_features = list()
for i in range(100):
  for j in range(100):
    if i > 0 and j==99:
      continue
    else:
      all_features.append(features[i][j])
print(len(all_features))
features = all_features

9901


In [5]:
!ls -l --block-size=MB

total 31MB
-rw-r--r-- 1 root root 31MB Jun 11 17:02 bert_states.pkl
drwxr-xr-x 1 root root  1MB May 29 18:19 sample_data


In [0]:
from google.colab import files
files.download("bert_states.pkl")

In [0]:
labels = df["D"]

In [0]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features[:9901], labels[:9901], test_size=0.1)

In [38]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from math import sqrt

svr_reg = make_pipeline(StandardScaler(), SVR(C=0.8, epsilon=0.2))
svr_reg.fit(train_features, train_labels)

train_score = svr_reg.score(train_features, train_labels)
val_score = svr_reg.score(test_features, test_labels)

svr_y_pred = svr_reg.predict(train_features)
train_rmse = sqrt(metrics.mean_squared_error(train_labels, svr_y_pred))

svr_y_pred = svr_reg.predict(test_features)
val_rmse = sqrt(metrics.mean_squared_error(test_labels, svr_y_pred))

print(train_score, val_score, train_rmse, val_rmse)

0.41528398997594973 -0.08223200270225028 0.15972418604161687 0.22482064428005527


In [37]:
print(len(test_features), len(train_features))

991 8910


In [40]:
from scipy import stats
svr_y_pred = svr_reg.predict(train_features)
stats.pearsonr(train_labels, svr_y_pred)

(0.6850570491945207, 0.0)