In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# read by classify() is loaded from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Install the third-party packages this notebook imports.
# %pip (rather than !pip) installs into the environment of the running kernel.
# "imbalanced-learn" is the distribution that provides the `imblearn` module.
%pip install -q transformers imbalanced-learn



In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import torch
import transformers as ppb




In [6]:
# Prefer the GPU when one is available; fall back to the CPU so the notebook
# still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
class SentimentScores(object):
    """Running tally of three-way sentiment predictions against ground truth.

    Predictions and labels are compared against 1 (positive), -1 (negative)
    and 0 (neutral).  Every pair passed to ``update`` is also kept so that a
    Spearman rank correlation can be reported by ``output_results``.

    Note that the "false" counters are keyed by the *predicted* class:
    ``false_negatives`` counts predictions of -1 whose ground truth was not
    -1, and likewise for ``false_positives`` and ``false_neutrals``.
    """

    def __init__(self):
        self.true_positives = 0
        self.true_negatives = 0
        self.true_neutrals = 0
        self.false_positives = 0
        self.false_negatives = 0
        self.false_neutrals = 0
        self.total = 0
        self.predicted = []
        self.ground_truth = []

    def update(self, pred, gt):
        """Record one prediction / ground-truth pair.

        Args:
            pred: Predicted label.  1 is counted as positive, -1 as negative
                and any other value as neutral.
            gt: Ground-truth label for the same sample.
        """
        self.ground_truth.append(gt)
        self.predicted.append(pred)
        if pred == 1:
            if gt == 1:
                self.true_positives += 1
            else:
                self.false_positives += 1
        elif pred == -1:
            if gt == -1:
                self.true_negatives += 1
            else:
                self.false_negatives += 1
        else:
            # Anything that is neither 1 nor -1 is treated as a neutral prediction.
            if gt == 0:
                self.true_neutrals += 1
            else:
                self.false_neutrals += 1
        self.total += 1

    def output_results(self):
        """Print the Spearman correlation and each counter as a percentage.

        Percentages are relative to the total number of recorded pairs.  When
        nothing has been recorded yet a short notice is printed instead, so
        the method never divides by zero.
        """
        if self.total == 0:
            print('No predictions recorded.')
            return

        gt_arr = np.array(self.ground_truth)
        pred_arr = np.array(self.predicted)
        r, _ = spearmanr(gt_arr, pred_arr)
        print(f'r {r:5.2f}')

        counters = (
            ('True Positives', self.true_positives),
            ('True Negatives', self.true_negatives),
            ('True Neutral', self.true_neutrals),
            ('False Positives', self.false_positives),
            ('False Negatives', self.false_negatives),
            ('False Neutrals', self.false_neutrals),
        )
        for label, count in counters:
            pct = count * 100 / self.total
            print(f'{label}: {pct:2.2f}%')



In [8]:
# Environment diagnostics: report the GPU that TensorFlow sees, every local
# compute device, and the host's memory statistics.
import tensorflow as tf
# Name of the GPU device as reported by TensorFlow.
print(f'GPU device name: {tf.test.gpu_device_name()}')
from tensorflow.python.client import device_lib
# Full description of each local device (CPU and GPU entries).
print(f'Local Devices:  {device_lib.list_local_devices()}')
# Shell escape: dump the kernel's memory counters for the host machine.
!cat /proc/meminfo

GPU device name: /device:GPU:0
Local Devices:  [name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17801544388087939043
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14563024576
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12726769857024040650
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
]
MemTotal:       26751732 kB
MemFree:        19523116 kB
MemAvailable:   22781756 kB
Buffers:           45364 kB
Cached:          2622460 kB
SwapCached:            0 kB
Active:          3584756 kB
Inactive:        3209636 kB
Active(anon):    3251508 kB
Inactive(anon):    10652 kB
Active(file):     333248 kB
Inactive(file):  3198984 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:               828 kB
Writeback:             0 kB
AnonPages:       4126596 kB
Mapped:           835080 kB
Shmem:             1132

In [9]:

def classify(data_path="/content/drive/MyDrive/Colab Notebooks/bert_tweets.tsv",
             n_rows=10000, batch_size=128, random_state=None, device=None):
    """Train and evaluate a sentiment classifier on DistilBERT features.

    Reads a headerless tab-separated file whose column 0 holds the text and
    column 1 the label, shuffles it and keeps the first ``n_rows`` rows.  Each
    text is embedded with the pretrained ``distilbert-base-uncased`` model
    (the last hidden state at the first token position is used as the feature
    vector).  The training split is balanced with SMOTE, a logistic regression
    is fitted, and the predictions for the whole test split are reported
    through ``SentimentScores``.

    Args:
        data_path: Location of the TSV file to load.
        n_rows: Number of shuffled rows to keep.
        batch_size: Number of texts sent through DistilBERT per forward pass.
            Batching bounds peak memory; it does not change which features
            are computed.
        random_state: Seed forwarded to the shuffle, the train/test split and
            SMOTE.  ``None`` keeps the run unseeded.
        device: Optional torch device (or device string) to run DistilBERT
            on, e.g. ``torch.device("cuda")``.  ``None`` runs on the CPU.

    Returns:
        None.  Results are printed.
    """
    df = pd.read_csv(data_path, delimiter='\t', header=None)
    # Drops any columns whose label is greater than 400.
    df = df.truncate(axis=1, after=400)
    df = df.sample(frac=1, random_state=random_state)  # shuffle the rows
    df = df.head(n_rows)

    # Load pretrained DistilBERT model/tokenizer
    model_class, tokenizer_class, pretrained_weights = (
        ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    tokenized = df[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

    # Right-pad every sequence with id 0 up to the longest sequence and mask
    # the padding out so that it does not take part in attention.
    max_len = max(map(len, tokenized.values), default=0)
    padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    run_device = torch.device("cpu") if device is None else torch.device(device)
    model = model.to(run_device)
    model.eval()

    # Embed in mini-batches instead of one forward pass over the whole dataset.
    feature_chunks = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            outputs = model(input_ids[start:stop].to(run_device),
                            attention_mask=attention_mask[start:stop].to(run_device))
            # outputs[0] is the last hidden state; keep the first token's vector.
            feature_chunks.append(outputs[0][:, 0, :].cpu().numpy())
    features = np.concatenate(feature_chunks, axis=0)
    labels = df[1]

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=random_state)

    # Upsample for equal class sizes in training data
    sm = SMOTE(random_state=random_state)
    train_features, train_labels = sm.fit_resample(train_features, train_labels)

    lr_clf = LogisticRegression()
    lr_clf.fit(train_features, train_labels)

    # Score every test sample (the last one must not be skipped) with a single
    # vectorised predict call.
    predictions = lr_clf.predict(test_features)
    scores = SentimentScores()
    for pred, gt in zip(predictions, test_labels.to_list()):
        scores.update(pred, gt)
    scores.output_results()








In [None]:
# Run the end-to-end pipeline: load the tweets, embed them, train the
# classifier and print the evaluation scores.
classify()