In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 7.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 49.7MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 53.8MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizer
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn because you always need sklearn
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import glob
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# List the GPU devices visible to TensorFlow; an empty list means no GPU was detected.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# The weight checkpoint loaded later is read from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Define Model Structure and Load Best Weights

In [None]:
# Pretrained checkpoint on the Hugging Face hub. It is loaded through the BERT
# classes (BertTokenizer / TFBertModel / BertConfig).
model_name = 'hfl/chinese-roberta-wwm-ext-large'

# Build the config first and hand it to the model: a flag set on a config object
# that is never passed to from_pretrained() has no effect on the loaded model.
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

tokenizer = BertTokenizer.from_pretrained(model_name)
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=19.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268961.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=690.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1302594480.0, style=ProgressStyle(descr…




All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Fixed sequence length in tokens: used as the width of the Keras input layer
# and as the tokenizer's truncation limit.
max_length = 32

In [None]:
# Use the first layer of the Hugging Face model as a layer in a Keras functional graph.
roberta = transformer_model.layers[0]

# Single model input: int32 token ids of fixed width `max_length`.
input_ids = Input(shape=(max_length,), dtype='int32', name='input_ids')
inputs = {'input_ids': input_ids}

# Element [1] of the encoder output is treated here as the pooled sentence vector.
roberta_model = roberta(inputs)[1]
# training=False is passed explicitly when calling this dropout layer.
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(roberta_model, training=False)

# Classification head: 512-unit ReLU layer -> dropout(0.1) -> 3-way softmax.
fc1 = Dense(
    units=512,
    activation='relu',
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='fc1',
)(pooled_output)
fc1_output = Dropout(0.1, name='fc1_output')(fc1)
label = Dense(
    units=3,
    activation='softmax',
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='label',
)(fc1_output)

# Outputs are exposed as a dict keyed by head name.
output = {'label': label}

In [None]:
# Combine the input dict and output dict into a single Keras model named 'ROBERTA',
# then print its layer-by-layer summary.
model = Model(inputs=inputs, outputs=output, name='ROBERTA')
model.summary()

Model: "ROBERTA"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 32)]              0         
_________________________________________________________________
bert (TFBertMainLayer)       TFBaseModelOutputWithPool 325522432 
_________________________________________________________________
pooled_output (Dropout)      (None, 1024)              0         
_________________________________________________________________
fc1 (Dense)                  (None, 512)               524800    
_________________________________________________________________
fc1_output (Dropout)         (None, 512)               0         
_________________________________________________________________
label (Dense)                (None, 3)                 1539      
Total params: 326,048,771
Trainable params: 326,048,771
Non-trainable params: 0
_____________________________________________

In [None]:
# optimizer from hugging face transformers
from transformers import AdamWeightDecay

# Adam with weight decay. `learning_rate` is the supported keyword; the `lr`
# alias is deprecated in tf.keras optimizers and rejected by newer releases.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# The 'label' head already ends in a softmax activation, so the loss must treat
# its output as probabilities (from_logits=False). With from_logits=True the
# softmax would effectively be applied a second time inside the loss.
loss = {'label': CategoricalCrossentropy(from_logits=False)}
# Accuracy over one-hot targets, reported under the name 'accuracy'.
metric = {'label': CategoricalAccuracy('accuracy')}

In [None]:
# Compile the model with the optimizer, loss dict and metric dict defined in the previous cells.
model.compile(optimizer = optimizer, loss = loss, metrics = metric)

In [None]:
# Restore model weights from the checkpoint stored on the mounted Drive.
model.load_weights('/content/gdrive/MyDrive/bert.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb64fc1b610>

# Load Data and Perform Inference

In [None]:
from tqdm import tqdm

In [None]:
# Names of the three probability columns, in the order of the model's output units.
PROB_COLUMNS = ['negative_probs', 'neutral_probs', 'positive_probs']

# sorted() gives a deterministic processing order (glob order is arbitrary).
files = sorted(glob.glob("gdrive/MyDrive/RMBI_FYP/August data/*.csv"))
for f in tqdm(files):
    data = pd.read_csv(f)
    # Cast to str so missing or non-text cells cannot break the tokenizer.
    data['ct'] = data['ct'].astype('str')
    # Pad every text to exactly max_length. With padding=True the width would be
    # the longest text in this file, which may not match the model's fixed
    # (max_length,) input and makes predictions depend on the file's contents
    # (no attention mask is passed, so pad tokens are seen by the model).
    data_x = tokenizer(
        text=data['ct'].to_list(),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=False,
        verbose=True,
    )
    y_pred = model.predict(x={'input_ids': data_x['input_ids']})
    # Give the prediction frame the same index as `data` so the column
    # assignment below lines up row for row.
    y_pred_df = pd.DataFrame(y_pred['label'], columns=PROB_COLUMNS, index=data.index)
    data[PROB_COLUMNS] = y_pred_df
    # Overwrite the source CSV in place, now including the probability columns.
    data.to_csv(f, index=False)

100%|██████████| 30/30 [28:52<00:00, 57.74s/it]


# Old Code for Reference

In [None]:
# Load one CSV file so the inference steps can be walked through cell by cell.
data = pd.read_csv("gdrive/MyDrive/RMBI_FYP/August data/20160801.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,_id,au,ct,hl,st,negative_probs,neutral_probs,positive_probs
0,201608013044537,ZDSZWDJN,一萬多億的盤子，二萬多億的公司資產\n,一萬多億的盤子，二萬多億的公司資產,600028.sh,0.160733,0.144969,0.694298
1,201608013044636,江西贛州網友,"京東方三大股東承諾半年不減持在7月28號到期，這些王八蛋7月28和29號連著倆天出貨,成本2...",告全體散戶朋友書,000725.sz,0.573081,0.089199,0.33772
2,201608013044693,美的克星,上央視，跌不止，坑散戶，真無恥\n,上央視，跌不止，坑散戶，真無恥,000725.sz,0.82699,0.07992,0.09309
3,201608013044698,竹凡語北燦,暴跌後怎麼辦？重大盤不如重個股，數股異軍突起或成明日之星，20點收益等待入場信號，可跟進之前\n,暴跌後怎麼辦？重大盤不如重個股，數股異軍突起或成明日之星，20點收益等待入場信號,600109.sh,0.15402,0.044327,0.801653
4,201608013044734,股友ssMPnT,每個股吧都有京東方打的廣告！\n,每個股吧都有京東方打的廣告！,000725.sz,0.145117,0.299188,0.555695


# Inference

In [None]:
# Tokenize the 'ct' text column into a batch of token ids.
# - astype('str') mirrors the batch loop so missing values cannot break tokenization.
# - padding='max_length' pads every row to exactly max_length, matching the
#   model's fixed input width instead of the longest text in this file.
data_x = tokenizer(
    text=data['ct'].astype('str').to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True,
)

In [None]:
# Run the model on the token ids; the result is a dict keyed by output name ('label').
y_pred = model.predict(x={'input_ids': data_x['input_ids']})

In [None]:
# Inspect the raw prediction dict.
y_pred

{'label': array([[0.24816762, 0.6553322 , 0.09650017],
        [0.09991831, 0.04282899, 0.8572527 ],
        [0.0935036 , 0.04463367, 0.8618627 ],
        ...,
        [0.05695517, 0.83923477, 0.10381005],
        [0.04714427, 0.859097  , 0.09375875],
        [0.11785687, 0.04806343, 0.8340797 ]], dtype=float32)}

In [None]:
# Inspect the array stored under the 'label' key (three values per row).
y_pred['label']

array([[0.24816762, 0.6553322 , 0.09650017],
       [0.09991831, 0.04282899, 0.8572527 ],
       [0.0935036 , 0.04463367, 0.8618627 ],
       ...,
       [0.05695517, 0.83923477, 0.10381005],
       [0.04714427, 0.859097  , 0.09375875],
       [0.11785687, 0.04806343, 0.8340797 ]], dtype=float32)

In [None]:
# Wrap the prediction array in a DataFrame with one named column per class.
# NOTE(review): 'positive_prob' here differs from 'positive_probs' used in the
# batch loop earlier in this notebook — confirm which name downstream code expects.
y_pred_df = pd.DataFrame(y_pred['label'],columns=['negative_probs','neutral_probs','positive_prob'])

In [None]:
# Preview the first rows of the probability frame.
y_pred_df.head()

Unnamed: 0,negative_probs,neutral_probs,positive_prob
0,0.248168,0.655332,0.0965
1,0.099918,0.042829,0.857253
2,0.093504,0.044634,0.861863
3,0.099909,0.770508,0.129583
4,0.095239,0.047087,0.857675


In [None]:
# Column index of the largest value in each row (0, 1 or 2).
y_pred_label = np.argmax(y_pred['label'], axis=1)

In [None]:
# Shift the class index from {0, 1, 2} to {-1, 0, 1}.
# Recomputed from y_pred rather than from y_pred_label itself, so re-running
# this cell does not subtract 1 a second time.
y_pred_label = np.argmax(y_pred['label'], axis=1) - 1

In [None]:
# Store the predicted class label as a new 'label' column next to the probabilities.
y_pred_df['label'] = y_pred_label

In [None]:
# Preview the probability frame again, now including the 'label' column.
y_pred_df.head()

Unnamed: 0,negative_probs,neutral_probs,positive_prob,label
0,0.248168,0.655332,0.0965,0
1,0.099918,0.042829,0.857253,1
2,0.093504,0.044634,0.861863,1
3,0.099909,0.770508,0.129583,0
4,0.095239,0.047087,0.857675,1


In [None]:
# Copy the three probability columns and the label from y_pred_df onto the input frame.
data[['negative_probs', 'neutral_probs', 'positive_prob', 'label']] = y_pred_df

In [None]:
# Preview the input frame with the prediction columns attached.
data.head()

Unnamed: 0,_id,au,ct,hl,st,negative_probs,neutral_probs,positive_prob,label
0,201608253079285,一汽領導,一汽集團為何就不知羞恥？\n,一汽集團為何就不知羞恥？,000800.sz,0.248168,0.655332,0.0965,0
1,201608253079286,春姑2009,應該是長期利好\n,應該是長期利好,600449.sh,0.099918,0.042829,0.857253,1
2,201608253079287,美好儲抱,體育產業多重利好發酵氣爽 布局奧運會概念 氣爽明天建倉，下周收網！\n,體育產業多重利好發酵氣爽 布局奧運會概念 氣爽明天建倉，下周收網！,600343.sh,0.093504,0.044634,0.861863,1
3,201608253079288,廣東深圳手機網友,災後材料\n,災後材料,002398.sz,0.099909,0.770508,0.129583,0
4,201608253079289,怎知魚之樂123,滬指神龍擺尾，驚天動作翹首以盼方法！八月必備板塊，等待起飛！\n,滬指神龍擺尾，驚天動作翹首以盼方法！八月必備板塊，等待起飛！,300143.sz,0.095239,0.047087,0.857675,1


In [None]:
# Save the annotated frame under a relative filename (resolved against the working directory).
# NOTE(review): unlike the batch loop's to_csv call, index=False is not passed here,
# so the row index is written as an extra first column — confirm this is intended.
data.to_csv("20160825_robertsentiment.csv")