In [103]:
import pandas as pd
from imblearn.over_sampling import SMOTE

In [104]:
df = pd.read_csv('40ktf.csv')

In [105]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, SimpleRNNCell, Conv2D 
from tensorflow.keras.regularizers import L2, L1L2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFDistilBertModel, DistilBertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

LEMMA

In [106]:

df = df.dropna(subset=["text"])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(df["text"])  # Text to numerical features
y = df["status"]  # True/False

print(y)

0        0
1        0
2        0
3        0
4        0
        ..
39514    1
39515    1
39516    1
39517    1
39518    1
Name: status, Length: 39519, dtype: int64


In [107]:
df = df.dropna()

In [108]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [109]:
def lemma_data(s):
    try:
        doc = nlp(s)
        tokens = []
        for token in doc:
            tokens.append(token)
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        return lemmatized_sentence
    except:
        print(s)

In [110]:
def change_true_false(s):
    if s == 'True':
        return 1
    else:
        return 0

In [111]:
df['status'] = df['status'].apply(change_true_false)

In [112]:
# df['text'] = df['text'].apply(lemma_data)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['status'], test_size=0.2, random_state=42)

In [114]:
df = pd.DataFrame()
df['text'] = X_train
df['status'] = y_train

In [115]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [116]:
def tokenize_data(texts, tokenizer, max_length):
    return tokenizer(
        texts, 
        max_length=max_length, 
        padding='max_length', 
        truncation=True, 
        return_tensors='tf'
    )

texts = df['text'].tolist()
labels = df['status'].tolist()
max_length = 128
tokenized_data = tokenize_data(texts, tokenizer, max_length)


In [117]:
class CustomModel(tf.keras.Model):
    def __init__(self, bert_model):
        super(CustomModel, self).__init__()
        self.bert = bert_model
        self.lstm = LSTM(512, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(256, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(128, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(64, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(32, return_sequences=True, kernel_regularizer=L2(0.02))
        self.dropout = Dropout(0.3)
        self.dense = Dense(1, activation='sigmoid')

    def call(self, inputs):
        bert_output = self.bert(inputs)[0]
        lstm_output = self.lstm(bert_output)
        dropout_output = self.dropout(lstm_output)
        output = self.dense(dropout_output[:, -1, :])
        return output

model = CustomModel(bert_model)


In [118]:
callback = EarlyStopping(monitor='loss',patience=15)

checkpoint_filepath = '/tmp/ckpt/checkpoint.model.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [119]:
# Create a tf.data.Dataset from the tokenized data and labels
dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_data['input_ids'], 'attention_mask': tokenized_data['attention_mask']},
    labels
)).batch(128)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [98]:
history = model.fit(
    dataset,
    epochs=50,
    callbacks=[callback]
)

Epoch 1/50




In [99]:
model.summary()

In [100]:
pred_data = ["A senior BlackRock Inc executive made donation unsuccessful U.S. presidential candidate last year, action may prohibit world’s largest asset manager collecting fees state government, company regulatory filing showed. Mark Wiedman, BlackRock senior managing director, donated $2,700 presidential campaign Ohio Governor John Kasich, seeking Republican Party nomination, according filing made earlier year U.S. Securities Exchange Commission reviewed Reuters Monday. Federal securities rules prohibit companies executive officers making donations government officials could influence hiring fund manager authority appoint person could providing asset management services governments fee. The ban effect two years contribution made. The state Ohio uses BlackRock-managed funds, relationship BlackRock “substantially predates” donation, filing said. In May filing, BlackRock asked SEC permission accept fees state Ohio, saying exceptions rule granted before. Without ruling, BlackRock said could face loss approximately $37 million. BlackRock said Wiedman asked received refund campaign contribution Kasich, “made personal political beliefs contributor desire influence award investment advisory business.” The SEC yet responded BlackRock’s request. In emailed statement, BlackRock said application “fair reasonable, consistent intent relevant rule.” “The contribution question made solely support Mr. Kasich’s presidential campaign purpose,” BlackRock spokeswoman Tara McDonnell said statement. “Through BlackRock’s robust policies procedures, discovered contribution question, BlackRock Mr. Wiedman worked together promptly address error, including obtaining refund $2,700 contribution,” McDonnell said. A spokesman SEC, Ryan White, declined comment filing. A spokesman Kasich immediately respond request comment donation. Wiedman global head iShares index investments, booming business within BlackRock includes exchange-traded funds. The iShares brand brought nearly $74 billion new cash BlackRock second quarter. BlackRock manages $5.7 trillion overall."]

pred_data = ['President Donald Trump will put in place 25% tariffs on imports from Canada and Mexico and 10% tariffs on goods from China effective on Saturday, the White House said, but it provided no word on whether there would be any exemptions to the measures that could result in swift price increases to U.S. consumers.']
tokenized_pred_data = tokenize_data(pred_data, tokenizer, max_length)

In [None]:
model.predict({'input_ids': tokenized_pred_data['input_ids'], 'attention_mask': tokenized_pred_data['attention_mask']})

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 811ms/step


array([[0.00154012]], dtype=float32)

In [None]:
loss, accuracy = model.evaluate(dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 240ms/step - accuracy: 1.0000 - loss: 0.0083
Loss: 0.008289073593914509, Accuracy: 1.0
