In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE

In [4]:
df = pd.read_csv('scripts/all_merged.csv')

In [5]:
df = df[:40000]

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, SimpleRNNCell, Conv2D 
from tensorflow.keras.regularizers import L2, L1L2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFDistilBertModel, DistilBertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

LEMMA

In [7]:

df = df.dropna(subset=["text"])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(df["text"])  # Text to numerical features
y = df["status"]  # True/False

print(y)

0        False
1        False
2        False
3        False
4        False
         ...  
39995     True
39996     True
39997     True
39998     True
39999     True
Name: status, Length: 39519, dtype: object


In [8]:
df = df.dropna()

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [10]:
def lemma_data(s):
    try:
        doc = nlp(s)
        tokens = []
        for token in doc:
            tokens.append(token)
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        return lemmatized_sentence
    except:
        print(s)

In [28]:
df['text'] = df['text'].apply(lemma_data)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['status'], test_size=0.2, random_state=42)

In [2]:
df = pd.DataFrame()
df['text'] = X_train
df['status'] = y_train

NameError: name 'pd' is not defined

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

In [30]:
def tokenize_data(texts, tokenizer, max_length):
    return tokenizer(
        texts, 
        max_length=max_length, 
        padding='max_length', 
        truncation=True, 
        return_tensors='tf'
    )

texts = df['text'].tolist()
labels = df['status'].tolist()
max_length = 128
tokenized_data = tokenize_data(texts, tokenizer, max_length)


In [48]:
class CustomModel(tf.keras.Model):
    def __init__(self, bert_model):
        super(CustomModel, self).__init__()
        self.bert = bert_model
        self.lstm = LSTM(512, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(256, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(128, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(64, return_sequences=True, kernel_regularizer=L2(0.02))
        self.lstm = LSTM(32, return_sequences=True, kernel_regularizer=L2(0.02))
        self.dropout = Dropout(0.3)
        self.dense = Dense(1, activation='sigmoid')

    def call(self, inputs):
        bert_output = self.bert(inputs)[0]
        lstm_output = self.lstm(bert_output)
        dropout_output = self.dropout(lstm_output)
        output = self.dense(dropout_output[:, -1, :])
        return output

model = CustomModel(bert_model)


In [None]:
callback = EarlyStopping(monitor='loss',patience=15)

checkpoint_filepath = '/tmp/ckpt/checkpoint.model.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [52]:
# Create a tf.data.Dataset from the tokenized data and labels
dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tokenized_data['input_ids'], 'attention_mask': tokenized_data['attention_mask']},
    labels
)).batch(128)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [53]:
history = model.fit(
    dataset,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=[callback]
)

Epoch 1/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 358ms/step - accuracy: 0.7750 - loss: 3.2398
Epoch 2/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 365ms/step - accuracy: 0.8930 - loss: 2.4129
Epoch 3/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 355ms/step - accuracy: 0.7908 - loss: 2.1165
Epoch 4/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 342ms/step - accuracy: 0.7720 - loss: 1.7637
Epoch 5/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 345ms/step - accuracy: 0.8220 - loss: 1.3980
Epoch 6/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 346ms/step - accuracy: 0.8353 - loss: 1.1440
Epoch 7/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 345ms/step - accuracy: 0.8367 - loss: 1.0042
Epoch 8/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 361ms/step - accuracy: 0.8598 - loss: 0.8327
Epoch 9/128
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━

KeyboardInterrupt: 

In [51]:
model.summary()

In [34]:
pred_data = ["A senior BlackRock Inc executive made donation unsuccessful U.S. presidential candidate last year, action may prohibit world’s largest asset manager collecting fees state government, company regulatory filing showed. Mark Wiedman, BlackRock senior managing director, donated $2,700 presidential campaign Ohio Governor John Kasich, seeking Republican Party nomination, according filing made earlier year U.S. Securities Exchange Commission reviewed Reuters Monday. Federal securities rules prohibit companies executive officers making donations government officials could influence hiring fund manager authority appoint person could providing asset management services governments fee. The ban effect two years contribution made. The state Ohio uses BlackRock-managed funds, relationship BlackRock “substantially predates” donation, filing said. In May filing, BlackRock asked SEC permission accept fees state Ohio, saying exceptions rule granted before. Without ruling, BlackRock said could face loss approximately $37 million. BlackRock said Wiedman asked received refund campaign contribution Kasich, “made personal political beliefs contributor desire influence award investment advisory business.” The SEC yet responded BlackRock’s request. In emailed statement, BlackRock said application “fair reasonable, consistent intent relevant rule.” “The contribution question made solely support Mr. Kasich’s presidential campaign purpose,” BlackRock spokeswoman Tara McDonnell said statement. “Through BlackRock’s robust policies procedures, discovered contribution question, BlackRock Mr. Wiedman worked together promptly address error, including obtaining refund $2,700 contribution,” McDonnell said. A spokesman SEC, Ryan White, declined comment filing. A spokesman Kasich immediately respond request comment donation. Wiedman global head iShares index investments, booming business within BlackRock includes exchange-traded funds. The iShares brand brought nearly $74 billion new cash BlackRock second quarter. BlackRock manages $5.7 trillion overall."]

pred_data = ['President Donald Trump will put in place 25% tariffs on imports from Canada and Mexico and 10% tariffs on goods from China effective on Saturday, the White House said, but it provided no word on whether there would be any exemptions to the measures that could result in swift price increases to U.S. consumers.']
tokenized_pred_data = tokenize_data(pred_data, tokenizer, max_length)

In [None]:
model.predict({'input_ids': tokenized_pred_data['input_ids'], 'attention_mask': tokenized_pred_data['attention_mask']})

In [None]:
loss, accuracy = model.evaluate(dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")