In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

2025-02-18 00:27:09.458196: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-18 00:27:09.467259: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739818629.477677    8463 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739818629.480703    8463 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-18 00:27:09.491506: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import os

In [3]:
data_path=os.path.join("..","Data","lstm_model_data_v1.csv")

In [4]:
data=pd.read_csv(data_path)

In [5]:
data.dropna(inplace=True)

In [6]:
data

Unnamed: 0,processed_text_upto_300_words,label
0,students faculty georgetown law school gathere...,1
1,beirut parliament group lebanon hezbollah said...,1
2,kabul us ambassador afghanistan said monday wo...,1
3,berlin chancellor angela merkel friday said ge...,1
4,easy blame white man way accountable actions l...,0
...,...,...
44892,donald trump literally used national prayer br...,0
44893,time drag donald trump office act soon constit...,0
44895,donald trump misogynistic behavior towards wom...,0
44896,president donald trump declined friday explain...,1


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44196 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   processed_text_upto_300_words  44196 non-null  object
 1   label                          44196 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [8]:
data_X=data["processed_text_upto_300_words"]
data_Y=data["label"]

## Tokenize Text

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
tokenizer=Tokenizer()

In [11]:
tokenizer.fit_on_texts(data_X)

In [12]:
tokenized_text=tokenizer.texts_to_sequences(data_X)

In [13]:
vocab_length=len(tokenizer.word_index)

In [14]:
vocab_length

108786

In [15]:
word_index=tokenizer.word_index

In [16]:
word_index

{'trump': 1,
 'said': 2,
 'us': 3,
 'president': 4,
 'would': 5,
 'people': 6,
 'donald': 7,
 'one': 8,
 'state': 9,
 'new': 10,
 'house': 11,
 'also': 12,
 'obama': 13,
 'republican': 14,
 'clinton': 15,
 'government': 16,
 'states': 17,
 'white': 18,
 'told': 19,
 'united': 20,
 'like': 21,
 'last': 22,
 'could': 23,
 'news': 24,
 'campaign': 25,
 'two': 26,
 'election': 27,
 'time': 28,
 'party': 29,
 'hillary': 30,
 'first': 31,
 'even': 32,
 'year': 33,
 'country': 34,
 'former': 35,
 'presidential': 36,
 'via': 37,
 'get': 38,
 'police': 39,
 'made': 40,
 'years': 41,
 'going': 42,
 'say': 43,
 'court': 44,
 'make': 45,
 'many': 46,
 'may': 47,
 'national': 48,
 'security': 49,
 'media': 50,
 'since': 51,
 'law': 52,
 'republicans': 53,
 'political': 54,
 'week': 55,
 'know': 56,
 'bill': 57,
 'american': 58,
 'image': 59,
 'back': 60,
 'america': 61,
 'statement': 62,
 'percent': 63,
 'vote': 64,
 'democratic': 65,
 'senate': 66,
 'wednesday': 67,
 'tuesday': 68,
 'support': 69,

In [17]:
import json
with open("word_index.json","w") as file:
    json.dump(word_index,file)

In [18]:
tokenizer_json=tokenizer.to_json()

In [19]:
with open("../Models/tokenizer.json","w") as file:
    json.dump(tokenizer_json,file)

## Pad Sequences

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
padded_text=pad_sequences(tokenized_text,maxlen=300,truncating="post",padding="post")

In [22]:
padded_text.shape

(44196, 300)

## Embedding matrix

In [23]:
embedding_matrix=np.zeros((vocab_length+1,100))

In [24]:
embedding_path=os.path.join("..","Data","glove.6B.100d.txt")

In [25]:
embed_dict={}
with open(embedding_path,'r',encoding='utf-8') as file:
    for line in file:
        vector=line.split()
        word=vector[0]
        embed=np.asarray(vector[1:])
        embed_dict[word]=embed

for word,index in word_index.items():
    if word in embed_dict.keys():
        embedding_matrix[index]=embed_dict[word]
        

In [26]:
embedding_matrix.shape

(108787, 100)

### Important Note: There will be 6 Lstm models and each having 60 words per instance.

## Model Building

In [27]:
from tensorflow.keras.layers import Input


In [28]:
from multi_lstm_layer import MultiLSTMLayer

In [29]:
basic_text_processing_layer_config={
    "lstm_layer_config":[
        {
            "units":8,
            "activation":"relu",
            "return_sequences":True,
            "bidirection":True
        },
        {
            "units":32,
            "activation":"relu",
            "return_sequences":True,
            "bidirection":False
        },
        {
            "units":64,
            "activation":"relu",
            "return_sequences":False,
            "bidirection":False
        }
        
    ],
    "dense_layer_config":[
        {
           "units":128,
           "activation":"relu",
           "kernel_initializer":"he_uniform"
        },
        {
           "units":256,
           "activation":"relu",
           "kernel_initializer":"he_uniform"
        }
        
    ]
}

In [30]:
inp=Input(shape=(300,),dtype=tf.float32)

In [31]:
x=MultiLSTMLayer(
    basic_text_processing_layer_config=basic_text_processing_layer_config,
    embedding_matrix=embedding_matrix,
    num_models=6,
    vocab_size=vocab_length+1,
    embedding_output_dim=100
)(inp)

2025-02-18 00:27:21.452004: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [32]:
from tensorflow.keras.models import Model

In [33]:
model=Model(inputs=inp,outputs=x)

In [34]:
model.summary()

## Compile Model

In [35]:
from tensorflow.keras.optimizers import Adam

In [36]:
from tensorflow.keras.losses import BinaryCrossentropy

In [37]:
from tensorflow.keras.metrics import Accuracy

In [38]:
optimizer=Adam(
    learning_rate=0.001,
    beta_1=0.96,
    beta_2=0.98
)

In [39]:
# loss_fn=BinaryCrossentropy()

In [40]:
# metric=Accuracy()

In [41]:
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

## Model Training

In [42]:
X=padded_text
X.shape

(44196, 300)

In [43]:
Y=np.array(data_Y)
Y.shape

(44196,)

In [44]:
batch_size=100
epochs=2

In [45]:
history=model.fit(X,Y,batch_size=batch_size,epochs=epochs)

Epoch 1/2
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 255ms/step - accuracy: 0.8221 - loss: 0.5318
Epoch 2/2
[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 260ms/step - accuracy: 0.9448 - loss: 2.2025


In [46]:
import os

In [47]:
model_weights_path=os.path.join("..","Models","lstm_model_weights.weights.h5")
model.save_weights(model_weights_path)

In [48]:
# lstm_model_path=os.path.join("..","Models","lstm_model.keras");

In [49]:
# model.save(lstm_model_path)

In [50]:
# for layer in model.layers:
#     layer.trainable=True

In [51]:
# for epoch in range(epochs):
#     print(f"Epoch:{epoch+1} started..")
    
#     dataset=tf.data.Dataset.from_tensor_slices((X,Y))
#     dataset=dataset.shuffle(buffer_size=1024).batch(batch_size)
    
    
#     for step,(data_X,data_Y) in enumerate(dataset):
#         with tf.GradientTape() as tape:
#             predictions=model(data_X,training=True)
#             loss=loss_fn(data_Y,predictions)
            
#         # print(f"Pred:{predictions}")
#         # print(f"Loss:{loss}")
        
#         gradients=tape.gradient(loss,model.trainable_variables)
#         optimizer.apply_gradients(zip(gradients,model.trainable_variables))
        
#         metric.update_state(data_Y, tf.argmax(predictions, axis=-1))
        
#         if step%20==0:
#             print(f"Epoch:{epoch+1} Step:{step+1} Accuracy:{metric.result().numpy()} Loss:{loss.numpy()}")
            
#     print(f"Epoch:{epoch+1} completed...")
#     print(f"Epoch:{epoch+1}  Accuracy:{metric.result().numpy()}")
        