<a href="https://colab.research.google.com/github/sriramreddy-7/RUMOUR-DETECTION/blob/main/RD2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import os
import numpy as np
import pandas as pd
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array, load_img

In [5]:
def load_json_data(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return pd.DataFrame(data)

In [6]:
json_path="/content/drive/MyDrive/[07] Datasets/ML (1)/pheme/data.json"

In [7]:
pheme_df = load_json_data(json_path)

In [16]:
print(pheme_df.columns)

Index(['id', 'text', 'image', 'label', 'nodes', 'edges'], dtype='object')


In [20]:
image_column = 'image'

In [18]:
images_folder = '/content/drive/MyDrive/[07] Datasets/ML (1)/pheme/images'
existing_images = set(os.listdir(images_folder))

In [34]:
pheme_df['image_exists'] = pheme_df[image_column].apply(lambda x: x in existing_images)
filtered_pheme_df = pheme_df[pheme_df['image_exists']]

In [35]:
def load_images_from_folder(folder_path, img_height, img_width, filenames):
    images = []
    for filename in filenames:
        img_path = os.path.join(folder_path, filename)
        img = load_img(img_path, target_size=(img_height, img_width))
        img_array = img_to_array(img)
        images.append(img_array)
    return np.array(images)

In [36]:
IMG_HEIGHT = 224
IMG_WIDTH = 224
image_filenames = filtered_pheme_df[image_column].tolist()

In [37]:
image_data = load_images_from_folder(images_folder, IMG_HEIGHT, IMG_WIDTH, image_filenames)

# Update DataFrame to only include rows with loaded images
filtered_pheme_df = filtered_pheme_df.reset_index(drop=True)

In [38]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(filtered_pheme_df['text'])

In [39]:
sequences = tokenizer.texts_to_sequences(filtered_pheme_df['text'])
word_index = tokenizer.word_index

In [40]:
text_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [41]:
labels = filtered_pheme_df['label'].values

In [43]:
print(f'text_data length: {len(text_data)}, image_data length: {len(image_data)}, labels length: {len(labels)}')

text_data length: 920, image_data length: 920, labels length: 920


In [52]:
labels = filtered_pheme_df['label'].astype(int).values

In [53]:
text_data = np.array(text_data, dtype=np.float32)
image_data = np.array(image_data, dtype=np.float32)

In [55]:
from sklearn.model_selection import train_test_split

In [54]:
text_train, text_test, img_train, img_test, y_train, y_test = train_test_split(
    text_data, image_data, labels, test_size=0.2, stratify=labels)

In [56]:
print(f"text_train dtype: {text_train.dtype}, img_train dtype: {img_train.dtype}, y_train dtype: {y_train.dtype}")
print(f"text_test dtype: {text_test.dtype}, img_test dtype: {img_test.dtype}, y_test dtype: {y_test.dtype}")

text_train dtype: float32, img_train dtype: float32, y_train dtype: int64
text_test dtype: float32, img_test dtype: float32, y_test dtype: int64


In [57]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Conv2D, MaxPooling2D, Flatten, concatenate

In [58]:
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_text = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(text_input)
lstm_text = LSTM(128)(embedded_text)

In [59]:
image_input = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3), dtype='float32')
conv1 = Conv2D(32, (3, 3), activation='relu')(image_input)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(64, (3, 3), activation='relu')(pool1)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
flatten_img = Flatten()(pool2)

In [60]:
concatenated = concatenate([lstm_text, flatten_img])

In [61]:
output = Dense(1, activation='sigmoid')(concatenated)

# Compile the model
model = Model(inputs=[text_input, image_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
history = model.fit([text_train, img_train], y_train,
                    epochs=10, batch_size=32, validation_data=([text_test, img_test], y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
loss, accuracy = model.evaluate([text_test, img_test], y_test)
print(f'Test accuracy: {accuracy}')

Test accuracy: 0.6739130616188049


In [64]:
y_pred = (model.predict([text_test, img_test]) > 0.5).astype(int)

# Import classification report and accuracy score
from sklearn.metrics import classification_report, accuracy_score

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print accuracy score
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.86      0.78       125
           1       0.48      0.27      0.35        59

    accuracy                           0.67       184
   macro avg       0.60      0.57      0.57       184
weighted avg       0.64      0.67      0.64       184

Accuracy Score: 0.6739130434782609
