In [24]:
%logstart -o -t my_notebook.log

# Requirements and libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Concatenate

print("Done!")

Activating auto-logging. Current session state plus future input saved.
Filename       : my_notebook.log
Mode           : backup
Output logging : True
Raw input log  : False
Timestamping   : True
State          : active
Done!


In [3]:
# Download Requirements and libraries
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# # Read data source and show the head
# df1 = pd.read_table('./all_comments.tsv/all_comments.tsv',)
# df1.dataframeName = 'Fake News'
# nRow, nCol = df1.shape
# print(f'There are {nRow} rows and {nCol} columns')
# df1.head()

In [None]:

# # Drop unused column of data and slice
# df2=df1.drop(['Unnamed: 0','author','parent_id','submission_id', 'ups'],axis=1)
# df3 = df2[:100]
# print("Done!")

In [6]:
# Create a DataFrame with the predicted labels
# predicted_df = pd.DataFrame({'predicted_label': predicted_labels})
# Export the DataFrame to Excel
# df3.to_csv('slice_100.csv')
df3 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/slice_100.csv')
df3.head()

Unnamed: 0.1,Unnamed: 0,id,body,isTopLevel
0,0,f4deplg,"Scroll, scroll, scroll. Pause. Scroll back u...",True
1,1,f4d79bi,A lot of the people who felt quite strongly ab...,True
2,2,f4ddmlk,T H E S P H E R E S H A L L R I S E A ...,True
3,3,f4dknfn,All hail the cube of justice,True
4,4,f4dgdur,That is glorious.,True


In [8]:

# Extract necessary column of data and convert into list
id = df3['id'].tolist()
body = df3['body'].tolist()
isTopLevel = df3['isTopLevel'].tolist()
print("Done!")

Done!


In [9]:


# Define the preprocessing functions
def preprocess_text(text):
    # Remove stopwords, punctuations, numbers, and multiple spaces
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return filtered_tokens

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmatized_tokens

def get_wordnet_pos(token):
    # Map POS tag to first character lemmatize() accepts
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


print("Done!")

Done!


In [10]:
# Sample Fakeddit dataset
data = {
    'id': id,
    'body': body,
    'isTopLevel': isTopLevel
}

#Convert the dictionary to pandas Dataframe
df = pd.DataFrame(data)

# Preprocess the 'body' column
df['body'] = df['body'].fillna("")
df['body'] = df['body'].apply(preprocess_text)
df['body'] = df['body'].apply(lemmatize_text)

# Building the vocabulary and mapping words to integer numbers
word_to_int = defaultdict(lambda: len(word_to_int) + 1)  # Assign unique integer to each word

# Transforming each text into a sequence of integers
df['body'] = df['body'].apply(lambda tokens: [word_to_int[token] for token in tokens])
print("Done")

Done


In [11]:
#Extract the body and padding the sequence in an uniform length
max_length = 15
df_body = []
for i in df["body"]:
    df_body.append(i[:max_length])


for item in df_body:
    if len(item) < max_length:
        for i in range(max_length-len(item)):
            item.append(0)

#Split the train data and validation data as X_train-(80%) and x_val-(20%)
X_train = list()
X_val = list()
len_df_body = len(df_body)
X_percent_80 = ((len_df_body*80)//100)

for i in range(len_df_body):
    if i <= X_percent_80-1:
        X_train.append(df_body[i])
    if i >= X_percent_80:
        X_val.append(df_body[i])


#Extract the isTopLevel and convert the data in numeric type
isTopLevel_trans = list()
for item in isTopLevel:
    if item == True or item == 'True':
        isTopLevel_trans.append(1)
    if item == False or item == 'False':
        isTopLevel_trans.append(0)

#Split the train data and validation data as y_train-(80%) and y_val-(20%)
y_train = list()
y_val = list()
len_isTopLevel_trans = len(isTopLevel_trans)
y_percent_80 = ((len_isTopLevel_trans*80)//100)


for i in range(len_isTopLevel_trans):
    if i <= y_percent_80-1:
        y_train.append(isTopLevel_trans[i])
    if i >= y_percent_80:
        y_val.append(isTopLevel_trans[i])


#Cast the data type as tensor float
X_train = tf.cast(X_train, dtype=tf.float32)
X_val = tf.cast(X_val, dtype=tf.float32)
y_train = tf.cast(y_train, dtype=tf.float32)
y_val = tf.cast(y_val, dtype=tf.float32)

print("Done")

Done


In [12]:

# Generate embedding dimension
embedding_dim = 300
vocab_size = len(word_to_int) + 1

# X_train = tf.convert_to_tensor(X_train)
# X_val = tf.convert_to_tensor(X_val)
# y_train = tf.convert_to_tensor(y_train)
# y_val = tf.convert_to_tensor(y_val)


# Create the model (CNN)
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length,  trainable=True))
model.add(Conv1D(filters=50, kernel_size=2, activation='relu'))
model.add(Conv1D(filters=50, kernel_size=3, activation='relu'))
model.add(Conv1D(filters=50, kernel_size=4, activation='relu'))
model.add(Conv1D(filters=50, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 300)           156000    
                                                                 
 conv1d (Conv1D)             (None, 14, 50)            30050     
                                                                 
 conv1d_1 (Conv1D)           (None, 12, 50)            7550      
                                                                 
 conv1d_2 (Conv1D)           (None, 9, 50)             10050     
                                                                 
 conv1d_3 (Conv1D)           (None, 5, 50)             12550     
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                        

In [13]:

# Train the model
batch_size = 64
epochs = 10
model.fit(x=X_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(np.array(X_val), np.array(y_val))
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss: 0.7390
Validation Accuracy: 0.4000


In [15]:
# Predict the labels for the test set
predicted_labels = model.predict(np.array(X_train))
predicted_labels = np.round(predicted_labels).flatten()

# Create a DataFrame with the predicted labels
predicted_df = pd.DataFrame({'predicted_label': predicted_labels})
# Export the DataFrame to Excel
predicted_df.to_csv('predictions.csv', index=False)
predicted_df



Unnamed: 0,predicted_label
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
75,0.0
76,0.0
77,0.0
78,0.0


In [16]:
from PIL import Image, ImageChops, ImageEnhance
import os
import itertools
from keras.layers import Input
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder


In [None]:
# def convert_to_ela_image(path, quality):
#     temp_filename = 'temp_file_name.jpg'
#     ela_filename = 'temp_ela.png'

#     image = Image.open(path).convert('RGB')
#     image.save(temp_filename, 'JPEG', quality = quality)
#     temp_image = Image.open(temp_filename)

#     ela_image = ImageChops.difference(image, temp_image)

#     extrema = ela_image.getextrema()
#     max_diff = max([ex[1] for ex in extrema])
#     if max_diff == 0:
#         max_diff = 1
#     scale = 255.0 / max_diff

#     ela_image = ImageEnhance.Brightness(ela_image).enhance(scale)

#     return ela_image

In [None]:
# image_size = (128, 128)
# def prepare_image(image_path):
#     return np.array(convert_to_ela_image(image_path, 90).resize(image_size)).flatten()/255.0

In [None]:

# path = './image_data'
# X = []
# for dirname, _, filenames in os.walk(path):
#     for filename in filenames:
#         if filename.endswith('jpg') or filename.endswith('png'):
#             full_path = os.path.join(dirname, filename)
#             X.append(prepare_image(full_path))

# # print(X)

In [17]:
import cv2
import numpy as np

# def image_pross(path):
#     path = path
#     #     print(path)
#     # Load the images
#     image_data = []
#     image_width = 560
#     image_height = 560
#     for dirname, _, filenames in os.walk(path):
#         for filename in filenames:
#             if filename.endswith('jpg') or filename.endswith('png'):
#                 full_path = os.path.join(dirname, filename)
#                 image = cv2.imread(full_path)  # Load the image using OpenCV
#                 image = cv2.resize(image, (image_width, image_height))  # Resize the image to match your model's input shape
#                 image_data.append(image)


#     # Convert the list of images to a NumPy array
#     image_data = np.array(image_data)

#     # Normalize the image data if necessary
#     image_data = image_data / 255.0  # Normalize pixel values to the range [0, 1]

#     # Predict using the model
#     # predictions = model.predict(image_data)
# #     print(image_data)
#     return image_data
#     # print("Done!")

# Function to preprocess image data
def preprocess_image_data(path, image_width, image_height):
    image_data = []
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith('jpg') or filename.endswith('png'):
                full_path = os.path.join(dirname, filename)
                image = cv2.imread(full_path)
                image = cv2.resize(image, (image_width, image_height))
                image_data.append(image)

    image_data = np.array(image_data)
    image_data = image_data / 255.0  # Normalize pixel values to the range [0, 1]
    return image_data


In [18]:
import os
import random
import shutil

# Define the paths
source_dir = '/content/drive/MyDrive/Colab Notebooks/image_data'  # Directory containing all 100 images
train_dir = './train'  # Directory for training images
val_dir = './validation'  # Directory for validation images

# Create the directories if they don't exist
# os.makedirs(image_data, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Shuffle the image list
image_list = os.listdir(source_dir)
random.shuffle(image_list)

# len(image_list)
# Split the images into train and validation sets
train_images = image_list[:80]  # Select the first 80 images for training
val_images = image_list[80:]  # Select the remaining 20 images for validation

# print(train_images)

# for i in train_images:
#     if "fake" in i:
#         print(i)
# Move the images to the corresponding directories
i=1
for image in train_images:
    image_path = os.path.join(source_dir, image)
    if 'fake' in image:
        shutil.copy(image_path, os.path.join(train_dir, f'fake-{i}.jpg'))
    else:
        shutil.copy(image_path, os.path.join(train_dir, f'real-{i}.jpg'))
    i += 1

i=1
for image in val_images:
    image_path = os.path.join(source_dir, image)
    if 'fake' in image:
        shutil.copy(image_path, os.path.join(val_dir, f'fake-{i}.jpg'))
    else:
        shutil.copy(image_path, os.path.join(val_dir, f'real-{i}.jpg'))
    i += 1

In [19]:
len(val_images)

20

In [21]:

# Preprocess image data
image_width = 560
image_height = 560
image_train = preprocess_image_data(train_dir, image_width, image_height)
image_val = preprocess_image_data(val_dir, image_width, image_height)
print(len(image_train))

80


In [22]:
# Define the text input
text_input = Input(shape=(15,), name='text_input')

# Generate embedding dimension
embedding_dim = 300
vocab_size = len(word_to_int) + 1

# X_train = tf.convert_to_tensor(X_train)
# X_val = tf.convert_to_tensor(X_val)
# y_train = tf.convert_to_tensor(y_train)
# y_val = tf.convert_to_tensor(y_val)


# Create the model (CNN)
# model = Sequential()
# model.add()
# Text CNN model
text_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length,  trainable=True)(text_input)
text_conv1 = Conv1D(filters=50, kernel_size=2, activation='relu')(text_embedding)
text_conv2 = Conv1D(filters=50, kernel_size=3, activation='relu')(text_embedding)
text_conv3 = Conv1D(filters=50, kernel_size=4, activation='relu')(text_embedding)
text_conv4 = Conv1D(filters=50, kernel_size=5, activation='relu')(text_embedding)
text_pool1 = GlobalMaxPooling1D()(text_conv1)
text_pool2 = GlobalMaxPooling1D()(text_conv2)
text_pool3 = GlobalMaxPooling1D()(text_conv3)
text_pool4 = GlobalMaxPooling1D()(text_conv4)
text_concat = Concatenate()([text_pool1, text_pool2, text_pool3, text_pool4])
text_output = Dense(100, activation='relu')(text_concat)
text_output = Dense(1, activation='sigmoid')(text_output)



# Define the image input
image_input = Input(shape=(image_width, image_height, 3), name='image_input')

# Image CNN model
image_conv1 = Conv2D(filters=6, kernel_size=(5, 5), strides=1, padding='valid')(image_input)
image_activation1 = Activation('relu')(image_conv1)
image_maxpool1 = MaxPooling2D(pool_size=(2, 2), strides=2)(image_activation1)
image_conv2 = Conv2D(filters=3, kernel_size=(5, 5), strides=1, padding='valid')(image_maxpool1)
image_activation2 = Activation('relu')(image_conv2)
image_maxpool2 = MaxPooling2D(pool_size=(2, 2), strides=2)(image_activation2)
image_flatten = Flatten()(image_maxpool2)

# Concatenate the outputs from text and image models
concatenated = Concatenate()([text_output, image_flatten])

# Dense layers and final output
dense1 = Dense(100, activation='relu')(concatenated)
dense2 = Dense(1, activation='sigmoid')(dense1)
output = Activation('sigmoid')(dense2)

final_model = Model(inputs=[text_input, image_input], outputs=output)
# Compile the model
# final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
final_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
# Print the model summary
final_model.summary()
print("---------------------------")
print(text_output)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image_input (InputLayer)       [(None, 560, 560, 3  0           []                               
                                )]                                                                
                                                                                                  
 text_input (InputLayer)        [(None, 15)]         0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 556, 556, 6)  456         ['image_input[0][0]']            
                                                                                                  
 embedding_1 (Embedding)        (None, 15, 300)      156000      ['text_input[0][0]']         

In [27]:
# %%capture captured_output
# Train the multimodal model
image_train = tf.cast(image_train, dtype=tf.float32)
image_val = tf.cast(image_val, dtype=tf.float32)
# Preprocess target labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)  # Encode the labels
y_val = label_encoder.transform(y_val)  # Encode the labels

# y_train = tf.cast(y_train, dtype=tf.float32)
# y_val = tf.cast(y_val, dtype=tf.float32)

# Train the multimodal model
history = final_model.fit(
    x=[X_train, image_train],  # Pass both text and image inputs
    y=y_train,  # Target labels
    validation_data=([X_val, image_val], y_val),  # Validation data
    epochs=10,
    batch_size=64
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
len(X_val)

20

In [None]:
len(image_val)

20

In [None]:
len(X_train)

80

In [None]:
len(image_train)

80

In [None]:
image_train = tf.cast(image_train, dtype=tf.float32)
image_train.shape

TensorShape([80, 128, 128, 3])

In [None]:
from keras.layers import Input


# Define the image input
image_input = Input(shape=(15, 15, 3), name='image_data')


image_model = Sequential()
image_model.add(Conv2D(filters=6, kernel_size=(5, 5), strides=1, padding='valid', input_shape=(560, 560, 3)))
image_model.add(Activation('relu'))
image_model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
image_model.add(Conv2D(filters=3, kernel_size=(5, 5), strides=1, padding='valid'))
image_model.add(Activation('relu'))
image_model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
image_model.add(Flatten())


concatenated = tf.keras.layers.Concatenate()([model.output, image_model.output])
dense1 = Dense(100, activation='relu')(concatenated)
output = Dense(1, activation='sigmoid')(dense1)
final_model = Model(inputs=[model.input, image_input], outputs=output)
# Compile the model
final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
final_model.summary()

# Train the multimodal model
history = final_model.fit(
    [text_train, image_train],  # Pass both text and image inputs
    y_train,  # Target labels
    validation_data=([text_val, image_val], y_val),  # Validation data
    epochs=15,
    batch_size=16
)

# # Evaluate the model
# loss, accuracy = model.evaluate([text_test, image_test], y_test)
# print("Test Loss:", loss)
# print("Test Accuracy:", accuracy)

# # Export the predicted values
# y_pred = model.predict([text_test, image_test])
# y_pred = [1 if pred > 0.5 else 0 for pred in y_pred]

# import pandas as pd

# # Create a DataFrame with the predicted values
# df_pred =
# df_pred = pd.DataFrame({'predicted_class': y_pred})

# # Export the DataFrame to a CSV file
# df_pred.to_csv('predicted_values.csv', index=False)


2023-06-19 11:10:07.359043: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22523200 exceeds 10% of free system memory.
2023-06-19 11:10:07.386887: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22523200 exceeds 10% of free system memory.
2023-06-19 11:10:07.396564: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 22523200 exceeds 10% of free system memory.


ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 560, 560, 3), dtype=tf.float32, name='conv2d_input'), name='conv2d_input', description="created by layer 'conv2d_input'") at layer "conv2d". The following previous layers were accessed without issue: ['embedding', 'conv1d']