<a href="https://colab.research.google.com/github/Stefi96/DetectingNFTs-Master/blob/main/Images_NFTs_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image

In [2]:
# Directories holding the scam and legit images.
# NOTE: the active paths target Google Drive mounted in Colab. For a local run,
# point the two variables at the local folders instead, e.g.
#   r"C:\Users\stefanve\Desktop\Project\Images\Images_scam"
#   r"C:\Users\stefanve\Desktop\Project\Images\Images_legit"
scam_dir = "/content/drive/MyDrive/Master/Project/Images/Images_scam"
legit_dir = "/content/drive/MyDrive/Master/Project/Images/Images_legit"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting keeps the path lists -- and the seeded train/val/test split built
# from them -- identical across runs and machines.
scam_image_paths = [os.path.join(scam_dir, name) for name in sorted(os.listdir(scam_dir))]
legit_image_paths = [os.path.join(legit_dir, name) for name in sorted(os.listdir(legit_dir))]

In [3]:
# Google Drive folders that hold the two image classes
scam_dir = "/content/drive/MyDrive/Master/Project/Images/Images_scam"
legit_dir = "/content/drive/MyDrive/Master/Project/Images/Images_legit"

# Directory listings (entry names only, not full paths)
scam_images, legit_images = (os.listdir(folder) for folder in (scam_dir, legit_dir))

# Report how many entries were found per class
print(f"Number of scam images detected: {len(scam_images)}")
print(f"Number of legit images detected: {len(legit_images)}")

Number of scam images detected: 1082
Number of legit images detected: 1762


In [4]:
def try_loading_images(image_paths, limit=10, size=(128, 128)):
    """Smoke-test image loading on a small sample of paths.

    Args:
        image_paths: Sequence of image file paths.
        limit: Maximum number of leading paths to try; ``None`` tries all of
            them. Defaults to 10 to keep the check fast.
        size: ``(width, height)`` every image is resized to.

    Returns:
        A list with one array per successfully loaded image, holding the
        resized pixel data divided by 255.0. Paths that fail to load are
        reported on stdout and skipped.
    """
    loaded_images = []
    for img_path in image_paths[:limit]:
        try:
            # Image.open is lazy and keeps the file open; the context manager
            # releases the handle as soon as the pixels have been read.
            with Image.open(img_path) as img:
                img_array = np.array(img.resize(size)) / 255.0
            loaded_images.append(img_array)
        except Exception as e:  # best effort: report the file and carry on
            print(f"Error with image {img_path}: {e}")
    return loaded_images

# Smoke test: push a small sample of each class through the loader
loaded_scam_images, loaded_legit_images = (
    try_loading_images(paths) for paths in (scam_image_paths, legit_image_paths)
)

print(f"Number of scam images loaded: {len(loaded_scam_images)}")
print(f"Number of legit images loaded: {len(loaded_legit_images)}")

Number of scam images loaded: 10
Number of legit images loaded: 10


In [5]:
def load_and_preprocess_images_modified(image_paths, label, size=(128, 128)):
    """Load images as RGB arrays scaled by 1/255 and tag each with ``label``.

    Args:
        image_paths: Iterable of image file paths.
        label: Label appended once for every image that loads successfully.
        size: ``(width, height)`` every image is resized to.

    Returns:
        ``(data, labels)``: two lists of equal length. ``data`` holds one
        array per loaded image (RGB pixel values divided by 255.0) and
        ``labels`` holds ``label`` repeated once per loaded image. Files that
        cannot be opened or decoded are reported on stdout and skipped.
    """
    data = []
    labels = []
    for img_path in image_paths:
        try:
            # Image.open is lazy and keeps the file open; the context manager
            # closes the handle once the RGB copy has been made.
            with Image.open(img_path) as img:
                # convert("RGB") drops any alpha channel / palette
                rgb = img.convert("RGB").resize(size)
            img_array = np.array(rgb) / 255.0
            data.append(img_array)
            labels.append(label)
        except Exception as e:  # best effort: report the file and carry on
            print(f"Error with image {img_path}: {e}")
    return data, labels

# Run every path of both classes through the RGB-converting loader
scam_data, scam_labels = load_and_preprocess_images_modified(scam_image_paths, "scam")
legit_data, legit_labels = load_and_preprocess_images_modified(legit_image_paths, "legit")

# Merge both classes, scam samples first, keeping data and labels aligned
all_data = [*scam_data, *legit_data]
all_labels = [*scam_labels, *legit_labels]

print(f"Total number of images in all_data: {len(all_data)}")
print(f"Total number of labels in all_labels: {len(all_labels)}")

Total number of images in all_data: 2844
Total number of labels in all_labels: 2844


In [6]:
# Load and preprocess every scam and legit image with the RGB-converting loader.
# NOTE(review): this repeats the load already performed in the previous cell
# (same function, same arguments), so it looks redundant -- confirm before removing.
scam_data, scam_labels = load_and_preprocess_images_modified(scam_image_paths, "scam")
legit_data, legit_labels = load_and_preprocess_images_modified(legit_image_paths, "legit")

In [7]:
# Merge both classes, scam samples first, keeping data and labels aligned
all_data = [*scam_data, *legit_data]
all_labels = [*scam_labels, *legit_labels]

print(f"Total number of images in all_data: {len(all_data)}")
print(f"Total number of labels in all_labels: {len(all_labels)}")

Total number of images in all_data: 2844
Total number of labels in all_labels: 2844


In [8]:
from sklearn.model_selection import train_test_split

# Stratified 70 / 15 / 15 split: hold out 30 % first, then halve the hold-out
# into validation and test. Both steps use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    all_data,
    all_labels,
    test_size=0.3,
    random_state=42,
    stratify=all_labels,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f"Training data size: {len(X_train)}")
print(f"Validation data size: {len(X_val)}")
print(f"Test data size: {len(X_test)}")

Training data size: 1990
Validation data size: 427
Test data size: 427


In [9]:
# Fit the encoder on the training labels only, then map all three splits from
# strings to integer class ids with that same fitted encoder.
label_encoder = LabelEncoder().fit(y_train)
y_train, y_val, y_test = (
    label_encoder.transform(split_labels) for split_labels in (y_train, y_val, y_test)
)


In [10]:
# One-hot encode the integer class ids of every split (two classes)
y_train, y_val, y_test = [
    to_categorical(split_labels, 2) for split_labels in (y_train, y_val, y_test)
]

In [11]:
input_shape = (128, 128, 3)

# Small CNN: two conv/pool stages, each followed by dropout, then a dense
# classifier head ending in a 2-way softmax.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [12]:
# Adam optimizer with categorical cross-entropy (labels are one-hot); track accuracy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Stop once validation loss has not improved for 5 epochs and roll the model
# back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

In [14]:
# Data augmentation: random rotations, shifts, shear, zoom and horizontal
# flips; pixels exposed by a transform are filled from the nearest edge.
datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# No datagen.fit(X_train) call: fit() only computes dataset statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, none of
# which are enabled here. Calling it would merely copy the whole training set
# in memory without changing the generator's behavior.

In [15]:
# Turn the training and validation splits into NumPy arrays for Keras
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)

In [16]:
# Sanity check: show the shape of every array that goes into training
for caption, split_array in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(caption, split_array.shape)

X_train shape: (1990, 128, 128, 3)
y_train shape: (1990, 2)
X_val shape: (427, 128, 128, 3)
y_val shape: (427, 2)


In [17]:
number_of_epochs = 50  # Upper bound; early stopping may end training sooner

# Train on augmented batches and validate on the untouched validation split.
# np.asarray() is a no-op for inputs that are already ndarrays, so the large
# image array is not duplicated in memory, yet plain lists are still accepted.
history = model.fit(datagen.flow(np.asarray(X_train), np.asarray(y_train), batch_size=32),
                    validation_data=(X_val, y_val),
                    epochs=number_of_epochs,
                    callbacks=[early_stop])
# Alternative without augmentation:
#history = model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=32, validation_data=(np.array(X_val), np.array(y_val)), callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


KeyboardInterrupt: ignored

In [None]:
# Loss and accuracy of the current model on the held-out test split
evaluation = model.evaluate(np.array(X_test), np.array(y_test))
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc}")

In [None]:
X_test_array = np.array(X_test)
print(f"Original X_test shape: {X_test_array.shape}")

# Force the (samples, 128, 128, 3) layout, assuming 128x128 RGB images
X_test_reshaped = X_test_array.reshape(-1, 128, 128, 3)
print(f"Reshaped X_test shape: {X_test_reshaped.shape}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict class probabilities for the test set using the reshaped data
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)  # hard class decisions
y_true = np.argmax(y_test, axis=1)          # undo the one-hot encoding

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred_classes))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_classes))

# ROC-AUC
# ROC-AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of class 1 rather than from thresholded class labels
# (labels would collapse the ROC curve to a single operating point).
# Note: This is for binary classification. If your task is multi-class, this will need adjustments.
roc_auc = roc_auc_score(y_true, y_pred[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Side-by-side learning curves: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.legend()
acc_ax.set_title('Training vs Validation Accuracy')

loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.legend()
loss_ax.set_title('Training vs Validation Loss')

fig.tight_layout()
plt.show()

In [None]:
# Transfer learning
from keras.applications import ResNet50
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D, Dropout
from keras.optimizers import Adam

In [None]:
# Data augmentation for the transfer-learning runs: random rotations, shifts
# and horizontal flips.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True)

# No datagen.fit(X_train) call: fit() only computes dataset statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, none of
# which are enabled here. Calling it would merely copy the whole training set
# in memory without changing the generator's behavior.

In [None]:
# ResNet50 convolutional base with ImageNet weights and without its
# classification head (include_top=False), built for 128x128 RGB inputs.
# NOTE(review): the images fed to this model are scaled to [0, 1] by the loader;
# ImageNet-pretrained ResNet50 presumably expects inputs prepared with
# keras.applications.resnet50.preprocess_input -- confirm and align if so.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

In [None]:
# New head on top of the pretrained base:
# global average pooling -> dropout (regularization) -> 2-way softmax
x = Dropout(0.5)(GlobalAveragePooling2D()(base_model.output))
predictions = Dense(2, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

In [None]:
# Freeze every layer of the pretrained base so that only the new head trains
for layer in list(base_model.layers):
    layer.trainable = False

In [None]:
# Adam with learning rate 1e-4 and categorical cross-entropy for the one-hot labels
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Only needed on Colab: make sure the training and validation splits are NumPy arrays
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)

In [None]:
# # only for collab
# from keras.utils import to_categorical

# # Map the string labels to integers
# label_mapping = {"scam": 0, "legit": 1}

# y_train_int = np.array([label_mapping[label] for label in y_train])
# y_val_int = np.array([label_mapping[label] for label in y_val])

# # Now apply one-hot encoding
# y_train = to_categorical(y_train_int, num_classes=2)
# y_val = to_categorical(y_val_int, num_classes=2)

In [None]:
# Only needed on Colab: confirm the shapes of the training and validation arrays
for split_array in (X_train, y_train, X_val, y_val):
    print(split_array.shape)


In [None]:
# Early stopping on validation loss; if necessary, patience can be raised to 15.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

# Train the new classification head on augmented batches.
# The callback only has an effect when it is handed to fit(). With epochs=10
# and patience=10 it cannot trigger yet, but it starts guarding the run as soon
# as `epochs` is raised above the patience.
history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
                    validation_data=(X_val, y_val),
                    epochs=10,
                    callbacks=[early_stopping])

In [None]:
# Fine-tuning: make the last 4 layers of the base model trainable again
last_base_layers = base_model.layers[-4:]
for layer in last_base_layers:
    layer.trainable = True

In [None]:
# Recompile after changing the trainable flags, now with a smaller learning rate (1e-5)
model.compile(
    optimizer=Adam(learning_rate=0.00001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Continue training on augmented batches for up to 50 more epochs
history_fine = model.fit(
    datagen.flow(X_train, y_train, batch_size=32),
    validation_data=(X_val, y_val),
    epochs=50,
)

In [None]:
# Loss and accuracy of the fine-tuned model on the held-out test split
evaluation = model.evaluate(np.array(X_test), np.array(y_test))
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict class probabilities for the test set
X_test = np.array(X_test)
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # hard class decisions
y_true = np.argmax(y_test, axis=1)          # undo the one-hot encoding

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred_classes))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_classes))

# ROC-AUC
# ROC-AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of class 1 rather than from thresholded class labels
# (labels would collapse the ROC curve to a single operating point).
# Note: This is for binary classification. If your task is multi-class, this will need adjustments.
roc_auc = roc_auc_score(y_true, y_pred[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")


In [None]:
# import matplotlib.pyplot as plt

# # Plotting training & validation accuracy
# plt.figure(figsize=(12, 4))
# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.legend()
# plt.title('Training vs Validation Accuracy')

# # Plotting training & validation loss
# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'], label='Training Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.legend()
# plt.title('Training vs Validation Loss')
# plt.tight_layout()
# plt.show()
# Learning curves across both phases: frozen-base epochs followed by the
# fine-tuning epochs, joined into one series per metric.
combined = {
    metric: history.history[metric] + history_fine.history[metric]
    for metric in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
}

fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(combined['accuracy'], label='Training Accuracy')
acc_ax.plot(combined['val_accuracy'], label='Validation Accuracy')
acc_ax.legend()
acc_ax.set_title('Training vs Validation Accuracy')

loss_ax.plot(combined['loss'], label='Training Loss')
loss_ax.plot(combined['val_loss'], label='Validation Loss')
loss_ax.legend()
loss_ax.set_title('Training vs Validation Loss')

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers import Bidirectional
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping

# Treat every image as a sequence: 128 time steps (one per pixel row), each
# with 128 * 3 features (the RGB values of that row).
X_train_reshaped = X_train.reshape(X_train.shape[0], 128, 128 * 3)
X_val_reshaped = X_val.reshape(X_val.shape[0], 128, 128 * 3)

# Define the RNN model
model = Sequential()

# Two stacked bidirectional LSTM layers (100 units per direction), each
# followed by dropout; the first returns the full sequence for the second.
model.add(Bidirectional(LSTM(100, return_sequences=True, activation='tanh'), input_shape=(128, 128 * 3)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100, activation='tanh')))
model.add(Dropout(0.5))

# Classification layer
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Early stopping and learning-rate reduction on plateau.
# min_lr is the floor for the reduced learning rate and therefore has to lie
# below the starting rate. Adam starts at 1e-3 by default, so a floor of 1e-3
# would never let the callback lower the rate at all.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

# Train the model
history = model.fit(
    X_train_reshaped, y_train,
    validation_data=(X_val_reshaped, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr]
)

In [None]:
# Prepare the test split for the sequence model: one time step per pixel row
X_test, y_test = np.array(X_test), np.array(y_test)
X_test_reshaped = X_test.reshape(X_test.shape[0], 128, 128 * 3)

print("Shape of X_test_reshaped:", X_test_reshaped.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
# Loss and accuracy of the sequence model on the reshaped test split
evaluation = model.evaluate(X_test_reshaped, y_test)
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc}")

In [None]:
# Predict class probabilities for the test set using the reshaped data
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)  # hard class decisions
y_true = np.argmax(y_test, axis=1)          # undo the one-hot encoding

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred_classes))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred_classes))

# ROC-AUC
# ROC-AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of class 1 rather than from thresholded class labels
# (labels would collapse the ROC curve to a single operating point).
# Note: This is for binary classification. If your task is multi-class, this will need adjustments.
roc_auc = roc_auc_score(y_true, y_pred[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Side-by-side learning curves: accuracy on the left, loss on the right
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
acc_ax.legend()
acc_ax.set_title('Training vs Validation Accuracy')

loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.legend()
loss_ax.set_title('Training vs Validation Loss')

fig.tight_layout()
plt.show()

In [None]:
from keras.applications import VGG16
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Dense, Dropout
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator

# Load VGG16 with ImageNet weights and without its top classification layers
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Custom head: global average pooling -> dropout -> 2-way softmax
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
predictions = Dense(2, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Freeze VGG16 layers so that only the new head is trained
for layer in base_model.layers:
    layer.trainable = False

model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Data Augmentation (optional but recommended for small datasets)
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)
# No datagen.fit(X_train) call: fit() only computes dataset statistics for
# featurewise_center / featurewise_std_normalization / zca_whitening, none of
# which are enabled here. Calling it would merely copy the whole training set
# in memory without changing the generator's behavior.

# Train the model using augmented data
history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
                    validation_data=(X_val, y_val),
                    epochs=10)


In [None]:
# X_test = np.array(X_test)
# X_test_reshaped = X_test.reshape(X_test.shape[0], 128, 128 * 3)
# y_test = np.array(y_test)
# print("Shape of X_test_reshaped:", X_test_reshaped.shape)
# print("Shape of y_test:", y_test.shape)

In [None]:
# Keep the test images in the same [0, 1] scaling that X_train / X_val had when
# the model was trained. Do not apply VGG16's `preprocess_input` to the test
# set alone: the training data never went through it, so that would evaluate
# the model on inputs from a different distribution. Only make sure X_test is
# an ndarray for Keras.
X_test = np.asarray(X_test)

In [None]:
# Loss and accuracy of the current model on the held-out test split
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc}")

In [None]:
import matplotlib.pyplot as plt

# Convert one-hot encoded labels back to integer class ids for counting
y_train_labels = np.argmax(y_train, axis=1)

# Count class distribution
(unique, counts) = np.unique(y_train_labels, return_counts=True)

# LabelEncoder assigns ids in sorted label order, so "legit" -> 0 and
# "scam" -> 1. Name each bar after the class id it actually counts instead of
# relying on a fixed label order.
class_names = {0: "Legit", 1: "Scam"}
tick_names = [class_names[int(class_id)] for class_id in unique]

# Plot
plt.figure(figsize=(8, 5))
plt.bar(unique, counts, tick_label=tick_names)
plt.xlabel('Class')
plt.ylabel('Number of samples')
plt.title('Class Distribution in Training Set')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Define a function to display sample images from each class
def display_sample_images(X, y, label_mapping, num_samples=5):
    """Show a grid of random sample images, one row per class.

    Args:
        X: Indexable collection of images accepted by ``imshow``.
        y: Array of integer class ids aligned with ``X``.
        label_mapping: Dict mapping class id -> title shown above each image.
            Rows follow the dict's iteration order.
        num_samples: Number of images (columns) per class. A class with fewer
            images fills as many columns as it can; the rest stay blank.
    """
    n_rows = len(label_mapping)
    # squeeze=False keeps `axes` two-dimensional even for one row or one column,
    # so axes[row] is always an iterable row of Axes.
    fig, axes = plt.subplots(n_rows, num_samples, figsize=(15, 3 * n_rows), squeeze=False)

    for row, (label, label_str) in enumerate(label_mapping.items()):
        # Get indices of images belonging to the current label
        indices = np.where(y == label)[0]

        # Sample without replacement, capped at the number of images available
        n_pick = min(num_samples, len(indices))
        selected_indices = np.random.choice(indices, n_pick, replace=False)

        for col, ax in enumerate(axes[row]):
            ax.axis('off')
            if col < n_pick:
                ax.imshow(X[selected_indices[col]])
                ax.set_title(label_str)

    plt.tight_layout()
    plt.show()

# Class id -> display name, matching the ids found in y_train
label_mapping = {0: 'Legit', 1: 'Scam'}

# Integer class ids recovered from the one-hot training labels
y_train_int = np.argmax(y_train, axis=1)

# Show a few random training images per class
display_sample_images(X_train, y_train_int, label_mapping)

In [None]:
# from keras.applications.vgg16 import VGG16
# from keras.layers import LSTM, Dense, Dropout, TimeDistributed
# from keras.models import Sequential

# # 1. Load the VGG16 model without the top classification layer
# base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# # Make sure VGG16 layers are not trainable
# for layer in base_model.layers:
#     layer.trainable = False

# # 2. Define the sequential model
# model = Sequential()

# # Add VGG16 as the feature extractor
# model.add(base_model)

# # Convert the features to sequences for LSTM
# model.add(TimeDistributed(Flatten()))

# # 3. Add LSTM layer
# model.add(LSTM(100, activation='tanh'))

# # Some dropout for regularization
# model.add(Dropout(0.5))

# # Classification layer
# model.add(Dense(2, activation='softmax'))

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# # Assuming X_train_reshaped, y_train, X_val_reshaped, and y_val are already prepared
# history = model.fit(X_train_reshaped, y_train, validation_data=(X_val_reshaped, y_val), epochs=10, batch_size=32)
