# CNN for classifying NFTs with secondary sales

To save time, this model is trained on only a subset of the dataset; the goal is to get a feel for building CNN architectures and loading images.

# Dependencies

Pray to the Google Colab gods for a T4.

In [None]:
# List the GPUs attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-6b072d41-e658-8754-5ab5-169307f4897a)


In [None]:
# Mount Google Drive into the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os.path
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import EarlyStopping
import time

In [None]:
!pip install visualkeras
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting visualkeras
  Downloading visualkeras-0.0.2-py3-none-any.whl (12 kB)
Collecting aggdraw>=1.3.11
  Downloading aggdraw-1.3.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (989 kB)
[K     |████████████████████████████████| 989 kB 37.9 MB/s 
Installing collected packages: aggdraw, visualkeras
Successfully installed aggdraw-1.3.15 visualkeras-0.0.2


# Create dataset

In [None]:
#@title Read data
# Only these three columns of the transactions table are loaded.
transaction_columns = ['Unique_id_collection', 'Collection_cleaned', 'Category']
nft_transactions = pd.read_csv('data/Data_API.csv', usecols=transaction_columns)

# The remaining two tables are read in full.
image_names, secondary_sales = (
    pd.read_csv(csv_path)
    for csv_path in ('data/image_names.csv', 'data/secondary_sale.csv')
)

In [None]:
# Distinct values of the Category column.
nft_transactions['Category'].unique()

array(['Games', 'Art', 'Other', 'Collectible', 'Metaverse', 'Utility'],
      dtype=object)

In [None]:
#@title Combine and filter data
SPLIT_SEED = 42  # fixed so the train/test split is identical on every run
KEPT_COLUMNS = ['image_name', 'secondary_sale', 'Collection_cleaned', 'Category']

# Attach the image id to every transaction row (aligned on the row index) and
# keep a single row per image id.
df_filtered = nft_transactions.copy(deep=True)
df_filtered['image_name'] = image_names['image_name']
df_filtered = df_filtered.drop_duplicates(subset=['image_name'])

# Turn the id into a relative path: images/batchNNN/<id>.png, where NNN is the
# id integer-divided by 10000 and zero-padded to three characters. The column
# is replaced as a whole instead of writing strings into the numeric column.
batch_numbers = (df_filtered['image_name'] // 10000).astype(str).str.zfill(3)
df_filtered['image_name'] = (
    'images/batch' + batch_numbers + '/' + df_filtered['image_name'].astype(str) + '.png'
)

# Join the sale labels, keep only the 'Games' category and the columns used
# downstream, and store the label as a string.
# NOTE(review): this is a left join, so rows without a match in
# secondary_sales would end up with the label string 'nan' - confirm that
# every collection id is present there.
df_filtered = (
    df_filtered
    .merge(secondary_sales, on='Unique_id_collection', how='left')
    .loc[lambda frame: frame['Category'] == 'Games']
    .pipe(lambda frame: frame.drop(columns=frame.columns.difference(KEPT_COLUMNS)))
    .assign(secondary_sale=lambda frame: frame['secondary_sale'].astype(str))
)

# 80/20 split: the test set is every row that was not drawn for training.
df_train = df_filtered.sample(frac=0.8, random_state=SPLIT_SEED)
df_test = df_filtered.drop(index=df_train.index)

In [None]:
# oversample minority class
from imblearn.over_sampling import RandomOverSampler

OVERSAMPLE_SEED = 42  # fixed so resampling and subsampling are repeatable

oversample = RandomOverSampler(sampling_strategy='minority', random_state=OVERSAMPLE_SEED)
X = df_train.image_name.values
y = df_train.secondary_sale.values

# The sampler takes a 2-D feature matrix and a 1-D label vector.
X_over, y_over = oversample.fit_resample(X.reshape(-1, 1), y)

# Rebuild the training frame from the resampled pairs and keep a 40% subsample.
# NOTE(review): this cell overwrites df_train, so running it twice resamples
# already-resampled data. The generators' validation_split is also taken from
# this oversampled frame, so duplicated rows can land in both the training
# and validation subsets - confirm this is acceptable.
df_train = pd.DataFrame({'image_name': X_over.ravel(), 'secondary_sale': y_over})
df_train = df_train.sample(frac=0.4, random_state=OVERSAMPLE_SEED)

# Prepare

In [None]:
#@title Carry images from drive to local session storage for better read speeds
for i in range(0,16+1):
  batch_name = f'zip_batch{str(i).zfill(2)}.zip'
  print(batch_name)
  os.system(f'cp data/images/{batch_name} .')
  os.system(f'unzip {batch_name} -d images/')
  os.system(f'rm {batch_name}')
  os.system('cd images && unzip "*.zip"')
  os.system('cd images && rm *.zip')
!rm -rf images/batch000/drive
!rm -rf images/batch{005..024}/transformed
!rm -rf images/drive
#!cd images && find . -type f | cut -d/ -f2 | sort | uniq -c | awk '{s+=$1} END {print total images downloded: s}'

zip_batch00.zip
zip_batch01.zip
zip_batch02.zip
zip_batch03.zip
zip_batch04.zip
zip_batch05.zip
zip_batch06.zip
zip_batch07.zip
zip_batch08.zip
zip_batch09.zip
zip_batch10.zip
zip_batch11.zip
zip_batch12.zip
zip_batch13.zip
zip_batch14.zip
zip_batch15.zip
zip_batch16.zip


In [None]:
#@title Prepare Dataset generators
ImageDataGenerator = tf.keras.preprocessing.image.ImageDataGenerator

# Both generators rescale pixel values by 1/255; only the training generator
# reserves 20% of its frame as a validation subset.
train_generator = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_generator = ImageDataGenerator(rescale=1./255)

# Options shared by all three iterators.
flow_options = dict(
  x_col='image_name',
  y_col='secondary_sale',
  target_size=(28, 28),
  color_mode='rgb',
  class_mode='binary',
  batch_size=64,
  seed=42,
)

# Training and validation iterators read disjoint subsets of df_train.
train_images = train_generator.flow_from_dataframe(
  dataframe=df_train, shuffle=True, subset='training', **flow_options
)
val_images = train_generator.flow_from_dataframe(
  dataframe=df_train, shuffle=True, subset='validation', **flow_options
)

# The test iterator is not shuffled.
test_images = test_generator.flow_from_dataframe(
  dataframe=df_test, shuffle=False, **flow_options
)

  .format(n_invalid, x_col)


Found 85224 validated image filenames belonging to 2 classes.
Found 21306 validated image filenames belonging to 2 classes.
Found 47523 validated image filenames belonging to 2 classes.


  .format(n_invalid, x_col)


In [None]:
#@title Initialize wandb run
import wandb

# Hyperparameters recorded with the run. They are passed to wandb.init()
# because assigning a dict to `wandb.config` only rebinds the module attribute
# to a plain dict instead of populating the run's config.
run_config = {
  'architecture': 'CNN-mini',
  # NOTE(review): the nvidia-smi output in this notebook reports a Tesla T4 - confirm this value.
  'accelerator': 'P100',
  'batch_size': 64,
  'epochs': 15,
  'starting_learning_rate': 0.001,
  'activation': 'sigmoid',
  'optimizer': 'adam'
}
wandb.init(project="cnn-secondary-sale-prediction", config=run_config)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

# Fit

In [None]:
#@title Build and fit model
from PIL import ImageFile
import tensorflow.keras as keras
from wandb.keras import WandbCallback
from keras import backend as K
import math
from tensorflow.keras.callbacks import LearningRateScheduler
ImageFile.LOAD_TRUNCATED_IMAGES = True

def lr_scheduler(epoch, lr):
    """Log the current learning rate to wandb and return the rate to use next.

    The rate is returned unchanged while ``epoch < 7``; from epoch 7 onwards it
    is multiplied by ``exp(-0.1)`` on every call.

    Args:
        epoch: Epoch index supplied by the scheduler callback.
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` itself for ``epoch < 7``, otherwise ``lr * math.exp(-0.1)``.

    Raises:
        wandb.Error: If no wandb run is active.
    """
    if wandb.run is None:
        raise wandb.Error("You must call wandb.init() before WandbCallback()")
    # commit=False attaches the value to the step that is committed later.
    wandb.log({'learning_rate': lr}, commit=False)
    if epoch < 7:
        return lr
    # math.exp keeps the result a plain Python float; tf.math.exp would turn
    # the returned rate into a tensor.
    return lr * math.exp(-0.1)

def f1(y_true, y_pred): #taken from old keras source code
    """Return the F1 score of rounded predictions against rounded labels.

    Values are clipped to [0, 1] and rounded before counting, and the backend
    epsilon is added to every denominator to avoid division by zero.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted scores.

    Returns:
        2 * precision * recall / (precision + recall + epsilon).
    """
    eps = K.epsilon()

    def _count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count(y_true * y_pred)
    actual = _count(y_true)
    flagged = _count(y_pred)

    precision = hits / (flagged + eps)
    recall = hits / (actual + eps)
    return 2 * (precision * recall) / (precision + recall + eps)

# Two Conv2D -> BatchNormalization -> MaxPooling2D stages, then Flatten,
# Dropout and a single sigmoid unit for the binary label.
# input_shape has to match the size the images are loaded at: the dataset
# generators use target_size=(28, 28) with color_mode='rgb', so the input is
# 28x28x3 (a 256x256 input would not match the batches fed to fit()).
model = keras.models.Sequential([
    keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=(28, 28, 3)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()

# Binary cross-entropy with Adam; f1 and accuracy are tracked as metrics.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[f1, 'accuracy'],
)

# wandb callback fed from the validation iterator.
wandb_callback = WandbCallback(
    generator=val_images,
    validation_steps=5,
    input_type="image",
    labels=["No secondary sale", "One or more secondary sale"],
    log_evaluation=True,
)
# Applies lr_scheduler to set the learning rate each epoch.
lr_callback = LearningRateScheduler(lr_scheduler)

history = model.fit(
    train_images,
    validation_data=val_images,
    epochs=15,
    callbacks=[wandb_callback, lr_callback],
    verbose=1,
)

# The wandb run is closed first; the test-set evaluation runs afterwards and
# its result is the cell's displayed value.
wandb.finish()
model.evaluate(test_images)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        896       
                                                                 
 batch_normalization (BatchN  (None, 26, 26, 32)       128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 13, 13, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 11, 11, 64)        18496     
                                                                 
 batch_normalization_1 (Batc  (None, 11, 11, 64)       256       
 hNormalization)                                                 
                                                        

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 2/15

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 3/15

  "Palette images with Transparency expressed in bytes should be "


Epoch 4/15
Epoch 5/15

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 6/15
 129/1332 [=>............................] - ETA: 3:51 - loss: 0.6306 - f1: 0.6811 - accuracy: 0.6485

  "Palette images with Transparency expressed in bytes should be "


Epoch 7/15

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 8/15

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 9/15
  15/1332 [..............................] - ETA: 4:31 - loss: 0.6255 - f1: 0.6994 - accuracy: 0.6427

  "Palette images with Transparency expressed in bytes should be "


Epoch 10/15
Epoch 11/15

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 12/15

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 13/15

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 14/15
 200/1332 [===>..........................] - ETA: 3:35 - loss: 0.6093 - f1: 0.6825 - accuracy: 0.6617

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s


Epoch 15/15
  56/1332 [>.............................] - ETA: 4:06 - loss: 0.6053 - f1: 0.7169 - accuracy: 0.6708

  "Palette images with Transparency expressed in bytes should be "




[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20220813_182727-29lk2mmr/files/model-best)... Done. 0.1s




VBox(children=(Label(value='6.908 MB of 6.925 MB uploaded (0.018 MB deduped)\r'), FloatProgress(value=0.997478…

0,1
accuracy,▁▄▄▅▅▆▆▆▇▇▇▇███
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
f1,▁▄▅▆▆▇▆▇▇▇▇████
learning_rate,████████▇▅▄▃▃▂▁
loss,█▄▃▃▃▂▂▂▂▂▂▁▁▁▁
val_accuracy,▅▅▅▁▆▃▆▆▆▄▆▇▇▆█
val_f1,▅█▅█▅█▇▅▅▁▅▆██▇
val_loss,▅▄▄█▃▄▃▂▃▄▂▂▂▂▁

0,1
GFLOPs,0.00284
accuracy,0.66161
best_epoch,14.0
best_val_loss,0.61659
epoch,14.0
f1,0.69307
learning_rate,0.0005
loss,0.61356
val_accuracy,0.66155
val_f1,0.69521


 32/743 [>.............................] - ETA: 3:08 - loss: 0.4990 - f1: 0.1049 - accuracy: 0.8203

  "Palette images with Transparency expressed in bytes should be "




[0.6352639198303223, 0.4323253929615021, 0.6149232983589172]

In [None]:
#@title Reset GPU when memory errors happen
from numba import cuda

# Select device 0, then call cuda.close().
GPU_INDEX = 0
cuda.select_device(GPU_INDEX)
cuda.close()