# CNN for classifying NFTs with secondary sales

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import os.path
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
import time
from imblearn.over_sampling import RandomOverSampler

# Create dataset

In [3]:
#@title Read data
nft_transactions = pd.read_csv('data/Data_API.csv',usecols=['Unique_id_collection','Collection_cleaned','Category'])
image_names = pd.read_csv('data/image_names.csv')
secondary_sales = pd.read_csv('data/secondary_sale_classification_data.csv')

In [29]:
nft_transactions.Category.unique()

array(['Games', 'Art', 'Other', 'Collectible', 'Metaverse', 'Utility'],
      dtype=object)

In [12]:
category = "Art" #Keep only Games category

df_filtered = nft_transactions.copy(deep = True)
df_filtered['image_name'] = image_names['image_name']
df_filtered.drop_duplicates(subset=['image_name'], inplace=True)
df_filtered.loc[:,'image_name'] = 'images/' + df_filtered.loc[:,"image_name"].astype(str) + '.png'
files = ["images/{0}".format(fname) for fname in os.listdir("images")]
df_filtered = df_filtered[df_filtered["image_name"].isin(files)]

df_filtered = df_filtered.merge(secondary_sales, on='Unique_id_collection', how='left', suffixes=('_left', '_right'))
df_filtered.drop(df_filtered[df_filtered.Category_left != category].index, inplace=True)
df_filtered = df_filtered[(df_filtered["secondary_sale"] == 0.0) | (df_filtered["secondary_sale"] == 1.0)]
df_filtered.drop(df_filtered.columns.difference(['image_name_left','secondary_sale','Collection_cleaned_left','Category_left']), axis=1, inplace=True)
df_filtered.secondary_sale = df_filtered.secondary_sale.astype(int).astype(str)
df_filtered.rename(columns={"image_name_left": "image_name", "Collection_cleaned_left": "Collection_cleaned", 'Category_left': 'Category'}, inplace=True)
df_train = df_filtered.sample(frac=0.8)
df_test = df_filtered.loc[df_filtered.index.difference(df_train.index)]

In [13]:
print(len(df_train))
print(df_train.head(5))

df1 = df_train[df_train["secondary_sale"] == '0']
df2 = df_train[df_train["secondary_sale"] == '1']
print(len(df1))
print(len(df2))
print("done")
#import sys
#sys.exit(1)

# oversample minority class
samplefrac = 0.05
oversample = RandomOverSampler(sampling_strategy='minority')
X = df_train.image_name.values
y = df_train.secondary_sale.values
X_over, y_over = oversample.fit_resample(X.reshape(-1,1), y.reshape(-1,1))
df_train = pd.DataFrame(np.hstack((X_over,y_over.reshape(-1,1))), columns=['image_name','secondary_sale']).sample(frac=samplefrac)

print(len(df_train))
df_test = df_test.sample(frac=0.05)
print(len(df_test))
df1 = df_train[df_train["secondary_sale"] == '0']
df2 = df_train[df_train["secondary_sale"] == '1']
print(len(df1))
print(len(df2))

92011
       Collection_cleaned Category         image_name secondary_sale
85066        Cryptokittie      Art   images/79536.png              0
75791        Cryptokittie      Art   images/70613.png              0
64603        Cryptokittie      Art   images/59851.png              0
13105        Cryptokittie      Art   images/12275.png              0
114148       Cryptokittie      Art  images/107358.png              0
86424
5587
done
8642
1150
4227
4415


# Prepare

In [14]:
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
  rescale=1./255,
  validation_split=0.2
)

test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
  rescale=1./255
)

train_images = train_generator.flow_from_dataframe(
  dataframe=df_train,
  x_col='image_name',
  y_col='secondary_sale',
  target_size=(28, 28),
  color_mode='rgb',
  class_mode='binary',
  batch_size=128,
  shuffle=True,
  seed=42,
  subset='training'
)

val_images = train_generator.flow_from_dataframe(
  dataframe=df_train,
  x_col='image_name',
  y_col='secondary_sale',
  target_size=(28, 28),
  color_mode='rgb',
  class_mode='binary',
  batch_size=128,
  shuffle=True,
  seed=42,
  subset='validation'
)

test_images = test_generator.flow_from_dataframe(
  dataframe=df_test,
  x_col='image_name',
  y_col='secondary_sale',
  target_size=(28, 28),
  color_mode='rgb',
  class_mode='binary',
  batch_size=128,
  shuffle=False,
  seed=42,
)

Found 6914 validated image filenames belonging to 2 classes.
Found 1728 validated image filenames belonging to 2 classes.
Found 1150 validated image filenames belonging to 2 classes.


In [7]:
#import wandb
#wandb.init(project="cnn-secondary-sale")

wandb.config = {
  'architecture': 'CNN-mini',
  'accelerator': 'P100',
  'batch_size': 128,
  'epochs': 15,
  'starting_learning_rate': 0.001,
  'activation':'sigmoid',
  'optimizer': 'adam'
}

NameError: name 'wandb' is not defined

# Fit

In [21]:
from PIL import ImageFile
import tensorflow.keras as keras
#from wandb.keras import WandbCallback
from keras import backend as K
import math
from tensorflow.keras.callbacks import LearningRateScheduler
ImageFile.LOAD_TRUNCATED_IMAGES = True

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall_keras = true_positives / (possible_positives + K.epsilon())
    return recall_keras

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision_keras = true_positives / (predicted_positives + K.epsilon())
    return precision_keras

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

def lr_scheduler(epoch, lr):
    #if wandb.run is None:
    #    raise wandb.Error("You must call wandb.init() before WandbCallback()")
    #wandb.log({'learning_rate': lr}, commit=False)
    if epoch < 7:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

model = keras.models.Sequential([
    keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=(28,28,3)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()

metrics = [precision, recall, f1, "accuracy"]
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss = 'binary_crossentropy', optimizer=optimizer,metrics=metrics)
#wandb_callback = WandbCallback(
#    input_type="image",
#    generator=val_images,
#    labels=["No secondary sale", "One or more secondary sale"],
#    log_evaluation=True,
#    validation_steps=5
#)
lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

history = model.fit(train_images,
  validation_data=val_images,
  verbose = 1,
    callbacks=[
      #wandb_callback,
      lr_callback
    ], epochs=15
)

#wandb.finish()
model.evaluate(test_images)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 26, 26, 32)        896       
                                                                 
 batch_normalization_10 (Ba  (None, 26, 26, 32)        128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_10 (MaxPooli  (None, 13, 13, 32)        0         
 ng2D)                                                           
                                                                 
 conv2d_11 (Conv2D)          (None, 11, 11, 64)        18496     
                                                                 
 batch_normalization_11 (Ba  (None, 11, 11, 64)        256       
 tchNormalization)                                               
                                                      



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


[0.7319831848144531,
 0.05523551627993584,
 0.6303350925445557,
 0.0992950052022934,
 0.3591304421424866]