### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

### Data

In [4]:
#import just ids of original data set on which model was built
df_original = pd.read_csv('../data/1_data_initial.csv', usecols=['id'])

In [6]:
#import the full validation set
parse_dates = ['listing_date', 
               'asset_contract.created_date', 
               'collection.created_date', 
               'last_sale.created_date']
df_full = pd.read_csv('../data/1b_data_initial_validation_set.csv', low_memory=False, parse_dates=parse_dates)

In [16]:
# create a list of all unique ids in the validation set that are not present in the original set
df_orig_ids = df_original['id']
df_full_ids = df_full['id']
new_unique_ids = [x for x in df_full_ids if x not in df_orig_ids]

In [18]:
len(new_unique_ids)

26241

In [23]:
# create new dataframe of unique NFTs
df_full = df_full[df_full['id'].isin(new_unique_ids)]

In [25]:
df_full.head()

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,last_sale.transaction.to_account.user,last_sale.transaction.from_account.user,transfer_fee_payment_token.id,transfer_fee_payment_token.symbol,transfer_fee_payment_token.address,transfer_fee_payment_token.image_url,transfer_fee_payment_token.name,transfer_fee_payment_token.decimals,transfer_fee_payment_token.eth_price,transfer_fee_payment_token.usd_price
0,31278619,6715461703434003445645336638579260494260736795...,0,,https://lh3.googleusercontent.com/Pd7PXs5ZHtgv...,https://lh3.googleusercontent.com/Pd7PXs5ZHtgv...,https://lh3.googleusercontent.com/Pd7PXs5ZHtgv...,,,,...,,,,,,,,,,
1,31278618,3108049546517070910075436470153648977290720661...,0,,https://lh3.googleusercontent.com/KfteuZFSCiCO...,https://lh3.googleusercontent.com/KfteuZFSCiCO...,https://lh3.googleusercontent.com/KfteuZFSCiCO...,,,,...,,,,,,,,,,
2,31278617,9057683344706687937980539802150068508547322538...,0,,https://lh3.googleusercontent.com/caAUHZLbhExp...,https://lh3.googleusercontent.com/caAUHZLbhExp...,https://lh3.googleusercontent.com/caAUHZLbhExp...,,,,...,,,,,,,,,,
3,31278616,3268304683559065074852929730958675774624902165...,0,,https://lh3.googleusercontent.com/4XUpEHLMNbJl...,https://lh3.googleusercontent.com/4XUpEHLMNbJl...,https://lh3.googleusercontent.com/4XUpEHLMNbJl...,,,,...,,,,,,,,,,
4,31278614,7779220728716121010175700030620486582973273824...,0,,https://lh3.googleusercontent.com/pcJ0zq1Gr-ND...,https://lh3.googleusercontent.com/pcJ0zq1Gr-ND...,https://lh3.googleusercontent.com/pcJ0zq1Gr-ND...,,,,...,,,,,,,,,,


In [26]:
df_full.id.nunique()

26241

In [27]:
df_full.token_id.nunique()

26241

In [28]:
list(df_full.columns)

['id',
 'token_id',
 'num_sales',
 'background_color',
 'image_url',
 'image_preview_url',
 'image_thumbnail_url',
 'image_original_url',
 'animation_url',
 'animation_original_url',
 'name',
 'description',
 'external_link',
 'permalink',
 'decimals',
 'token_metadata',
 'sell_orders',
 'traits',
 'last_sale',
 'top_bid',
 'listing_date',
 'is_presale',
 'transfer_fee_payment_token',
 'transfer_fee',
 'asset_contract.address',
 'asset_contract.asset_contract_type',
 'asset_contract.created_date',
 'asset_contract.name',
 'asset_contract.nft_version',
 'asset_contract.opensea_version',
 'asset_contract.owner',
 'asset_contract.schema_name',
 'asset_contract.symbol',
 'asset_contract.total_supply',
 'asset_contract.description',
 'asset_contract.external_link',
 'asset_contract.image_url',
 'asset_contract.default_to_fiat',
 'asset_contract.dev_buyer_fee_basis_points',
 'asset_contract.dev_seller_fee_basis_points',
 'asset_contract.only_proxied_transfers',
 'asset_contract.opensea_buyer

In [29]:
# checking for percentage of null values

for column in list(df_full.columns):
    print(f"{df_full[str(column)].isna().sum()/len(df_full):.2%} of {column} is NAN")

0.00% of id is NAN
0.00% of token_id is NAN
0.00% of num_sales is NAN
90.91% of background_color is NAN
20.49% of image_url is NAN
20.49% of image_preview_url is NAN
20.49% of image_thumbnail_url is NAN
62.40% of image_original_url is NAN
88.43% of animation_url is NAN
88.40% of animation_original_url is NAN
23.04% of name is NAN
32.29% of description is NAN
71.28% of external_link is NAN
0.00% of permalink is NAN
66.28% of decimals is NAN
61.19% of token_metadata is NAN
70.58% of sell_orders is NAN
0.00% of traits is NAN
100.00% of last_sale is NAN
100.00% of top_bid is NAN
100.00% of listing_date is NAN
0.00% of is_presale is NAN
100.00% of transfer_fee_payment_token is NAN
99.82% of transfer_fee is NAN
0.00% of asset_contract.address is NAN
0.00% of asset_contract.asset_contract_type is NAN
0.00% of asset_contract.created_date is NAN
0.00% of asset_contract.name is NAN
47.60% of asset_contract.nft_version is NAN
60.00% of asset_contract.opensea_version is NAN
27.19% of asset_contrac

In [30]:
df_isna = pd.DataFrame()

for column in df_full.columns:
    df_isna.loc['Null', column] = f"{df_full[column].isna().sum()/len(df_full):.2%}"
    df_isna.loc['Count', column] = df_full[column].count()
    df_isna.loc['Unique', column] = df_full[column].nunique()

In [31]:
df_isna

Unnamed: 0,id,token_id,num_sales,background_color,image_url,image_preview_url,image_thumbnail_url,image_original_url,animation_url,animation_original_url,...,last_sale.transaction.to_account.user,last_sale.transaction.from_account.user,transfer_fee_payment_token.id,transfer_fee_payment_token.symbol,transfer_fee_payment_token.address,transfer_fee_payment_token.image_url,transfer_fee_payment_token.name,transfer_fee_payment_token.decimals,transfer_fee_payment_token.eth_price,transfer_fee_payment_token.usd_price
Null,0.00%,0.00%,0.00%,90.91%,20.49%,20.49%,20.49%,62.40%,88.43%,88.40%,...,100.00%,100.00%,99.82%,99.82%,99.82%,99.82%,99.82%,99.82%,99.82%,99.82%
Count,26241,26241,26241,2384,20864,20864,20864,9866,3037,3045,...,0,0,47,47,47,47,47,47,47,47
Unique,26241,26241,206,41,18189,18189,18189,8557,2312,2322,...,0,0,1,1,1,1,1,1,1,1


In [32]:
# also looking at count and unique to help with deciding which columns to keep

with pd.option_context('display.max_rows', 150):
    display(df_isna.T.sort_values(by=['Count','Unique','Null'], ascending=True))

Unnamed: 0,Null,Count,Unique
last_sale,100.00%,0,0
top_bid,100.00%,0,0
listing_date,100.00%,0,0
transfer_fee_payment_token,100.00%,0,0
creator.user,100.00%,0,0
owner.user,100.00%,0,0
creator,100.00%,0,0
last_sale.asset_bundle,100.00%,0,0
last_sale.transaction.to_account.discord_id,100.00%,0,0
last_sale.transaction.to_account.user,100.00%,0,0


In [33]:
# selecting target columns and desired predictors

df = df_full[[
    # target
    'num_sales', 'sell_orders', 'last_sale.event_type', 
    
    # basic attributes    
    'id', 'token_id', 'name', 'traits', 'description', 'is_presale', 
    'image_url', 'background_color', 'external_link', 'token_metadata',
    
    # asset attributes
    'asset_contract.created_date', 'asset_contract.name', 'asset_contract.description', 'asset_contract.total_supply',
    'asset_contract.symbol', 'asset_contract.schema_name', 'asset_contract.asset_contract_type', 
    'asset_contract.nft_version', 'asset_contract.opensea_version',
    'asset_contract.seller_fee_basis_points', 'asset_contract.dev_seller_fee_basis_points',
    'asset_contract.opensea_seller_fee_basis_points',
    'asset_contract.external_link', 'asset_contract.image_url', 
    
    # collection attributes
    'collection.created_date', 'collection.slug', 'collection.safelist_request_status', 'collection.featured', 
    'collection.description', 'collection.display_data.card_display_style',
    'collection.dev_seller_fee_basis_points', 'collection.opensea_seller_fee_basis_points',
    'collection.external_url', 'collection.image_url', 'collection.large_image_url', 'collection.display_data.images', 
    'collection.twitter_username', 'collection.instagram_username', 'collection.discord_url', 
    'collection.telegram_url', 'collection.medium_username', 'collection.chat_url', 
    
    # creator attributes
    'creator.user.username', 'creator.config', 'creator.profile_img_url', 'creator.discord_id',
    
    # owner attributes
    'owner.address', 'owner.config', 'owner.profile_img_url', 'owner.discord_id' 
    ]]

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26241 entries, 0 to 36100
Data columns (total 53 columns):
num_sales                                         26241 non-null int64
sell_orders                                       7721 non-null object
last_sale.event_type                              6919 non-null object
id                                                26241 non-null int64
token_id                                          26241 non-null object
name                                              20194 non-null object
traits                                            26241 non-null object
description                                       17767 non-null object
is_presale                                        26241 non-null bool
image_url                                         20864 non-null object
background_color                                  2384 non-null object
external_link                                     7536 non-null object
token_metadata                     

In [36]:
with pd.option_context('display.max_columns', 60):
    display(df.head())

Unnamed: 0,num_sales,sell_orders,last_sale.event_type,id,token_id,name,traits,description,is_presale,image_url,background_color,external_link,token_metadata,asset_contract.created_date,asset_contract.name,asset_contract.description,asset_contract.total_supply,asset_contract.symbol,asset_contract.schema_name,asset_contract.asset_contract_type,asset_contract.nft_version,asset_contract.opensea_version,asset_contract.seller_fee_basis_points,asset_contract.dev_seller_fee_basis_points,asset_contract.opensea_seller_fee_basis_points,asset_contract.external_link,asset_contract.image_url,collection.created_date,collection.slug,collection.safelist_request_status,collection.featured,collection.description,collection.display_data.card_display_style,collection.dev_seller_fee_basis_points,collection.opensea_seller_fee_basis_points,collection.external_url,collection.image_url,collection.large_image_url,collection.display_data.images,collection.twitter_username,collection.instagram_username,collection.discord_url,collection.telegram_url,collection.medium_username,collection.chat_url,creator.user.username,creator.config,creator.profile_img_url,creator.discord_id,owner.address,owner.config,owner.profile_img_url,owner.discord_id
0,0,,,31278619,6715461703434003445645336638579260494260736795...,[WordA]Chinese characters #6634,"[{'trait_type': 'Chinese', 'value': 'character...",ID:6634 \n\nChinese characters: 柩 // 706 x 7...,True,https://lh3.googleusercontent.com/Pd7PXs5ZHtgv...,,,,2020-12-02 17:40:53.232025,OpenSea Collection,,,OPENSTORE,ERC1155,semi-fungible,,2.0.0,250,0,250,,,2021-07-06 08:28:42.215335,worda-chinese-characters,not_requested,False,The present Chinese characters have evolved fr...,contain,100,250,,https://lh3.googleusercontent.com/H_pQIUXGWB7q...,https://lh3.googleusercontent.com/vTPDnec7_PuC...,,,,,,,,Suqingyan,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
1,0,,,31278618,3108049546517070910075436470153648977290720661...,Moosy Rare 1998,"[{'trait_type': 'Level', 'value': 'Rare', 'dis...",Moosy Rare 1998,True,https://lh3.googleusercontent.com/KfteuZFSCiCO...,,,,2020-12-02 17:40:53.232025,OpenSea Collection,,,OPENSTORE,ERC1155,semi-fungible,,2.0.0,250,0,250,,,2021-07-01 22:52:11.315505,the-bullishbears,not_requested,False,"The Bullish Bears is a 10,000 token NFT projec...",contain,1000,250,http://thebullishbearsnft.com,https://lh3.googleusercontent.com/1dsSBHyqwkMq...,https://lh3.googleusercontent.com/pzfCIbTxca4F...,,bullishbearsnft,thebullishbearsnft,,,,,Thebullishbearsnft,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
2,0,,,31278617,9057683344706687937980539802150068508547322538...,Fortune Cookie # 22 - Unlockable Fortune FOR...,[],Fortune Cookie # 22\n\nThis collection of uniq...,True,https://lh3.googleusercontent.com/caAUHZLbhExp...,,,,2020-12-02 17:40:53.232025,OpenSea Collection,,,OPENSTORE,ERC1155,semi-fungible,,2.0.0,250,0,250,,,2021-07-11 19:59:59.924875,unique-fortune-cookie-fortunes,not_requested,False,This collection of unique fortune cookie fortu...,contain,10,250,,https://lh3.googleusercontent.com/caAUHZLbhExp...,https://lh3.googleusercontent.com/caAUHZLbhExp...,,,,,,,,FirstThingsFirst,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
3,0,[{'created_date': '2021-07-13T00:17:14.492573'...,,31278616,3268304683559065074852929730958675774624902165...,IconPunk #243 Ugandan Knuckles,"[{'trait_type': 'person', 'value': 'Ugandan Kn...",24x24 pixels and pushing the limit to the max!...,True,https://lh3.googleusercontent.com/4XUpEHLMNbJl...,,,,2020-12-02 17:40:53.232025,OpenSea Collection,,,OPENSTORE,ERC1155,semi-fungible,,2.0.0,250,0,250,,,2021-05-03 18:55:32.762585,iconpunks,not_requested,False,24x24 pixels and pushing the limit to the max!...,padded,500,250,,https://lh3.googleusercontent.com/2dVGNcqjWQCX...,https://lh3.googleusercontent.com/Y0XW9uG7RyV1...,,NFTPUNKS,,,,,,iconpunks,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
4,0,[{'created_date': '2021-07-13T00:17:15.130145'...,,31278614,7779220728716121010175700030620486582973273824...,PlagueDoctor #107,"[{'trait_type': 'id', 'value': '107', 'display...",Where are the sick - There I Am. Healing since...,True,https://lh3.googleusercontent.com/pcJ0zq1Gr-ND...,,,,2020-12-02 17:40:53.232025,OpenSea Collection,,,OPENSTORE,ERC1155,semi-fungible,,2.0.0,250,0,250,,,2021-05-19 07:24:46.125544,plaguedoctor-1,not_requested,False,Where are the sick - There I Am. Healing since...,padded,0,250,,https://lh3.googleusercontent.com/ajZlyodqeIqG...,,,,,,,,,PlagueDoctor,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,


### Final Output
- First three columns are target
- Needs further preprocessing

In [38]:
df.to_csv('../data/2b_data_condensed_validation_set.csv', index=False)