In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

In [None]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
parse_dates = ['listing_date', 
               'asset_contract.created_date', 
               'collection.created_date', 
               'last_sale.created_date']
df_full = pd.read_csv('1_data_initial.csv', low_memory=False, parse_dates=parse_dates)

In [None]:
df_full.info()

In [None]:
df_full.head()

In [None]:
df_full.id.nunique()

In [None]:
df_full.token_id.nunique()

In [None]:
list(df_full.columns)

In [None]:
for column in list(df_full.columns):
    print(f"{df_full[str(column)].isna().sum()/len(df_full):.2%} of {column} is NAN")

In [30]:
df_isna = pd.DataFrame()

for column in df_full.columns:
    df_isna.loc['Null', column] = f"{df_full[column].isna().sum()/len(df_full):.2%}"
    df_isna.loc['Count', column] = df_full[column].count()
    df_isna.loc['Unique', column] = df_full[column].nunique()

In [None]:
df_isna

In [36]:
with pd.option_context('display.max_rows', 150):
    display(df_isna.T.sort_values(by=['Count','Unique','Null'], ascending=True))

Unnamed: 0,Null,Count,Unique
last_sale,100.00%,0,0
top_bid,100.00%,0,0
listing_date,100.00%,0,0
transfer_fee_payment_token,100.00%,0,0
owner.user,100.00%,0,0
creator.user,100.00%,0,0
creator,100.00%,0,0
last_sale.asset_bundle,100.00%,0,0
last_sale.transaction.to_account.discord_id,100.00%,0,0
last_sale.transaction.from_account.user,100.00%,0,0


In [3]:
df = df_full[[
    # target
    'num_sales', 'sell_orders', 'last_sale.event_type', 
    
    # basic attributes    
    'id', 'token_id', 'name', 'traits', 'description', 'is_presale', 
    'image_url', 'background_color', 'external_link', 'token_metadata',
    
    # asset attributes
    'asset_contract.created_date', 'asset_contract.name', 'asset_contract.description', 'asset_contract.total_supply',
    'asset_contract.symbol', 'asset_contract.schema_name', 'asset_contract.asset_contract_type', 
    'asset_contract.nft_version', 'asset_contract.opensea_version',
    'asset_contract.seller_fee_basis_points', 'asset_contract.dev_seller_fee_basis_points',
    'asset_contract.opensea_seller_fee_basis_points',
    'asset_contract.external_link', 'asset_contract.image_url', 
    
    # collection attributes
    'collection.created_date', 'collection.slug', 'collection.safelist_request_status', 'collection.featured', 
    'collection.description', 'collection.display_data.card_display_style',
    'collection.dev_seller_fee_basis_points', 'collection.opensea_seller_fee_basis_points',
    'collection.external_url', 'collection.image_url', 'collection.large_image_url', 'collection.display_data.images', 
    'collection.twitter_username', 'collection.instagram_username', 'collection.discord_url', 
    'collection.telegram_url', 'collection.medium_username', 'collection.chat_url', 
    
    # creator attributes
    'creator.user.username', 'creator.config', 'creator.profile_img_url', 'creator.discord_id',
    
    # owner attributes
    'owner.address', 'owner.config', 'owner.profile_img_url', 'owner.discord_id' 
    ]]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44752 entries, 0 to 44751
Data columns (total 53 columns):
num_sales                                         44752 non-null int64
sell_orders                                       10052 non-null object
last_sale.event_type                              15713 non-null object
id                                                44752 non-null int64
token_id                                          44752 non-null object
name                                              36072 non-null object
traits                                            44752 non-null object
description                                       33741 non-null object
is_presale                                        44752 non-null bool
image_url                                         40031 non-null object
background_color                                  21219 non-null object
external_link                                     18185 non-null object
token_metadata                 

In [6]:
df.head()

Unnamed: 0,num_sales,sell_orders,last_sale.event_type,id,token_id,name,traits,description,is_presale,image_url,...,collection.medium_username,collection.chat_url,creator.user.username,creator.config,creator.profile_img_url,creator.discord_id,owner.address,owner.config,owner.profile_img_url,owner.discord_id
0,0,,,30033345,9132198125583360689797322472203169750011716430...,Chad Olozumin,[],,True,https://storage.opensea.io/files/b09d044f17573...,...,,,CryptoHamstersOfficial,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
1,0,,,30033344,3268304683559065074852929730958675774624902165...,IconPunk #149 Elton John,"[{'trait_type': 'id', 'value': '0149', 'displa...",24x24 pixels and pushing the limit to the max!...,True,https://lh3.googleusercontent.com/P7NXQatzHWTt...,...,,,iconpunks,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
2,0,,,30033343,7779220728716121010175700030620486582973273824...,PlagueDoctor #234,"[{'trait_type': 'id', 'value': '234', 'display...",Where are the sick - There I Am. Healing since...,True,https://lh3.googleusercontent.com/qCzcK3sEGp2R...,...,,,PlagueDr,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
3,0,,,30033342,9510190511962339173712504282354773321324120343...,Crypto Graphic #47/999,[],#crypto #abstract #art #collectible #rarible #...,True,https://lh3.googleusercontent.com/iDHrRY_ko62L...,...,,,Vezzen_Dragon,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,
4,0,,,30033341,1029511648543007736247727748847500291464805430...,380,"[{'trait_type': 'Clothes', 'value': 'Tanktop',...",,True,https://lh3.googleusercontent.com/LrBXq8LND7W-...,...,,,BoredAlien,,https://storage.googleapis.com/opensea-static/...,,0x0000000000000000000000000000000000000000,,https://storage.googleapis.com/opensea-static/...,


In [5]:
df.to_csv('2_data_condensed.csv', index=False)

## Explore & Engineer Target

In [7]:
(df['num_sales'] > 0).value_counts(normalize=True)

False    0.546255
True     0.453745
Name: num_sales, dtype: float64

In [8]:
df['last_sale.event_type'].notna().sum()/len(df_full)

0.35111279942795853

In [9]:
df['sell_orders'].notna().sum()/len(df_full)

0.22461565963532357

In [22]:
len(df[(df['num_sales'] > 0) | 
       (df['sell_orders'].notna()) | 
       (df['last_sale.event_type'].notna())])/len(df)


0.6027663568108688

In [27]:
df['target'] = np.where((df['num_sales'] > 0) | 
                        (df['sell_orders'].notna()) | 
                        (df['last_sale.event_type'].notna()), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
df['target'].value_counts(normalize=True)

1    0.602766
0    0.397234
Name: target, dtype: float64

In [None]:
df['asset_contract.created_date'].dt.dayofweek.hist(figsize=[12,6])