# Part 2: EDA and Initial Analysis #

In [1]:
import pandas as pd

import json

### Section I: Extracting Informative Fields from JSON ###

I intend to include the following properties.  They seem most informative in distinguishing NFT types:
- id
- num_sales
- collection -> name
- creator -> address
- traits
- last_sale -> payment_token -> usd_price
- last_sale -> transaction -> timestamp

These may be of interest in the future, but their usefulness is questionable:
- asset_contract -> description (May be useful with some NLP, but descriptions are pretty generic and mostly reference the collection name.)
- background_color (Many are Null, and background color is unlikely to be a major factor.)
- owner (Many of these are Null, even though the NFTs have sold many times.  Is this a form of anonymity?)

For some of these fields, I only want certain subfields.  However, it is possible for the fields to be Null, in which case I will assign Null to the subfields.

In [2]:
def get_creator_address(in_dict):
    """Return the address stored in an asset's 'creator' field.

    Parameters
    ----------
    in_dict : dict or None
        The value of an asset's 'creator' field; None when no creator is recorded.

    Returns
    -------
    The value stored under the 'address' key, or None when in_dict is None.

    Raises
    ------
    KeyError
        If in_dict is not None but has no 'address' key.
    """
    # Identity test rather than `== None`: equality can be overridden by the
    # operand's __eq__, which could make a real value look like a missing one.
    if in_dict is None:
        return None
    return in_dict['address']

In [3]:
def get_last_saleprice(in_dict):
    """Return the USD price of the payment token from an asset's 'last_sale' field.

    Parameters
    ----------
    in_dict : dict or None
        The value of an asset's 'last_sale' field; None when no last sale is recorded.

    Returns
    -------
    The value stored at in_dict['payment_token']['usd_price'], or None when
    in_dict is None.

    Raises
    ------
    KeyError
        If in_dict is not None but lacks the 'payment_token' or 'usd_price' key.
    """
    # Identity test rather than `== None`: equality can be overridden by the
    # operand's __eq__, which could make a real value look like a missing one.
    if in_dict is None:
        return None
    return in_dict['payment_token']['usd_price']

In [4]:
def get_last_saledate(in_dict):
    """Return the transaction timestamp from an asset's 'last_sale' field.

    Parameters
    ----------
    in_dict : dict or None
        The value of an asset's 'last_sale' field; None when no last sale is recorded.

    Returns
    -------
    The value stored at in_dict['transaction']['timestamp'], or None when
    in_dict is None.

    Raises
    ------
    KeyError
        If in_dict is not None but lacks the 'transaction' or 'timestamp' key.
    """
    # Identity test rather than `== None`: equality can be overridden by the
    # operand's __eq__, which could make a real value look like a missing one.
    if in_dict is None:
        return None
    return in_dict['transaction']['timestamp']

I'm now ready to extract the data from the stored files and store the information I need in a Pandas DataFrame.

In [5]:
def json_to_df(raw_dir='./raw_data', pass_labels=('a', 'b', 'c'), requests_per_pass=200):
    """Load the stored request files and build one DataFrame per pass.

    Files are read from ``{raw_dir}/request_{n}_{label}`` for n = 1..requests_per_pass
    and for each label in pass_labels.  Each file must hold a JSON object with an
    'assets' list; every asset contributes one row.

    Parameters
    ----------
    raw_dir : str
        Directory holding the request files.
    pass_labels : sequence of str
        Filename suffix of each pass, in the order the DataFrames are returned.
    requests_per_pass : int
        Number of request files in each pass (numbered from 1).

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per entry of pass_labels, with columns asset_id, num_sales,
        collection_name, collection_slug, creator_address, last_sale_price,
        last_sale_date, traits and owner.

    Raises
    ------
    FileNotFoundError
        If an expected request file does not exist.
    KeyError
        If a response or asset lacks one of the fields read below.
    """
    columns = ['asset_id', 'num_sales', 'collection_name', 'collection_slug', 'creator_address',
               'last_sale_price', 'last_sale_date', 'traits', 'owner']

    df_list = []

    for label in pass_labels:

        # One tuple per NFT, in the same order as `columns`.
        records = []

        for request in range(1, requests_per_pass + 1):
            # Context manager so the file handle is closed as soon as it is parsed.
            with open(f'{raw_dir}/request_{request}_{label}', 'r') as infile:
                response = json.load(infile)

            for nft in response['assets']:
                # 'creator' and 'last_sale' may be None, so their subfields are
                # extracted with the None-tolerant helper functions.
                last_sale = nft['last_sale']
                records.append((
                    nft['id'],
                    nft['num_sales'],
                    nft['collection']['name'],
                    nft['collection']['slug'],
                    get_creator_address(nft['creator']),
                    get_last_saleprice(last_sale),
                    get_last_saledate(last_sale),
                    nft['traits'],
                    nft['owner'],
                ))

        # Create the DataFrame for the current pass.
        df_list.append(pd.DataFrame(records, columns=columns))

    return df_list

In [6]:
# Load every stored request; df_list[i] is the DataFrame for pass i (0 = 'a', 1 = 'b', 2 = 'c').
df_list = json_to_df()

### Section II: Cleaning and Initial EDA ###

In [7]:
# Number of distinct collection names among the assets of the first pass.
df_list[0]['collection_name'].unique().size

1325

In [8]:
# How many first-pass assets belong to each collection name, most frequent first.
df_list[0]['collection_name'].value_counts()

Dodgers MLB Crypto      2295
Rarible                 1156
Hero                     812
The Sandbox ASSETS       230
Town Star                196
                        ... 
Jamaican Bobsled NFT       1
Greetings by Z             1
Macchina di Lusso          1
COACHK - CLUB              1
Conscience Cards           1
Name: collection_name, Length: 1325, dtype: int64

Some observations: why traits and owner are essentially useless

In [9]:
# df_list[0][:50]

In [10]:
# df_list[0]['traits'][:50]

In [11]:
# def get_types(traits):
#     if traits == []:
#         return None
#     else:
#         types = [trait['trait_type'] for trait in traits]
#         return types
    
# df_list[0]['trait_types'] = df_list[0]['traits'].apply(get_types)

In [12]:
# df_list[0]['trait_types'].value_counts()

In [13]:
# df_list[0].isna().sum()

In [14]:
# Asset ids of the first-pass NFTs that have at least 20 recorded sales.
df_list[0].loc[df_list[0]['num_sales'] >= 20, 'asset_id']

0       151609016
1        44596334
2        48159964
3        76676445
4        18781782
          ...    
5215     78597154
5216     31629888
5217     36068919
5218     16813677
5219     18332356
Name: asset_id, Length: 5220, dtype: int64

How can the last sale be None when there are 15 sales?  Were they sold as part of a bundle, and if so, does that cause this?

Check things like:
- Number of different collections
- Distribution of num_sales
- Distribution of sale prices and dates
- Distributions among different collections.
- Is there a 1-to-1 relationship between collection name and creator address?
- Examine changes from one pass to the next: whether most of the changes and odd behavior occur at the bottom of the list, whether there are duplicates or apparent omissions, etc.

In [15]:
# # Check for duplicates and let the user know if there are none.
# if len(ids) == len(set(ids)):
#     print('No duplicates found.')
#     f = open('./raw_data/duplicates.txt', 'a')
#     f.write(f'Unique IDs found: {len(set(ids))}')
#     f.write('No duplicates!')
#     f.close()

# # If there are duplicates, make a list of them and save it in a file.
# else:
#     ids.sort() # Sort the ids to make it easy to check for duplicates.
#     duplicate_ids = [ident for i, ident in enumerate(ids[:-1]) if ids[i] == ids[i+1]] # Create the list of duplicates.
#     print(f'Duplicates found with the following IDs: {duplicate_ids}.')
#     f = open('./raw_data/duplicates.txt', 'a')
#     for ident in duplicate_ids:
#         f.write(f'{ident}\n')
#     f.close()

In [None]:
# Number of distinct collection slugs among first-pass NFTs with at least 20 sales.
df_list[0].loc[df_list[0]['num_sales'] >= 20, 'collection_slug'].unique().size

1110

Concatenate the three per-pass DataFrames into one, which will be used to find every unique collection slug associated with any NFT that had at least 20 sales at the time of data collection in any of the three passes.

In [13]:
# Stack the per-pass DataFrames row-wise.  Each frame keeps its own row labels,
# so index values repeat across passes.
df_full = pd.concat(df_list)

In [15]:
# Distinct collection slugs, across all passes, of NFTs with at least 20 sales.
top_collections = list(df_full.loc[df_full['num_sales'] >= 20, 'collection_slug'].unique())

In [21]:
# Wrap the slug list in a one-column DataFrame ('slug') so it can be saved as CSV.
top_collections_df = pd.DataFrame(top_collections, columns=['slug'])

In [22]:
# Save the slugs to top_collections.csv in the working directory, without the row index.
top_collections_df.to_csv('./top_collections.csv', index=False)

In [25]:
# Keep only the first ten slugs as a small sample.
top_collections_test = top_collections_df.head(10)

In [29]:
# Save the ten-slug sample to top_collections_test.csv in the working directory, without the row index.
top_collections_test.to_csv('./top_collections_test.csv', index=False)

### Section III: Combining the Three Passes into One DataFrame ###