In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
twitter = pd.read_csv('data/twitter_data.csv' , index_col=None, header=0, lineterminator='\n')
print(twitter.shape)
twitter.head()

(62997, 79)


Unnamed: 0,followers_count_mean,followers_count_median,followers_count_min,followers_count_max,friends_count_mean,friends_count_median,friends_count_min,friends_count_max,listed_count_mean,listed_count_median,...,number_of_photos_min,number_of_photos_max,number_of_urls_mean,number_of_urls_median,number_of_urls_min,number_of_urls_max,has_nft,has_crypto,number_of_tweets,permalink
0,334.0,334.0,334,334,1368.0,1368.0,1368,1368,2.0,2.0,...,0,0,1.0,1.0,1,1,0,0,1,https://opensea.io/assets/0x005c1cfc36e5ec711b...
1,2901.0,2901.0,2901,2901,3027.0,3027.0,3027,3027,19.0,19.0,...,0,0,1.0,1.0,1,1,0,0,1,https://opensea.io/assets/0x005c1cfc36e5ec711b...
2,1627.0,1627.0,1627,1627,908.0,908.0,908,908,16.0,16.0,...,0,0,1.0,1.0,1,1,0,0,2,https://opensea.io/assets/0x005efb3633638dd0dd...
3,76.0,76.0,76,76,20.0,20.0,20,20,1.0,1.0,...,0,0,1.0,1.0,1,1,0,0,1,https://opensea.io/assets/0x0076b645920716be2a...
4,47.0,47.0,47,47,215.0,215.0,215,215,4.0,4.0,...,0,0,1.0,1.0,1,1,0,0,1,https://opensea.io/assets/0x00c719960bfcb4286d...


In [3]:
opensea = pd.read_csv('data/opensea_data.csv' , index_col=0, header=0, lineterminator='\n')
print(opensea.shape)
opensea.head()

(62997, 47)


Unnamed: 0,permalink,sell_orders,top_bid,listing_date,is_presale,transfer_fee,supports_wyvern,numEvents,transfer,successful,...,market_cap,floor_price,is_collection_verified,is_subject_to_whitelist,collection_name,creation_day,creation_month,creation_year,price_label,sale_label
0,https://opensea.io/assets/0xc7e5e9434f4a71e6db...,0.0,0.0,0.0,0,0.0,1,1,1,0,...,289.641,0,1,0,971,2,1,2021,0,0
1,https://opensea.io/assets/0x495f947276749ce646...,0.0,0.0,0.0,1,0.0,1,1,1,0,...,0.0,0,0,0,7291,2,1,2021,0,0
2,https://opensea.io/assets/0x495f947276749ce646...,0.0,0.0,0.0,1,0.0,1,1,1,0,...,0.0,0,0,0,7291,3,1,2021,0,0
3,https://opensea.io/assets/0xd07dc4262bcdbf8519...,0.0,0.0,0.0,0,0.0,1,6,1,0,...,3827521000000000.0,0,1,0,11248,3,1,2021,0,0
5,https://opensea.io/assets/0xd07dc4262bcdbf8519...,0.0,0.0,0.0,0,0.0,1,2,2,0,...,2838661000000000.0,0,1,0,11248,2,1,2021,0,0


In [4]:
df = twitter.merge(opensea, on='permalink')
print(df.shape)
df.head()

(62997, 125)


Unnamed: 0,followers_count_mean,followers_count_median,followers_count_min,followers_count_max,friends_count_mean,friends_count_median,friends_count_min,friends_count_max,listed_count_mean,listed_count_median,...,market_cap,floor_price,is_collection_verified,is_subject_to_whitelist,collection_name,creation_day,creation_month,creation_year,price_label,sale_label
0,334.0,334.0,334,334,1368.0,1368.0,1368,1368,2.0,2.0,...,0.0,0,1,0,4818,23,3,2021,0,0
1,2901.0,2901.0,2901,2901,3027.0,3027.0,3027,3027,19.0,19.0,...,0.0,0,1,0,4818,23,3,2021,0,0
2,1627.0,1627.0,1627,1627,908.0,908.0,908,908,16.0,16.0,...,0.0,0,1,0,12051,5,1,2021,4,1
3,76.0,76.0,76,76,20.0,20.0,20,20,1.0,1.0,...,0.0,0,0,0,12410,26,3,2021,0,0
4,47.0,47.0,47,47,215.0,215.0,215,215,4.0,4.0,...,0.0,0,0,0,7921,17,3,2021,3,1


## Modeling only with all data

Here we use both twitter and opensea features in order to try to achieve better performance.

In [5]:
df['price_label'].value_counts()

0    48183
2     8391
1     4000
3     2189
4      211
5       23
Name: price_label, dtype: int64

### Missing values

We don't have missing values as we handle data quality during the data acquisition step.

In [6]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True)
missing_value_df.reset_index(drop=True)
full_missing_value_df = missing_value_df[missing_value_df['percent_missing'] > 0 ]
full_missing_value_df

Unnamed: 0,column_name,percent_missing


### Categorical variables

There is no categorical variables to handle because twitter dataset is made of counts and other aggregations statistics.

In [7]:
df.drop("permalink",axis=1,inplace=True)
df.dtypes.value_counts()

int64      67
float64    57
dtype: int64

### Classification step

Here, we use a tree based model that is known to be very effective for this kind of tasks. Hence, we don't need to handle multicolinearity, outliers and normalization fo the data. H

In [8]:
n = 1500 

df.loc[df['price_label'] == 5] = 4
df_subsampled0 = df[df['price_label'] == 4]
df_subsampled1 = df[df['price_label'] == 0].sample(n=n, random_state=0)
df_subsampled2 = df[df['price_label'] == 1].sample(n=n, random_state=0)
df_subsampled3 = df[df['price_label'] == 2].sample(n=n, random_state=0)
df_subsampled4 = df[df['price_label'] == 3].sample(n=n, random_state=0)

df_concatenated = pd.concat([df_subsampled0, df_subsampled1, df_subsampled2, df_subsampled3, df_subsampled4])
df_concatenated['price_label'].value_counts()

0    1500
1    1500
2    1500
3    1500
4     234
Name: price_label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Define the target
X = df_concatenated.drop(['price_label', 'avg_selling_price'],axis=1)
y = df_concatenated.price_label

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)



### Balance the classes 

First we have to manually subsample the majority class and then we can use an algorithm.

ADASYN (Adaptive Synthetic Sampling) is a type of oversampling technique for imbalanced datasets. Unlike traditional oversampling techniques like SMOTE which oversamples the minority class with a fixed ratio, ADASYN dynamically adjusts the oversampling ratio for minority samples based on their difficulty in being classified.

The idea behind ADASYN is to oversample the minority samples that are difficult to be classified correctly. These samples are usually located near the decision boundary between classes and may cause the classifier to be biased towards the majority class. By oversampling these samples, the classifier can be trained to better handle the minority class.

In ADASYN, a density distribution is calculated for the minority samples based on their distances to their k-nearest neighbors. The samples with higher density are more likely to be oversampled. This way, the oversampling ratio is adjusted based on the difficulty of the minority samples, rather than a fixed ratio.

In [10]:
y_train.value_counts()

2    1200
1    1200
3    1200
0    1200
4     187
Name: price_label, dtype: int64

In [11]:
from imblearn.over_sampling import ADASYN

adasyn = ADASYN(sampling_strategy='minority', random_state=0)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
y_resampled.value_counts()

4    1246
2    1200
1    1200
3    1200
0    1200
Name: price_label, dtype: int64

In [13]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV


clf = lgb.LGBMClassifier()
        
# Random Search for Hyperparameters
param_grid = {
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'num_leaves': [32, 64, 128, 256],
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.5],
    'min_child_samples': [10, 20, 30, 40],
    'n_estimators': [50, 100, 200, 500, 750],
    'verbose': [-1]
    }

clf_random = RandomizedSearchCV(estimator = clf, param_distributions = param_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_random.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.4min finished


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=30, n_jobs=-1,
                   param_distributions={'boosting_type': ['gbdt'],
                                        'learning_rate': [0.05, 0.1, 0.15, 0.2,
                                                          0.5],
                                        'min_child_samples': [10, 20, 30, 40],
                                        'n_estimators': [50, 100, 200, 500,
                                                         750],
                                        'num_leaves': [32, 64, 128, 256],
                                        'objective': ['binary'],
                                        'verbose': [-1]},
                   random_state=42, verbose=2)

In [14]:
clf_random.best_params_

{'verbose': -1,
 'objective': 'binary',
 'num_leaves': 256,
 'n_estimators': 200,
 'min_child_samples': 20,
 'learning_rate': 0.2,
 'boosting_type': 'gbdt'}

In [15]:
# Get the best hyperparameters
best_clf = clf_random.best_estimator_

# Predict the target using the best classifier
y_pred = best_clf.predict(X_test)


### Results interpretation

In [17]:
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report


# Calculate the evaluation metrics 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1, '\n')

# Print classification report
print("Classification Report:\n\n", classification_report(y_test , y_pred, digits=3))


Accuracy: 0.8323977546110666
Precision: 0.8178213271201047
Recall: 0.792482269503546
F1 Score: 0.8034103661426928 

Classification Report:

               precision    recall  f1-score   support

           0      1.000     1.000     1.000       300
           1      0.856     0.830     0.843       300
           2      0.697     0.750     0.722       300
           3      0.800     0.787     0.793       300
           4      0.737     0.596     0.659        47

    accuracy                          0.832      1247
   macro avg      0.818     0.792     0.803      1247
weighted avg      0.834     0.832     0.833      1247



All our metrics have improved by adding Opensea data to the equation. For instance, accuracy jumped by 9 points.

 