In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('data/secondary_sale_dataset.csv')
test_ratio = 0.1
use_smote = True
use_scaler = True

In [3]:
def split_categories(df):
    categorized_data = {}
    for category in df['Category'].unique():
        categorized_data[category] = {
            'df': df[df['Category'] == category].copy(deep=True)
        }
    categorized_data['All'] = {
        'df' : df.copy(deep=True)
    }
    
    return categorized_data

# prepare data
categorized_data = split_categories(df)
feature_columns = ['0','1','2','3','4','5','6','week_1','centrality_buyer','centrality_seller','p_resale']
label_column = 'secondary_sale'
for key, item in categorized_data.items():
    category_df = item['df']
    X = category_df[feature_columns].values
    y = category_df[label_column].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    categorized_data[key]['X_train'] = X_train
    categorized_data[key]['X_test'] = X_test
    categorized_data[key]['y_train'] = y_train
    categorized_data[key]['y_test'] = y_test
    
for key, item in categorized_data.items():
    print(f'{key}\n\tTrain count: {len(item["X_train"]):,}\n\tTest count: {len(item["X_test"]):,}')

Games
	Train count: 1,184,546
	Test count: 131,617
Art
	Train count: 1,009,301
	Test count: 112,145
Other
	Train count: 95,619
	Test count: 10,625
Collectible
	Train count: 948,886
	Test count: 105,432
Utility
	Train count: 3,747
	Test count: 417
Metaverse
	Train count: 30,467
	Test count: 3,386
All
	Train count: 3,272,569
	Test count: 363,619


In [4]:
# fit model Adaboost
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    
    print(f'Training {category}')
    if use_scaler:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
  
    if use_smote:
        oversample = SMOTE()
        X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
    else:
        X_train_res, y_train_res = np.array(X_train), np.array(y_train)
    
    abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
    abc.fit(X_train_res, y_train_res)
    y_pred = abc.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    results.loc[category,'accuracy'] = accuracy
    results.loc[category,'recall'] = recall_score(y_test, y_pred)
    results.loc[category,'f1'] = f1_score(y_test,y_pred)
    results.loc[category,'precision'] = precision_score(y_test,y_pred)
    results.loc[category,'confusion_matrix'] = confusion_matrix(y_test, y_pred)
    print("\tAccuracy:" ,accuracy)
    print("\tRecall:", recall_score(y_test, y_pred))
    print("\tF1 score:",f1_score(y_test,y_pred))
    print("\tPresicion", precision_score(y_test,y_pred))
    print("\tconfusion_matrix", confusion_matrix(y_test, y_pred))

Training Games
	Accuracy: 0.6828905080650676
	Recall: 0.5840536309807391
	F1 score: 0.45128380421492714
	Presicion 0.3676971527733145
	confusion_matrix [[72717 29514]
 [12223 17163]]
Training Art
	Accuracy: 0.7262383521333987
	Recall: 0.7342400315373289
	F1 score: 0.8048909140594716
	Presicion 0.8905858858605462
	confusion_matrix [[18118  7780]
 [22921 63326]]
Training Other
	Accuracy: 0.6193882352941177
	Recall: 0.6077081899518239
	F1 score: 0.3039586919104991
	Presicion 0.20266238237319256
	confusion_matrix [[5698 3474]
 [ 570  883]]
Training Collectible
	Accuracy: 0.7298448288944533
	Recall: 0.5328940833692017
	F1 score: 0.5305490086199793
	Presicion 0.5282244830981293
	confusion_matrix [[60854 14375]
 [14108 16095]]
Training Utility
	Accuracy: 0.6330935251798561
	Recall: 0.6428571428571429
	F1 score: 0.3703703703703703
	Presicion 0.26011560693641617
	confusion_matrix [[219 128]
 [ 25  45]]
Training Metaverse
	Accuracy: 0.7058476077968104
	Recall: 0.7286401925391095
	F1 score: 0.708

In [5]:
# fit model xgboost
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    
    print(f'Training {category}') 
    if use_scaler:
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
  
    if use_smote:
        oversample = SMOTE()
        X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
    else:
        X_train_res, y_train_res = np.array(X_train), np.array(y_train)
    
    if category == "Utility":
        model = XGBClassifier(reg_lambda= 0, reg_alpha = 1, max_depth=15, learning_rate= 0.1, colsample_bytree = 1)
    elif category == "Collectible":
        model = XGBClassifier(reg_lambda= 0.1, reg_alpha = 1, max_depth=15, learning_rate= 0.1)
    else:
        model = XGBClassifier()
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    results.loc[category,'accuracy'] = accuracy
    results.loc[category,'recall'] = recall_score(y_test, y_pred)
    results.loc[category,'f1'] = f1_score(y_test,y_pred)
    results.loc[category,'precision'] = precision_score(y_test,y_pred)
    results.loc[category,'confusion_matrix'] = confusion_matrix(y_test, y_pred)
    
    print("\tAccuracy:" ,accuracy)
    print("\tRecall:", recall_score(y_test, y_pred))
    print("\tF1 score:",f1_score(y_test,y_pred))
    print("\tPresicion", precision_score(y_test,y_pred))
    print("\tconfusion_matrix", confusion_matrix(y_test, y_pred))

Training Games
	Accuracy: 0.7390306723295623
	Recall: 0.6608589124072688
	F1 score: 0.5306880909438706
	Presicion 0.4433587507419752
	confusion_matrix [[77849 24382]
 [ 9966 19420]]
Training Art
	Accuracy: 0.810424004636854
	Recall: 0.8265330968033671
	F1 score: 0.8702329215293715
	Presicion 0.918811625958626
	confusion_matrix [[19599  6299]
 [14961 71286]]
Training Other
	Accuracy: 0.7452235294117647
	Recall: 0.5691672401927047
	F1 score: 0.37927080944737446
	Presicion 0.2843878954607978
	confusion_matrix [[7091 2081]
 [ 626  827]]
Training Collectible
	Accuracy: 0.7925202974429015
	Recall: 0.6702976525510711
	F1 score: 0.6492423635051712
	Presicion 0.6294695603507244
	confusion_matrix [[63312 11917]
 [ 9958 20245]]
Training Utility
	Accuracy: 0.7553956834532374
	Recall: 0.34285714285714286
	F1 score: 0.32
	Presicion 0.3
	confusion_matrix [[291  56]
 [ 46  24]]
Training Metaverse
	Accuracy: 0.7607796810395747
	Recall: 0.7545126353790613
	F1 score: 0.755877034358047
	Presicion 0.757246

In [6]:
# fit model RandomForest
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    print(f'Training {category}')

    if use_scaler:
       sc = StandardScaler()
       X_train = sc.fit_transform(X_train)
       X_test = sc.transform(X_test)
  
    if use_smote:
       oversample = SMOTE()
       X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
    else:
       X_train_res, y_train_res = np.array(X_train), np.array(y_train)
    
    rf=RandomForestClassifier(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
        
    accuracy = accuracy_score(y_test, y_pred)
    results.loc[category,'accuracy'] = accuracy
    results.loc[category,'recall'] = recall_score(y_test, y_pred)
    results.loc[category,'f1'] = f1_score(y_test,y_pred)
    results.loc[category,'precision'] = precision_score(y_test,y_pred)
    results.loc[category,'confusion_matrix'] = confusion_matrix(y_test, y_pred)
    
    print("\tAccuracy:", accuracy)
    print("\tRecall:", recall_score(y_test, y_pred))
    print("\tF1 score:", f1_score(y_test,y_pred))
    print("\tPresicion", precision_score(y_test,y_pred))
    print("\tconfusion_matrix", confusion_matrix(y_test, y_pred))

Training Games
	Accuracy: 0.8477932182012962
	Recall: 0.49179881576260803
	F1 score: 0.5906369413736028
	Presicion 0.739194926090737
	confusion_matrix [[97132  5099]
 [14934 14452]]
Training Art
	Accuracy: 0.8503187837175086
	Recall: 0.9317773371827426
	F1 score: 0.9054373788814277
	Presicion 0.8805456637265107
	confusion_matrix [[14996 10902]
 [ 5884 80363]]
Training Other
	Accuracy: 0.8881882352941176
	Recall: 0.2505161734342739
	F1 score: 0.37995824634655523
	Presicion 0.7861771058315334
	confusion_matrix [[9073   99]
 [1089  364]]
Training Collectible
	Accuracy: 0.8189449123605737
	Recall: 0.5550773102009734
	F1 score: 0.6372222961287747
	Presicion 0.7479032833690221
	confusion_matrix [[69578  5651]
 [13438 16765]]
Training Utility
	Accuracy: 0.841726618705036
	Recall: 0.18571428571428572
	F1 score: 0.28260869565217395
	Presicion 0.5909090909090909
	confusion_matrix [[338   9]
 [ 57  13]]
Training Metaverse
	Accuracy: 0.7660956881275842
	Recall: 0.7779783393501805
	F1 score: 0.7655

In [7]:
# fit model Logistic
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    print(f'Training {category}')
    
    if use_scaler:
       sc = StandardScaler()
       X_train = sc.fit_transform(X_train)
       X_test = sc.transform(X_test)
  
    if use_smote:
       oversample = SMOTE()
       X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
    else:
       X_train_res, y_train_res = np.array(X_train), np.array(y_train)
    
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    results.loc[category,'accuracy'] = accuracy
    results.loc[category,'recall'] = recall_score(y_test, y_pred)
    results.loc[category,'f1'] = f1_score(y_test,y_pred)
    results.loc[category,'precision'] = precision_score(y_test,y_pred)
    results.loc[category,'confusion_matrix'] = confusion_matrix(y_test, y_pred)
  
    print("\tAccuracy:" ,accuracy)
    print("\tRecall:", recall_score(y_test, y_pred))
    print("\tF1 score:",f1_score(y_test,y_pred))
    print("\tPresicion", precision_score(y_test,y_pred))
    print("\tconfusion_matrix", confusion_matrix(y_test, y_pred))

Training Games
	Accuracy: 0.6353434586717521
	Recall: 0.5478118832096917
	F1 score: 0.40149143918893637
	Presicion 0.3168585769117213
	confusion_matrix [[67524 34707]
 [13288 16098]]
Training Art
	Accuracy: 0.5385527665076464
	Recall: 0.4808515078785349
	F1 score: 0.6158003756691142
	Presicion 0.8560459067828097
	confusion_matrix [[18924  6974]
 [44775 41472]]
Training Other
	Accuracy: 0.5773176470588235
	Recall: 0.6111493461803166
	F1 score: 0.2833891814265198
	Presicion 0.1844619858745326
	confusion_matrix [[5246 3926]
 [ 565  888]]
Training Collectible
	Accuracy: 0.6687153805296304
	Recall: 0.49428864682316326
	F1 score: 0.46087117587133025
	Presicion 0.43168608854061247
	confusion_matrix [[55575 19654]
 [15274 14929]]
Training Utility
	Accuracy: 0.5251798561151079
	Recall: 0.7
	F1 score: 0.33108108108108103
	Presicion 0.2168141592920354
	confusion_matrix [[170 177]
 [ 21  49]]
Training Metaverse
	Accuracy: 0.642055522740697
	Recall: 0.5282791817087846
	F1 score: 0.5916442048517521
