In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Fetch the dataset with the gdown CLI; the argument is the remote file id.
# Presumably this writes secondary_sale_dataset.csv into the working directory,
# which is the file the next line reads -- TODO confirm the downloaded file name.
!gdown 1-2gXdkQELPxQXdUOwN8F1dR2vgke60qW
# Load the CSV into `df`, the frame that is split per category further down.
df = pd.read_csv('secondary_sale_dataset.csv')

In [None]:
def split_categories(df):
  """Group the rows of `df` by their 'Category' value.

  Returns a dict with one entry per distinct value of df['Category'] (in order
  of first appearance) plus a final 'All' entry. Every entry is itself a dict
  of the form {'df': frame}, where `frame` is a deep copy of the matching rows
  (or of the whole input for 'All'), so callers can modify it without touching
  the input frame.
  """
  per_category = {
      name: {'df': df.loc[df['Category'] == name].copy(deep=True)}
      for name in df['Category'].unique()
  }
  # Added last so that it follows the individual categories in iteration order.
  per_category['All'] = {'df': df.copy(deep=True)}
  return per_category

# Prepare data: one frame per category (plus 'All'), each with its own
# train/test split stored next to the frame under the keys
# 'X_train', 'X_test', 'y_train' and 'y_test'.
label_column = 'secondary_sale'
feature_columns = [
    '0', '1', '2', '3', '4', '5', '6',
    'week_1', 'centrality_buyer', 'centrality_seller', 'p_resale',
]
categorized_data = split_categories(df)

for key in categorized_data:
  item = categorized_data[key]
  category_df = item['df']
  X, y = category_df[feature_columns].values, category_df[label_column].values
  # 5% of each category is held out; the fixed seed keeps the split repeatable.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
  item.update(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

# Report the size of every split once all categories have been processed.
for key, item in categorized_data.items():
  print(f'{key}\n\tTrain count: {len(item["X_train"]):,}\n\tTest count: {len(item["X_test"]):,}')

Games
	Train count: 1,250,354
	Test count: 65,809
Art
	Train count: 1,065,373
	Test count: 56,073
Other
	Train count: 100,931
	Test count: 5,313
Collectible
	Train count: 1,001,602
	Test count: 52,716
Utility
	Train count: 3,955
	Test count: 209
Metaverse
	Train count: 32,160
	Test count: 1,693
All
	Train count: 3,454,378
	Test count: 181,810


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Fit one AdaBoost classifier per category on SMOTE-balanced training data and
# score it on that category's untouched test split.
# NOTE: `results` is re-created by every model cell in this notebook, so after
# this cell it only holds the AdaBoost metrics.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Only the training split is oversampled. Both the sampler and the model are
  # seeded (same seed as the train/test split) so reruns give the same numbers.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)
  abc.fit(X_train_res, y_train_res)
  y_pred = abc.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  conf_mat = confusion_matrix(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  results.loc[category,'confusion_matrix'] = conf_mat
  print("\tAccuracy:" ,accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)
  print("\tconfusion_matrix", conf_mat)

Training Games
	Accuracy: 0.6912124481453904
	Recall: 0.5619789531228646
	F1 score: 0.44733335146454895
	Presicion 0.3715382877795347
	confusion_matrix [[37264 13911]
 [ 6410  8224]]
Training Art
	Accuracy: 0.7245198223744048
	Recall: 0.7317118541598847
	F1 score: 0.8029292065881634
	Presicion 0.8895044803120672
	confusion_matrix [[ 9158  3909]
 [11538 31468]]
Training Other
	Accuracy: 0.6350461133069829
	Recall: 0.5663338088445078
	F1 score: 0.29052323454079765
	Presicion 0.1953740157480315
	confusion_matrix [[2977 1635]
 [ 304  397]]
Training Collectible
	Accuracy: 0.7467182639046969
	Recall: 0.4765433732548137
	F1 score: 0.5189508574722582
	Presicion 0.5696432808668829
	confusion_matrix [[32162  5441]
 [ 7911  7202]]
Training Utility
	Accuracy: 0.631578947368421
	Recall: 0.5681818181818182
	F1 score: 0.3937007874015748
	Presicion 0.30120481927710846
	confusion_matrix [[107  58]
 [ 19  25]]
Training Metaverse
	Accuracy: 0.7023036030714708
	Recall: 0.7276029055690073
	F1 score: 0.7045

In [None]:
from xgboost import XGBClassifier

In [None]:
# Fit XGBoost with hand-picked hyper-parameters on the 'Utility' category only.
# NOTE: `results` is re-created by every model cell in this notebook; here only
# the 'Utility' row is filled, the other rows stay empty.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  if category != 'Utility':
    continue
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Only the training split is oversampled. Sampler and model are seeded (same
  # seed as the train/test split) so reruns give the same numbers.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  model = XGBClassifier(reg_lambda=0, reg_alpha=1, max_depth=15, learning_rate=0.1, colsample_bytree=1, random_state=42)
  model.fit(X_train_res, y_train_res)
  y_pred = model.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  conf_mat = confusion_matrix(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  results.loc[category,'confusion_matrix'] = conf_mat
  print("\tAccuracy:" ,accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)
  print("\tconfusion_matrix", conf_mat)

Training Utility
	Accuracy: 0.8038277511961722
	Recall: 0.4090909090909091
	F1 score: 0.4675324675324675
	Presicion 0.5454545454545454
	confusion_matrix [[150  15]
 [ 26  18]]


In [None]:
# Fit XGBoost with hand-picked hyper-parameters on the 'Collectible' category only.
# NOTE: `results` is re-created by every model cell in this notebook; here only
# the 'Collectible' row is filled, the other rows stay empty.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  if category != 'Collectible':
    continue
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Only the training split is oversampled. Sampler and model are seeded (same
  # seed as the train/test split) so reruns give the same numbers.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  model = XGBClassifier(reg_lambda=0.1, reg_alpha=1, max_depth=15, learning_rate=0.1, random_state=42)
  model.fit(X_train_res, y_train_res)
  y_pred = model.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  conf_mat = confusion_matrix(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  results.loc[category,'confusion_matrix'] = conf_mat
  print("\tAccuracy:" ,accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)
  print("\tconfusion_matrix", conf_mat)

Training Collectible


In [None]:
# Fit XGBoost with library-default hyper-parameters on the 'Utility' category only.
# NOTE(review): the output recorded under this cell lists every category, so it
# was presumably produced before the 'Utility' filter was added -- confirm and
# re-run, or drop the filter if all categories are wanted.
# NOTE: `results` is re-created by every model cell in this notebook; here only
# the 'Utility' row is filled, the other rows stay empty.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  if category != 'Utility':
    continue
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Only the training split is oversampled. Sampler and model are seeded (same
  # seed as the train/test split) so reruns give the same numbers.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  model = XGBClassifier(random_state=42)
  model.fit(X_train_res, y_train_res)
  y_pred = model.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  conf_mat = confusion_matrix(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  results.loc[category,'confusion_matrix'] = conf_mat
  print("\tAccuracy:" ,accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)
  print("\tconfusion_matrix", conf_mat)

Training Games
	Accuracy: 0.701226276041271
	Recall: 0.5884242175755091
	F1 score: 0.46692332718794055
	Presicion 0.38701123595505615
	confusion_matrix [[37536 13639]
 [ 6023  8611]]
Training Art
	Accuracy: 0.7662333030157117
	Recall: 0.7699390782681487
	F1 score: 0.834770332274492
	Presicion 0.9115234267466829
	confusion_matrix [[ 9853  3214]
 [ 9894 33112]]
Training Other
	Accuracy: 0.655561829474873
	Recall: 0.5606276747503567
	F1 score: 0.3004587155963303
	Presicion 0.20522193211488252
	confusion_matrix [[3090 1522]
 [ 308  393]]
Training Collectible
	Accuracy: 0.7396236436755445
	Recall: 0.5409250314298948
	F1 score: 0.5436228221838011
	Presicion 0.5463476575553031
	confusion_matrix [[30815  6788]
 [ 6938  8175]]
Training Utility
	Accuracy: 0.722488038277512
	Recall: 0.5454545454545454
	F1 score: 0.45283018867924524
	Presicion 0.3870967741935484
	confusion_matrix [[127  38]
 [ 20  24]]
Training Metaverse
	Accuracy: 0.7395156526875369
	Recall: 0.7312348668280871
	F1 score: 0.732565

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit one random forest per category and score it on that category's test split.
# Unlike the other model cells, the forest is trained on the original training
# data (no SMOTE oversampling) and no confusion matrix is stored or printed, so
# the 'confusion_matrix' column of `results` stays empty here.
# NOTE: `results` is re-created by every model cell in this notebook, so after
# this cell it only holds the random-forest metrics.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Seeded (same seed as the train/test split) so bootstrap samples and feature
  # sub-sampling, and therefore the reported metrics, are reproducible.
  rf = RandomForestClassifier(n_estimators=100, random_state=42)
  rf.fit(X_train, y_train)
  y_pred = rf.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  print("\tAccuracy:", accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)

Training Games
	Accuracy: 0.848774483733228
	Recall: 0.494806614732814
	F1 score: 0.5926986985348285
	Presicion 0.7388775510204082
Training Art
	Accuracy: 0.8508729691651954
	Recall: 0.9327070641305865
	F1 score: 0.9056058519404873
	Presicion 0.8800351031154015
Training Other
	Accuracy: 0.8959156785243741
	Recall: 0.2796005706134094
	F1 score: 0.4148148148148148
	Presicion 0.8032786885245902
Training Collectible
	Accuracy: 0.8201115410880947
	Recall: 0.5593859591080527
	F1 score: 0.6406729566897806
	Presicion 0.7496009930838802
Training Utility
	Accuracy: 0.8038277511961722
	Recall: 0.22727272727272727
	F1 score: 0.32786885245901637
	Presicion 0.5882352941176471
Training Metaverse
	Accuracy: 0.7725930301240401
	Recall: 0.7966101694915254
	F1 score: 0.7736625514403294
	Presicion 0.752
Training All
	Accuracy: 0.8239975798910951
	Recall: 0.7553347951652584
	F1 score: 0.7792944097665275
	Presicion 0.804823842833331


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit one logistic regression per category on standardised, SMOTE-balanced
# training data and score it on that category's (standardised) test split.
# NOTE: `results` is re-created by every model cell in this notebook, so after
# this cell it only holds the logistic-regression metrics.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  X_train = category_data['X_train']
  y_train = category_data['y_train']
  X_test = category_data['X_test']
  y_test = category_data['y_test']
  print(f'Training {category}')
  # Standard scaler: fitted on the training split only, then applied to both
  # splits. The arrays stored in `categorized_data` are not modified; only the
  # local names are rebound to the scaled copies.
  sc = StandardScaler()
  X_train = sc.fit_transform(X_train)
  X_test = sc.transform(X_test)
  # SMOTE: only the (scaled) training split is oversampled. Sampler and model
  # are seeded (same seed as the train/test split) so reruns give the same numbers.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  model2 = LogisticRegression(solver='liblinear', random_state=42)
  model2.fit(X_train_res, y_train_res)
  y_pred = model2.predict(X_test)
  # Each metric is computed once and reused for the table and the log output.
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  conf_mat = confusion_matrix(y_test, y_pred)
  results.loc[category,'accuracy'] = accuracy
  results.loc[category,'recall'] = recall
  results.loc[category,'f1'] = f1
  results.loc[category,'precision'] = precision
  results.loc[category,'confusion_matrix'] = conf_mat
  print("\tAccuracy:" ,accuracy)
  print("\tRecall:", recall)
  print("\tF1 score:", f1)
  print("\tPresicion", precision)
  print("\tconfusion_matrix", conf_mat)

Training Games
	Accuracy: 0.6357793007035512
	Recall: 0.5440754407544075
	F1 score: 0.39916777379490137
	Presicion 0.31521437903321586
	confusion_matrix [[33878 17297]
 [ 6672  7962]]
Training Art
	Accuracy: 0.5370677509674888
	Recall: 0.4775845230898014
	F1 score: 0.6127752252521033
	Presicion 0.8547232625884311
	confusion_matrix [[ 9576  3491]
 [22467 20539]]
Training Other
	Accuracy: 0.5795219273480143
	Recall: 0.6248216833095578
	F1 score: 0.28167202572347266
	Presicion 0.18181818181818182
	confusion_matrix [[2641 1971]
 [ 263  438]]
Training Collectible
	Accuracy: 0.6683739282191365
	Recall: 0.496923178720307
	F1 score: 0.4621254076672205
	Presicion 0.4318822243947323
	confusion_matrix [[27724  9879]
 [ 7603  7510]]
Training Utility
	Accuracy: 0.49282296650717705
	Recall: 0.7045454545454546
	F1 score: 0.36904761904761907
	Presicion 0.25
	confusion_matrix [[72 93]
 [13 31]]
Training Metaverse
	Accuracy: 0.629060838747785
	Recall: 0.5060532687651331
	F1 score: 0.5710382513661202
	Pr