# STEP 1 : Install and Import Libraries

In [None]:
# Data processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model and performance evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

#OverSampling
from imblearn.over_sampling import RandomOverSampler, SMOTE

# STEP 2 : Download and Prepare Data

In [None]:
# Shell command: fetch the dataset by file id with the gdown CLI.
# NOTE(review): the file name/location gdown writes to is not visible here —
# confirm the download actually lands at the path read on the next line.
!gdown 1-2gXdkQELPxQXdUOwN8F1dR2vgke60qW
# Load the dataset into a DataFrame; `df` is the raw input for the category split below.
df = pd.read_csv('data/secondary_sale_dataset.csv')

In [None]:
def split_categories(df):
  """Group the rows of `df` by the value of its 'Category' column.

  Returns a dict mapping each distinct category value (in order of first
  appearance) to ``{'df': <deep copy of the rows in that category>}``, followed
  by an extra ``'All'`` entry holding a deep copy of the whole frame. The
  copies are independent of `df`, so callers may modify them freely.
  """
  per_category = {
      name: {'df': df.loc[df['Category'] == name].copy(deep=True)}
      for name in df['Category'].unique()
  }
  # Pseudo-category covering every row, regardless of category.
  per_category['All'] = {'df': df.copy(deep=True)}
  return per_category

# Prepare data: build, for every category (plus 'All'), a train/test split and
# store it next to the category's DataFrame in `categorized_data`.
categorized_data = split_categories(df)
feature_columns = ['0','1','2','3','4','5','6','week_1','centrality_buyer','centrality_seller','p_resale']
label_column = 'secondary_sale'
for key, item in categorized_data.items():
  category_df = item['df']
  X = category_df[feature_columns].values
  y = category_df[label_column].values
  # Hold out 5% of the rows as the test set (seeded); the test set is left
  # untouched, i.e. it is not oversampled.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
  # SMOTE oversampling of the training split only. The sampler is seeded so the
  # synthetic samples — and therefore every metric computed downstream — are
  # reproducible across kernel restarts.
  oversample = SMOTE(random_state=42)
  X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)
  # NOTE(review): the stored training data is already oversampled, so any later
  # cross-validation on it validates on synthetic samples.
  item['X_train'] = X_train_res
  item['X_test'] = X_test
  item['y_train'] = y_train_res
  item['y_test'] = y_test
# Report the (oversampled) train size and the test size per category.
for key, item in categorized_data.items():
  print(f'{key}\n\tTrain count: {len(item["X_train"]):,}\n\tTest count: {len(item["X_test"]):,}')

Games
	Train count: 1,938,626
	Test count: 65,809
Art
	Train count: 1,638,290
	Test count: 56,073
Other
	Train count: 173,102
	Test count: 5,313
Collectible
	Train count: 1,427,352
	Test count: 52,716
Utility
	Train count: 6,470
	Test count: 209
Metaverse
	Train count: 33,464
	Test count: 1,693
All
	Train count: 4,072,412
	Test count: 181,810


# Step 3: XGBoost Classifier With No Hyperparameter Tuning

In [None]:
# Instantiate an XGBoost classifier with the library's default hyperparameters.
# This same instance is refit per category in the baseline loop and is passed
# as the estimator to the grid searches further down.
model = XGBClassifier()
# Show the default settings (last expression of the cell, so it is rendered as output).
model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [None]:
# Baseline: fit the default XGBoost model on each category's training split and
# score it on that category's held-out test split. Scores are collected in
# `results` (one row per category) and echoed to the cell output.
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
  X_train, y_train = category_data['X_train'], category_data['y_train']
  X_test, y_test = category_data['X_test'], category_data['y_test']
  print(f'Training {category}')
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  # Compute every metric once; the same values feed the table and the printout.
  scores = {
      'accuracy': accuracy_score(y_test, y_pred),
      'recall': recall_score(y_test, y_pred),
      'f1': f1_score(y_test, y_pred),
      'precision': precision_score(y_test, y_pred),
      'confusion_matrix': confusion_matrix(y_test, y_pred),
  }
  for metric_name, metric_value in scores.items():
    results.loc[category, metric_name] = metric_value
  print("\tAccuracy:", scores['accuracy'])
  print("\tRecall:", scores['recall'])
  print("\tF1 score:", scores['f1'])
  print("\tPresicion", scores['precision'])
  print("\tconfusion_matrix", scores['confusion_matrix'])

Training Games
	Accuracy: 0.7016821407406282
	Recall: 0.5891075577422441
	F1 score: 0.4675923414872268
	Presicion 0.38763489208633095
	confusion_matrix [[37556 13619]
 [ 6013  8621]]
Training Art
	Accuracy: 0.7687122144347547
	Recall: 0.7723340929172674
	F1 score: 0.8366604113401932
	Presicion 0.9126755145220234
	confusion_matrix [[ 9889  3178]
 [ 9791 33215]]
Training Other
	Accuracy: 0.6627140974967062
	Recall: 0.5392296718972895
	F1 score: 0.2967032967032967
	Presicion 0.2046561992420141
	confusion_matrix [[3143 1469]
 [ 323  378]]
Training Collectible
	Accuracy: 0.7403824265877532
	Recall: 0.5395354992390657
	F1 score: 0.5437087417483497
	Presicion 0.5479470465694509
	confusion_matrix [[30876  6727]
 [ 6959  8154]]
Training Utility
	Accuracy: 0.7033492822966507
	Recall: 0.5227272727272727
	F1 score: 0.4259259259259259
	Presicion 0.359375
	confusion_matrix [[124  41]
 [ 21  23]]
Training Metaverse
	Accuracy: 0.7365623154164206
	Recall: 0.7288135593220338
	F1 score: 0.729696969696969

# Step 5: Grid Search for XGBoost

In [None]:
# Search space shared by the grid searches below: 4 x 4 x 4 = 64 combinations.
param_grid = dict(
    # Fraction of columns randomly sampled for each tree.
    colsample_bytree=[0.3, 0.5, 0.8, 1],
    # L1 regularization on the weights; higher values give more conservative models.
    reg_alpha=[0, 0.5, 1, 5],
    # L2 regularization on the weights; higher values give more conservative models.
    reg_lambda=[0, 0.5, 1, 5],
)

# 10-fold stratified cross-validation; shuffled with a fixed seed so the fold
# assignment is the same on every run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

In [None]:
# Grid search over `param_grid`, selecting (and refitting) the combination with
# the best cross-validated F1. Run for the 'Other' category only.
# NOTE(review): X_train was oversampled with SMOTE before this search, so the CV
# folds are scored on synthetic samples; best_score_ is likely optimistic
# relative to the test metrics printed below.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    refit='f1',
    n_jobs=-1,
    cv=kfold,
    verbose=0,
)

for category, category_data in categorized_data.items():
  if category != 'Other':
    continue
  X_train, y_train = category_data['X_train'], category_data['y_train']
  X_test, y_test = category_data['X_test'], category_data['y_test']
  print(f'Training {category}')
  # Run the full grid with cross-validation on the training split.
  grid_search.fit(X_train, y_train)
  # Best mean CV score and the parameters that achieved it.
  print(f'The best score is {grid_search.best_score_:.4f}')
  print(f'The best hyperparameters are {grid_search.best_params_}')
  # Predict on the held-out test split with the refit best model.
  grid_predict = grid_search.predict(X_test)
  # Test-set metrics of the model selected by F1.
  print("\tAccuracy:", accuracy_score(y_test, grid_predict))
  print("\tRecall:", recall_score(y_test, grid_predict))
  print("\tF1 score:", f1_score(y_test, grid_predict))
  print("\tPresicion", precision_score(y_test, grid_predict))
  print("\tconfusion_matrix", confusion_matrix(y_test, grid_predict))

Training Other
The best score is 0.6635
The best hyperparameters are {'colsample_bytree': 0.3, 'reg_alpha': 0.5, 'reg_lambda': 0}
	Accuracy: 0.6559382646339168
	Recall: 0.5592011412268189
	F1 score: 0.3001531393568147
	Presicion 0.20512820512820512
	confusion_matrix [[3093 1519]
 [ 309  392]]


In [None]:
# F1-driven grid search over `param_grid`, run for the 'Metaverse', 'Other' and
# 'Utility' categories only.
# NOTE(review): the training data was oversampled before CV, so best_score_ is
# likely optimistic relative to the test metrics.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=-1,
    cv=kfold,
    verbose=0,
)

for category, category_data in categorized_data.items():
  if category not in ('Metaverse', 'Other', 'Utility'):
    continue
  X_train, y_train = category_data['X_train'], category_data['y_train']
  X_test, y_test = category_data['X_test'], category_data['y_test']
  print(f'Training {category}')
  # Run the full grid with cross-validation on the training split.
  grid_result = grid_search.fit(X_train, y_train)
  # Best mean CV score and the parameters that achieved it.
  print(f'The best score is {grid_result.best_score_:.4f}')
  print(f'The best hyperparameters are {grid_result.best_params_}')
  # Class predictions from the refit best model on the held-out test split.
  grid_predict = grid_result.predict(X_test)
  # Predicted probability of the positive class (second column).
  grid_result_prob = grid_result.predict_proba(X_test)[:,1]
  # Test-set metrics of the model selected by F1.
  print("\tAccuracy:", accuracy_score(y_test, grid_predict))
  print("\tRecall:", recall_score(y_test, grid_predict))
  print("\tF1 score:", f1_score(y_test, grid_predict))
  print("\tPresicion", precision_score(y_test, grid_predict))
  print("\tconfusion_matrix", confusion_matrix(y_test, grid_predict))

Training Other
The best score is 0.6621
The best hyperparameters are {'colsample_bytree': 0.3, 'reg_alpha': 0.5, 'reg_lambda': 0}
	Accuracy: 0.6563146997929606
	Recall: 0.5420827389443652
	F1 score: 0.29389017788089716
	Presicion 0.20159151193633953
	confusion_matrix [[3107 1505]
 [ 321  380]]
Training Utility
The best score is 0.7696
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0.5, 'reg_lambda': 0.5}
	Accuracy: 0.7129186602870813
	Recall: 0.5454545454545454
	F1 score: 0.4444444444444444
	Presicion 0.375
	confusion_matrix [[125  40]
 [ 20  24]]
Training Metaverse
The best score is 0.7568
The best hyperparameters are {'colsample_bytree': 0.3, 'reg_alpha': 0, 'reg_lambda': 5}
	Accuracy: 0.7383343177790904
	Recall: 0.7554479418886199
	F1 score: 0.7380248373743348
	Presicion 0.7213872832369942
	confusion_matrix [[626 241]
 [202 624]]


In [None]:
# F1-driven grid search over `param_grid`, run for the 'Collectible' category
# only. The fitted search is kept under a category-specific name.
# NOTE(review): the training data was oversampled before CV, so best_score_ is
# likely optimistic relative to the test metrics.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=-1,
    cv=kfold,
    verbose=0,
)

for category, category_data in categorized_data.items():
  if category != 'Collectible':
    continue
  X_train, y_train = category_data['X_train'], category_data['y_train']
  X_test, y_test = category_data['X_test'], category_data['y_test']
  print(f'Training {category}')
  # Run the full grid with cross-validation on the training split.
  grid_result_collectible = grid_search.fit(X_train, y_train)
  # Best mean CV score and the parameters that achieved it.
  print(f'The best score is {grid_result_collectible.best_score_:.4f}')
  print(f'The best hyperparameters are {grid_result_collectible.best_params_}')
  # Class predictions from the refit best model on the held-out test split.
  grid_predict_collectible = grid_result_collectible.predict(X_test)
  # Predicted probability of the positive class (second column).
  grid_result_prob = grid_result_collectible.predict_proba(X_test)[:,1]
  # Test-set metrics of the model selected by F1.
  print("\tAccuracy:", accuracy_score(y_test, grid_predict_collectible))
  print("\tRecall:", recall_score(y_test, grid_predict_collectible))
  print("\tF1 score:", f1_score(y_test, grid_predict_collectible))
  print("\tPresicion", precision_score(y_test, grid_predict_collectible))
  print("\tconfusion_matrix", confusion_matrix(y_test, grid_predict_collectible))

Training Collectible
The best score is 0.6477
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 5, 'reg_lambda': 0}
	Accuracy: 0.7411032703543516
	Recall: 0.5341097068748759
	F1 score: 0.541890440386681
	Presicion 0.5499012194291164
	confusion_matrix [[30996  6607]
 [ 7041  8072]]


In [None]:
import pandas as pd
# One row per parameter combination tried by the grid search, with its mean
# cross-validated score in column "F1".
# NOTE(review): this reads `grid_search`, i.e. whichever search was fitted last;
# it is the 'Collectible' F1 search only if the cells are run in order.
random_results_pd_collectible = (
    pd.DataFrame(grid_search.cv_results_["params"])
    .assign(F1=grid_search.cv_results_["mean_test_score"])
)
random_results_pd_collectible.head()

Unnamed: 0,colsample_bytree,reg_alpha,reg_lambda,F1
0,0.3,0.0,0.0,0.640112
1,0.3,0.0,0.5,0.640103
2,0.3,0.0,1.0,0.639788
3,0.3,0.0,5.0,0.63996
4,0.3,0.5,0.0,0.640009


In [None]:
import plotly.express as px
# Parallel-coordinates view of the grid-search table: one line per parameter
# combination, colored by its mean cross-validated F1.
# The results frame is passed directly rather than through `df`: `df` holds the
# raw dataset loaded in Step 2, and rebinding it here would break any re-run of
# the data-preparation cell.
fig = px.parallel_coordinates(random_results_pd_collectible, color="F1")
fig.show()

In [None]:
# Grid search over `param_grid`, this time selecting the combination with the
# best cross-validated accuracy, run for every category (including 'All').
# NOTE(review): the training data was oversampled before CV, so best_score_ is
# likely optimistic relative to the test metrics.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=kfold,
    verbose=0,
)

for category, category_data in categorized_data.items():
  X_train, y_train = category_data['X_train'], category_data['y_train']
  X_test, y_test = category_data['X_test'], category_data['y_test']
  print(f'Training {category}')
  # Run the full grid with cross-validation on the training split.
  grid_result = grid_search.fit(X_train, y_train)
  # Best mean CV score and the parameters that achieved it.
  print(f'The best score is {grid_result.best_score_:.4f}')
  print(f'The best hyperparameters are {grid_result.best_params_}')
  # Class predictions from the refit best model on the held-out test split.
  grid_predict = grid_result.predict(X_test)
  # Predicted probability of the positive class (second column).
  grid_result_prob = grid_result.predict_proba(X_test)[:,1]
  # Test-set metrics of the model selected by accuracy.
  print("\tAccuracy:", accuracy_score(y_test, grid_predict))
  print("\tRecall:", recall_score(y_test, grid_predict))
  print("\tF1 score:", f1_score(y_test, grid_predict))
  print("\tPresicion", precision_score(y_test, grid_predict))
  print("\tconfusion_matrix", confusion_matrix(y_test, grid_predict))

Training Games
The best score is 0.6665
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1}
	Accuracy: 0.7033384491482928
	Recall: 0.5868525351920186
	F1 score: 0.46802365187062317
	Presicion 0.38921368683435303
	confusion_matrix [[37698 13477]
 [ 6046  8588]]
Training Art
The best score is 0.7729
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0, 'reg_lambda': 5}
	Accuracy: 0.7693007329730887
	Recall: 0.7757057154815608
	F1 score: 0.8376016872551972
	Presicion 0.9102319236016371
	confusion_matrix [[ 9777  3290]
 [ 9646 33360]]
Training Other
The best score is 0.6674
The best hyperparameters are {'colsample_bytree': 0.3, 'reg_alpha': 0, 'reg_lambda': 0.5}
	Accuracy: 0.6519856954639563
	Recall: 0.5549215406562055
	F1 score: 0.2961553102398173
	Presicion 0.2019730010384216
	confusion_matrix [[3075 1537]
 [ 312  389]]
Training Collectible
The best score is 0.6925
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 0, 

