# STEP 1 : Install and Import Libraries

In [None]:
# Data processing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model and performance evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

#OverSampling
from imblearn.over_sampling import RandomOverSampler, SMOTE

# STEP 2 : Download and Prepare Data

In [None]:
# Load the secondary-sale dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/secondary_sale_dataset.csv')

In [None]:
def split_categories(df):
    """Group the rows of ``df`` by the values of its 'Category' column.

    Returns a dict with one key per distinct 'Category' value plus an
    extra 'All' key covering the whole frame. Each value is a one-entry
    dict ``{'df': <deep copy of the matching rows>}``, so changes made to
    the copies do not touch ``df``.
    """
    by_category = {
        name: {'df': df.loc[df['Category'] == name].copy(deep=True)}
        for name in df['Category'].unique()
    }
    # Added last, so it replaces any category that is literally named 'All'.
    by_category['All'] = {'df': df.copy(deep=True)}
    return by_category

# Prepare the per-category train/test data.
# For every entry of categorized_data: build the feature matrix and label
# vector, hold out 5% of the rows as a test set, balance the training part by
# random oversampling, and store the four arrays back on that entry under
# 'X_train', 'X_test', 'y_train' and 'y_test'.
categorized_data = split_categories(df)
feature_columns = ['0','1','2','3','4','5','6','week_1','centrality_buyer','centrality_seller','p_resale']
label_column = 'secondary_sale'
for key, item in categorized_data.items():
    category_df = item['df']
    X = category_df[feature_columns].values
    y = category_df[label_column].values
    # Train-test split with a fixed seed so the split is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)
    # Oversample the minority class on the training part only, so the test
    # set keeps its original class distribution. The sampler is seeded like
    # the split so reruns produce the same training data.
    oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)
    # Store the resampled training arrays. This previously referenced the
    # undefined names X_train_res / y_train_res and raised NameError.
    item['X_train'] = X_train_over
    item['X_test'] = X_test
    item['y_train'] = y_train_over
    item['y_test'] = y_test
# Report the resulting sample counts per category.
for key, item in categorized_data.items():
    print(f'{key}\n\tTrain count: {len(item["X_train"]):,}\n\tTest count: {len(item["X_test"]):,}')

# Step 3: AdaBoost Classifier Grid Search

In [None]:
# Instantiate an AdaBoost classifier with its default hyperparameters
clf_ada = AdaBoostClassifier()
# Look up the estimator's current (default) parameters; as the last
# expression of a notebook cell the returned dict is shown as the cell output
clf_ada.get_params()

In [None]:
# Hyperparameter grid for the AdaBoost search: every combination of the
# ensemble sizes and learning rates listed here is a candidate.
param_dist = dict(
    n_estimators=[200, 300, 400, 500],
    learning_rate=[0.3, 0.4, 0.5],
)


In [None]:
# NOTE: RepeatedStratifiedKFold is imported here, but the splitter built
# below is a plain (non-repeated) StratifiedKFold.
from sklearn.model_selection import RepeatedStratifiedKFold

# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=10,
)

In [None]:
# Exhaustive grid search over param_dist for the AdaBoost classifier,
# scored by F1 on the kfold splits, with n_jobs=-1 for parallel fitting.
ada_search_2 = GridSearchCV(
    clf_ada,
    param_dist,
    scoring='f1',
    refit='f1',
    cv=kfold,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Tune and evaluate AdaBoost separately for every category.
# For each entry of categorized_data: run the grid search on the training
# arrays, predict the held-out test set with the best estimator found, then
# print the test metrics and record them in the matching row of `results`
# (one row per category, one column per metric).
results = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    print(f'Training {category}')
    games_ada2 = ada_search_2.fit(X_train, y_train)
    games_ada2_preds = games_ada2.best_estimator_.predict(X_test)
    print(f'The best hyperparameters are {games_ada2.best_params_}')
    # Compute every test-set metric once so the printed and stored values agree.
    scores = {
        'accuracy': accuracy_score(y_test, games_ada2_preds),
        'recall': recall_score(y_test, games_ada2_preds),
        'f1': f1_score(y_test, games_ada2_preds),
        'precision': precision_score(y_test, games_ada2_preds),
        'confusion_matrix': confusion_matrix(y_test, games_ada2_preds),
    }
    # Fill this category's row; `results` was previously created but left empty.
    # .at writes one cell at a time, so the 2-D confusion matrix is stored
    # as a single object in its cell.
    for metric, value in scores.items():
        results.at[category, metric] = value
    # Performance of the best model selected by the grid search
    print("\tAccuracy:", scores['accuracy'])
    print("\tRecall:", scores['recall'])
    print("\tF1 score:", scores['f1'])
    print("\tPresicion", scores['precision'])
    print("\tconfusion_matrix", scores['confusion_matrix'])

# Step 4: RandomForest Classifier Grid Search

In [None]:
# Create a base random-forest model with its default hyperparameters
rf = RandomForestClassifier()
# Look up the estimator's current (default) parameters; as the last
# expression of a notebook cell the returned dict is shown as the cell output
rf.get_params()

In [None]:
# Hyperparameter grid for the random-forest search.
# Every key must be a RandomForestClassifier constructor argument. The
# previous grid used XGBoost-only names (colsample_bytree, reg_alpha,
# reg_lambda), which GridSearchCV rejects as invalid parameters for a
# random forest.
param_grid = {
    # Fraction of features considered when looking for the best split.
    # Kept as floats: the integer 1 would mean "one feature", 1.0 means "all".
    "max_features": [0.3, 0.5, 0.8, 1.0],
    # Minimum number of samples required at a leaf; higher values give
    # more conservative trees.
    "min_samples_leaf": [1, 2, 5, 10],
    # Maximum depth of each tree; None places no limit on depth.
    "max_depth": [None, 10, 20, 30],
}

In [None]:
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=10,
)

In [None]:
# Exhaustive grid search over param_grid for the random-forest model,
# scored by F1 on the kfold splits, with n_jobs=-1 for parallel fitting.
random_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    refit='f1',
    cv=kfold,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Tune and evaluate the random forest separately for every category.
# For each entry of categorized_data: run the grid search on the training
# arrays, predict the held-out test set with the best estimator found, then
# print the test metrics and record them in the matching row of `results2`
# (one row per category, one column per metric).
results2 = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    print(f'Training {category}')
    randomOver2 = random_search.fit(X_train, y_train)
    randomOver2_preds = randomOver2.best_estimator_.predict(X_test)
    print(f'The best hyperparameters are {randomOver2.best_params_}')
    # Compute every test-set metric once so the printed and stored values agree.
    scores = {
        'accuracy': accuracy_score(y_test, randomOver2_preds),
        'recall': recall_score(y_test, randomOver2_preds),
        'f1': f1_score(y_test, randomOver2_preds),
        'precision': precision_score(y_test, randomOver2_preds),
        'confusion_matrix': confusion_matrix(y_test, randomOver2_preds),
    }
    # Fill this category's row; `results2` was previously created but left empty.
    # .at writes one cell at a time, so the 2-D confusion matrix is stored
    # as a single object in its cell.
    for metric, value in scores.items():
        results2.at[category, metric] = value
    # Performance of the best model selected by the grid search
    print("\tAccuracy:", scores['accuracy'])
    print("\tRecall:", scores['recall'])
    print("\tF1 score:", scores['f1'])
    print("\tPresicion", scores['precision'])
    print("\tconfusion_matrix", scores['confusion_matrix'])

# Step 5: XGBoost Classifier Grid Search

In [None]:
# Instantiate an XGBoost classifier with its default hyperparameters
model = XGBClassifier()
# Look up the estimator's current (default) parameters; as the last
# expression of a notebook cell the returned dict is shown as the cell output
model.get_params()

In [None]:
# Search space for the XGBoost grid search.
#   colsample_bytree: fraction of columns randomly sampled for each tree.
#   reg_alpha: L1 regularization on the weights; higher values result in
#              more conservative models.
#   reg_lambda: L2 regularization on the weights; higher values result in
#               more conservative models.
param_grid = dict(
    colsample_bytree=[0.3, 0.5, 0.8, 1],
    reg_alpha=[0, 0.5, 1, 5],
    reg_lambda=[0, 0.5, 1, 5],
)

In [None]:
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=10,
)

In [None]:
# Exhaustive grid search over param_grid for the XGBoost classifier,
# scored by F1 on the kfold splits, with n_jobs=-1 for parallel fitting.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1',
    refit='f1',
    cv=kfold,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Tune and evaluate XGBoost separately for every category.
# For each entry of categorized_data: run the grid search on the training
# arrays, predict the held-out test set with the best estimator found, then
# print the test metrics and record them in the matching row of `results3`
# (one row per category, one column per metric).
results3 = pd.DataFrame(columns=['accuracy','recall','f1','precision','confusion_matrix'], index=categorized_data.keys())
for category, category_data in categorized_data.items():
    X_train = category_data['X_train']
    y_train = category_data['y_train']
    X_test = category_data['X_test']
    y_test = category_data['y_test']
    print(f'Training {category}')
    xgboost2 = grid_search.fit(X_train, y_train)
    xgboost2_preds = xgboost2.best_estimator_.predict(X_test)
    print(f'The best hyperparameters are {xgboost2.best_params_}')
    # Compute every test-set metric once so the printed and stored values agree.
    scores = {
        'accuracy': accuracy_score(y_test, xgboost2_preds),
        'recall': recall_score(y_test, xgboost2_preds),
        'f1': f1_score(y_test, xgboost2_preds),
        'precision': precision_score(y_test, xgboost2_preds),
        'confusion_matrix': confusion_matrix(y_test, xgboost2_preds),
    }
    # Fill this category's row; `results3` was previously created but left empty.
    # .at writes one cell at a time, so the 2-D confusion matrix is stored
    # as a single object in its cell.
    for metric, value in scores.items():
        results3.at[category, metric] = value
    # Performance of the best model selected by the grid search
    print("\tAccuracy:", scores['accuracy'])
    print("\tRecall:", scores['recall'])
    print("\tF1 score:", scores['f1'])
    print("\tPresicion", scores['precision'])
    print("\tconfusion_matrix", scores['confusion_matrix'])