In [None]:
%matplotlib inline

import datetime
import json

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.model_selection import GroupKFold

import juno.junodb as junodb
import juno.junoutils as junoutils

## Load in the previously summarized data

In [None]:
# Read the summary produced by a previous run of this notebook. The final save
# cell writes to this same path, and the 'request.created' column loaded here is
# used further down to find the interac requests that are newer than the last run.
results_df = pd.read_csv('../ten_minute_pipeline/data/interac_transaction_summary.csv')

In [None]:
# Database credentials live in a JSON file next to the aggregation pipeline.
with open('../user_aggregation_pipeline//creds.json') as creds_file:
    creds = json.load(creds_file)

# Connection object used by every query in this notebook.
db = junodb.Database(creds)

# Full history of interac "request" events that carry an e-mail address.
interac_request_filter = {
    'eventCategory': 'interac',
    'eventAction': 'request',
    'metadata.email': {'$ne': None},
}
event_collection = db._client['production']['eventCollection']
all_interac_requests = list(event_collection.find(interac_request_filter))

# Flatten the documents into a dataframe and keep only the columns of interest.
cols_to_use = ['created','eventLabel','metadata.amount','metadata.email','metadata.rate','value']
interac = junoutils.flattenObjects(all_interac_requests)[cols_to_use]

# Shorter column names; eventLabel holds the currency of the request.
interac.columns = ['created','currency','amount','email','rate','value']

# One indicator column per currency, rows ordered by creation time.
interac = pd.get_dummies(data=interac, columns=['currency']).sort_values(by='created')

# Enforce the dtypes the rest of the notebook relies on.
interac['created'] = pd.to_datetime(interac.created)
for numeric_col in ('amount', 'rate', 'value'):
    interac[numeric_col] = pd.to_numeric(interac[numeric_col])

In [None]:
def getPriorUserEvents(email, request_time=None, lookback_s=None):
    """Fetch a user's events from the production event collection.

    Parameters
    ----------
    email : str
        Value matched against the 'metadata.email' field.
    request_time : datetime-like, optional
        When given, only events created strictly before this time are returned.
    lookback_s : number, optional
        When given, only events created at or after ``window_end - lookback_s``
        seconds are returned, where ``window_end`` is ``request_time`` if it was
        supplied and the current local time otherwise.

    Returns
    -------
    Whatever ``junoutils.flattenObjects`` returns for the matching documents
    (the rest of this notebook treats it as a pandas DataFrame).
    """
    query = {'metadata.email': email}

    # Build the optional bounds on 'created' once instead of repeating the
    # whole query for every combination of arguments.
    created_bounds = {}
    if request_time is not None:
        created_bounds['$lt'] = request_time
    if lookback_s is not None:
        window_end = request_time if request_time is not None else datetime.datetime.now()
        created_bounds['$gte'] = window_end - datetime.timedelta(seconds=lookback_s)
    if created_bounds:
        query['created'] = created_bounds

    events_json = list(db._client['production']['eventCollection'].find(query))

    # flatten JSON objects into a pandas dataframe
    return junoutils.flattenObjects(events_json)

def summarizeTransactionActions(df):
    """Summarise one group of transaction events as a flat dict.

    The result holds the row count ('n'), the mean of the 'value' column
    ('mean_value'), the mean of the 'time_since_last' column
    ('mean_timebetween') and, for every column whose name contains
    'eventLabel_', the column sum under the key 'n_<column name>'.
    """
    label_columns = [name for name in df.columns if 'eventLabel_' in name]

    summary = {
        'n': df.shape[0],
        'mean_value': df.value.mean(),
        'mean_timebetween': df.time_since_last.mean(),
    }

    # one count per label indicator column
    summary.update(('n_' + name, np.sum(df[name])) for name in label_columns)

    return summary

def summarizeInteracActivity(df):
    """Summarise a user's interac events per category/action combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns 'created', 'eventCategory',
        'eventAction', 'eventLabel' and 'value'.

    Returns
    -------
    dict
        ``{}`` when ``df`` is empty or holds no 'interac' events. Otherwise one
        entry per '<eventCategory>_<eventAction>' key, whose value is the dict
        produced by ``summarizeTransactionActions`` for those events.
    """
    if df.shape[0] == 0:
        return {}

    # Work on an explicit copy: helper columns are added below and must not be
    # written to a view of the caller's frame (SettingWithCopyWarning).
    this_df = df[df.eventCategory == 'interac'].copy()

    if this_df.shape[0] == 0:
        return {}

    # combine the eventCategory and eventAction into a unique key
    this_df['ca'] = this_df.eventCategory + '_' + this_df.eventAction

    record = {}

    # one summary per category/action combination
    for category_action in this_df.ca.unique():

        ca_df = this_df[this_df.ca == category_action].sort_values(by='created')
        ca_df['previous_request'] = ca_df['created'].shift(1)

        # NOTE(review): dropna() without `subset` removes every row that has a
        # missing value in ANY column, not only the first row (which has no
        # previous request). Kept as-is so the summaries stay comparable with
        # the ones already stored; confirm this is intended.
        ca_df = ca_df.dropna().copy()

        # Seconds since the previous event of the same kind. total_seconds()
        # does not depend on the datetime resolution, unlike casting the
        # difference to int and scaling by 1e-9.
        elapsed = pd.to_timedelta(ca_df['created'] - ca_df['previous_request'])
        ca_df['time_since_last'] = elapsed.dt.total_seconds()

        ca_df = pd.get_dummies(ca_df, columns=['eventLabel'])

        record[category_action] = summarizeTransactionActions(ca_df)

    return record
    
def summarizeEvents(df):
    """Count event states and state transitions in a user's event history.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with the columns 'created', 'eventCategory', 'eventAction'
        and 'eventLabel'.

    Returns
    -------
    dict
        ``{}`` when there is nothing to summarise. Otherwise
        'mean_timesince' (mean seconds between consecutive events) plus one
        'n_<column>' count for every indicator column generated for the
        category/action states ('ca_'), their transitions ('st_ca_') and the
        category/action/label transitions ('st_cal_').
    """
    if df.shape[0] == 0:
        return {}

    # NOTE(review): this removes every event whose action is 'start' as well as
    # every event whose category is 'session', not only 'session'/'start'
    # pairs. Kept unchanged; confirm that is the intent.
    this_df = df[(df.eventAction != 'start') & (df.eventCategory != 'session')]

    if this_df.shape[0] == 0:
        return {}

    # Explicit copy: helper columns are added below and must not be written to
    # a view of the caller's frame (SettingWithCopyWarning).
    this_df = this_df[['created','eventCategory','eventAction','eventLabel']].copy()

    # combine category/action (and label) into unique state keys
    this_df['ca'] = this_df.eventCategory + '_' + this_df.eventAction
    this_df['cal'] = this_df.eventCategory + '_' + this_df.eventAction + '_' + this_df.eventLabel
    this_df = this_df.sort_values(by='created')

    # previous event's time and state, to build transitions
    this_df['previous_request'] = this_df['created'].shift(1)
    this_df['previous_ca'] = this_df['ca'].shift(1)
    this_df['previous_cal'] = this_df['cal'].shift(1)
    this_df = this_df.dropna().copy()

    # Seconds since the previous event. total_seconds() does not depend on the
    # datetime resolution, unlike casting the difference to int and scaling by 1e-9.
    # TODO: very long gaps are not filtered out and still enter the mean.
    elapsed = pd.to_timedelta(this_df['created'] - this_df['previous_request'])
    this_df['time_since_last'] = elapsed.dt.total_seconds()

    this_df['st_ca'] = this_df['previous_ca'] + '_' + this_df['ca']
    this_df['st_cal'] = this_df['previous_cal'] + '_' + this_df['cal']
    this_df = pd.get_dummies(this_df, columns=['ca','st_ca','st_cal'])

    record = {}

    record['mean_timesince'] = this_df.time_since_last.mean()

    # for each state or transition column
    counted_columns = ([col for col in this_df.columns if 'ca_' in col]
                       + [col for col in this_df.columns if 'cal_' in col])
    for col in counted_columns:
        record['n_'+col] = np.sum(this_df[col])

    return record

def summarizePreEventsMultiple(email, request_time=None, lookback_s_list=None):
    """Summarise a user's activity before ``request_time`` over several windows.

    Parameters
    ----------
    email : str
        User whose events are summarised.
    request_time : datetime-like, optional
        End of every look-back window. Defaults to the current local time,
        resolved on each call (a ``datetime.now()`` default in the signature
        would be frozen at the moment the function is defined).
    lookback_s_list : sequence of numbers, optional
        Window lengths in seconds. When omitted or empty, all events before
        ``request_time`` are summarised under the key 'now'.

    Returns
    -------
    dict
        With windows: one 'mins_<minutes>' entry per window, each merging the
        results of ``summarizeInteracActivity`` and ``summarizeEvents``; ``{}``
        if the user has no events in the longest window. Without windows:
        ``{'now': <merged summary>}``.
    """
    if request_time is None:
        request_time = datetime.datetime.now()

    def _summarize(events):
        # merged interac + generic event summary for one window
        return {**summarizeInteracActivity(events), **summarizeEvents(events)}

    if lookback_s_list is None or len(lookback_s_list) == 0:
        this_df = getPriorUserEvents(email=email, request_time=request_time)
        return {'now': _summarize(this_df)}

    # Longest window first: a single query fetches it, and every shorter
    # window is then cut from the rows already in memory.
    sorted_seconds = sorted(lookback_s_list, reverse=True)

    this_df = getPriorUserEvents(email=email, request_time=request_time, lookback_s=sorted_seconds[0])

    if this_df.shape[0] == 0:
        return {}

    result = {}
    result['mins_' + str(int(sorted_seconds[0]/60))] = _summarize(this_df)

    # for the shorter time periods
    for seconds in sorted_seconds[1:]:
        this_lookback = request_time - datetime.timedelta(seconds=seconds)
        this_df = this_df[this_df.created >= this_lookback]
        result['mins_' + str(int(seconds/60))] = _summarize(this_df)

    return result

In [None]:
# Interac requests created after the newest request already present in the
# saved summary, re-indexed from zero so they can be addressed by position.
last_summarized = results_df['request.created'].max()
new_interac = interac[interac.created > last_summarized].reset_index(drop=True)

## Summarize All of the New Records

In [None]:
import time

# look-back windows, expressed in seconds
ten_minutes_in_s = 10 * 60
hour_in_s = 60 * 60
day_in_s = 24 * 60 * 60
week_in_s = 7 * 24 * 60 * 60
days30_in_s = 30 * 24 * 60 * 60

times = [ten_minutes_in_s, hour_in_s, day_in_s, week_in_s, days30_in_s]

new_results = []   # one {'request': ..., 'prior': ...} record per new transaction
fetch_times = []   # wall-clock seconds spent on each transaction

n_new = new_interac.shape[0]

for i in range(n_new):

    now = time.time()

    # the request itself plus the user's activity summary for every window before it
    new_results.append({
        'request': new_interac.loc[i, :].to_dict(),
        'prior': summarizePreEventsMultiple(
            email=new_interac.email.values[i],
            request_time=new_interac.created[i],
            lookback_s_list=times,
        ),
    })

    end = time.time()
    fetch_times.append(end - now)
    print(i, 'of', n_new, 'Duration: ', end - now, 'Mean Time:', np.mean(fetch_times))

## Combine the new and old records together

In [None]:
# Stack the freshly summarized requests on top of the previously saved ones and
# flatten everything into one frame; missing features become 0.
all_results = new_results+results_df.to_dict(orient='records')
all_results_df = junoutils.flattenObjects(all_results).fillna(0)

# New records carry no label yet, so fillna(0) left them at 0 (or the column is
# missing altogether if no record had one); normalise those to False.
if 'fraud' not in all_results_df.columns:
    all_results_df['fraud'] = False
all_results_df.loc[all_results_df.fraud == 0,'fraud'] = False

fraudsters = ['gaelkevin@hotmail.com', 'royer.8383@gmail.com','adventurous7381@gmail.com']

# Label the COMBINED frame. Labelling `results_df` here (as was done before)
# has no effect on `all_results_df`, which was already built above, so new
# transactions from known fraudsters were never marked. Existing True labels
# are preserved and the known fraudster e-mails are added on top.
all_results_df['fraud'] = (
    all_results_df['fraud'].astype(bool)
    | all_results_df['request.email'].isin(fraudsters)
)

def RemoveWhitelistRecords(df):
    """Return ``df`` without test, internal and whitelisted e-mail rows.

    Rows are removed when 'request.email' contains 'test' or
    'fingerfoodstudios', or when the address appears in the production
    whitelist collection with level 'ALLOWED'. Rows whose e-mail cannot be
    matched (the comparison yields a missing value) are removed as well.
    """
    whitelist_db = junodb.Database(creds)

    # e-mail addresses that are explicitly allowed
    allowed_docs = whitelist_db._client['production']['emailWhitelistCollection'].find({'level': 'ALLOWED'})
    wlemails = junoutils.flattenObjects(list(allowed_docs)).email

    # addresses containing either fragment are dropped
    for fragment in ('test', 'fingerfoodstudios'):
        df = df[df['request.email'].str.contains(fragment) == False]

    return df[df['request.email'].isin(wlemails) == False]

# Cleaned dataset (test / internal / whitelisted e-mails removed); this is the
# frame that is saved to CSV and used for all modelling below.
records_clean = RemoveWhitelistRecords(all_results_df)

In [None]:
# (rows, columns) of the cleaned dataset
records_clean.shape

In [None]:
# Save the combined, cleaned dataset back to CSV. This is the same path that
# `results_df` is read from at the top of the notebook, so the previous summary
# file is overwritten.
records_clean.to_csv('../ten_minute_pipeline/data/interac_transaction_summary.csv', index=False)

## Split the unbalanced dataset into randomized training and test sets, stratified by class and split by user

In [None]:
def split_by_users_rand(df, split_percentage=1/3, random_state=None):
    """Randomly split ``df`` into train and test sets without sharing users.

    Fraudulent and normal users are shuffled separately and the same fraction
    of each is assigned to the test set, so every user (e-mail) ends up
    entirely in train or entirely in test.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'request.email', 'request.created' and 'fraud';
        every other column is treated as a feature.
    split_percentage : float, default 1/3
        Fraction of the users of each class placed in the test set
        (rounded down).
    random_state : int, optional
        Seed for a private random generator. When omitted, numpy's global
        random state is used, as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X frames hold only the
        feature columns, the y series hold the 'fraud' column.

    Raises
    ------
    ValueError
        If ``split_percentage`` is not between 0 and 1.
    """
    if not 0 <= split_percentage <= 1:
        raise ValueError('split_percentage must be between 0 and 1')

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    X = df.drop(['request.created'], axis = 1)

    # get the fraudulent users and normal users
    is_fraud = df['fraud'].astype(bool)
    fraud_users = df['request.email'][is_fraud].unique()
    normal_users = df['request.email'][~is_fraud].unique()

    def _partition(users):
        # shuffle one class of users and cut it into (test, train)
        shuffled = users[rng.permutation(len(users))]
        n_test = int(split_percentage * len(users))
        return shuffled[:n_test], shuffled[n_test:]

    def _rows_for(users):
        # all rows of the given users, without the e-mail column
        return X[X['request.email'].isin(users)].drop(['request.email'], axis=1)

    rand_test_fraud_users, rand_train_fraud_users = _partition(fraud_users)
    rand_test_normal_users, rand_train_normal_users = _partition(normal_users)

    # fraud rows first, then normal rows, in each set
    test = pd.concat((_rows_for(rand_test_fraud_users), _rows_for(rand_test_normal_users)))
    train = pd.concat((_rows_for(rand_train_fraud_users), _rows_for(rand_train_normal_users)))

    # split into labels and data
    X_test = test.drop('fraud', axis = 1)
    X_train = train.drop('fraud', axis = 1)
    y_test = test['fraud']
    y_train = train['fraud']

    return (X_train, X_test, y_train, y_test)

In [None]:
X_train, X_test, y_train, y_test = split_by_users_rand(records_clean)

# report the size of each piece of the split
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

## Investigate Different Time Ranges

In [None]:
def leaveOneUserOutCrossValidation(df, classifier_list):
    """Cross-validate each classifier on each look-back window, grouped by user.

    Despite its name this is not leave-one-out: every classifier / look-back
    combination is evaluated with ``GroupKFold(n_splits=3)``, using the request
    e-mail as the group so that a user never appears in both train and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'fraud' and 'request.email' plus feature columns
        whose names contain 'prior.mins_<minutes>.'.
    classifier_list : list of dict
        Each entry has a 'Label' (name used in the results) and a 'Model'
        offering ``fit``, ``predict`` and ``predict_proba``. ``XGBClassifier``
        models additionally get ``scale_pos_weight`` set per fold.

    Returns
    -------
    pandas.DataFrame
        One row per (classifier, look-back, fold) with accuracy, average
        precision, confusion-matrix counts, precision and recall for train and
        test; columns are sorted alphabetically and undefined ratios are 0.
    """
    # list of different lookback times (in minutes, as used in the column names)
    lookback_sets = [['10'],['60'],['1440'],['10080'],['43200'],['10','60'], ['10','60','1440']]

    result_columns = [
        'classifier', 'lookback',
        'train_fraud_percentage', 'test_fraud_percentage',
        'train_accuracy', 'test_accuracy',
        'train_average_precision', 'test_average_precision',
        'train_true_negatives', 'train_false_negatives',
        'train_true_positives', 'train_false_positives',
        'test_true_negatives', 'test_false_negatives',
        'test_true_positives', 'test_false_positives',
    ]

    # 0/1 labels so every estimator and metric sees a plain numeric target
    y = df['fraud'].astype(bool).astype(int).values
    emails = df['request.email']

    fold_records = []

    # for each classifer type
    for clf in classifier_list:

        classifier = clf['Label']
        model = clf['Model']

        # for each time period or combination of time periods
        for lookback in lookback_sets:

            prefixes = ['prior.mins_'+minutes+'.' for minutes in lookback]

            # Columns of the selected windows, in frame order. Going through a
            # set would make the feature order change from run to run.
            feature_columns = [col for col in df.columns if any(prefix in col for prefix in prefixes)]

            X = df[feature_columns].values

            for train_index, test_index in GroupKFold(n_splits=3).split(X=X, y=y, groups=emails):

                X_train = X[train_index,:]
                X_test = X[test_index,:]
                y_train = y[train_index]
                y_test = y[test_index]

                if isinstance(model, XGBClassifier):
                    # weight positives by the negative/positive ratio of this fold
                    n_positive = np.sum(y_train == 1)
                    if n_positive > 0:
                        model.set_params(scale_pos_weight=np.sum(y_train == 0)/n_positive)

                model.fit(X_train, y_train)

                preds_train = model.predict(X_train)
                preds_test = model.predict(X_test)
                probs_train = model.predict_proba(X_train)[:,1]
                probs_test = model.predict_proba(X_test)[:,1]

                # labels=[0, 1] keeps the matrix 2x2 even if a fold holds one class only
                train_cm = sklearn.metrics.confusion_matrix(y_train, preds_train, labels=[0, 1])
                test_cm = sklearn.metrics.confusion_matrix(y_test, preds_test, labels=[0, 1])

                fold_records.append({
                    'classifier': classifier,
                    'lookback': '_'.join(lookback),
                    'train_fraud_percentage': np.sum(y_train)/len(y_train),
                    'test_fraud_percentage': np.sum(y_test)/len(y_test),
                    'train_accuracy': np.sum(preds_train == y_train)/len(y_train),
                    'test_accuracy': np.sum(preds_test == y_test)/len(y_test),
                    'train_average_precision': sklearn.metrics.average_precision_score(y_train, probs_train),
                    'test_average_precision': sklearn.metrics.average_precision_score(y_test, probs_test),
                    'train_true_negatives': train_cm[0,0],
                    'train_false_negatives': train_cm[1,0],
                    'train_true_positives': train_cm[1,1],
                    'train_false_positives': train_cm[0,1],
                    'test_true_negatives': test_cm[0,0],
                    'test_false_negatives': test_cm[1,0],
                    'test_true_positives': test_cm[1,1],
                    'test_false_positives': test_cm[0,1],
                })

    # put results together in a dataframe
    model_results_df = pd.DataFrame(fold_records, columns=result_columns)

    # calculate precision and recall (0/0 becomes NaN and is zeroed below)
    for split in ('test', 'train'):
        true_pos = model_results_df[split + '_true_positives']
        model_results_df[split + '_precision'] = true_pos/(true_pos + model_results_df[split + '_false_positives'])
        model_results_df[split + '_recall'] = true_pos/(true_pos + model_results_df[split + '_false_negatives'])
    model_results_df = model_results_df.fillna(0)

    return model_results_df[sorted(model_results_df.columns)]

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Size the tuned forest from `records_clean`. The previous expression used `X`,
# which is only assigned (as an alias of `records_clean`) in a later cell, so
# this cell failed on a fresh kernel. At least one feature per split is required.
modified_rf_max_features = max(1, int(records_clean.shape[1]/100))

classifiers = [
    {'Label': 'XGBoost', 'Model': XGBClassifier()},
    {'Label': 'Modified XGBoost', 'Model': XGBClassifier(max_depth=9, subsample=0.92)},
    {'Label': 'Random Forest', 'Model': RandomForestClassifier()},
    {'Label': 'Modified Random Forest', 'Model': RandomForestClassifier(n_estimators=474, max_depth=15, max_features=modified_rf_max_features, class_weight='balanced')},
    # the L1 penalty needs a solver that supports it; 'liblinear' does
    {'Label': 'Logistic Regression', 'Model': LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')},
    # NOTE(review): this entry is labelled 'Gradient Boosted Tree' but the model is AdaBoost
    {'Label': 'Gradient Boosted Tree', 'Model': AdaBoostClassifier()},
    {'Label': 'Multi Layer Perceptron', 'Model': MLPClassifier(alpha=0.0001, shuffle=True, hidden_layer_sizes=(100,3))},
    {'Label': 'Dummy', 'Model': DummyClassifier()}
]

model_results = leaveOneUserOutCrossValidation(df=records_clean, classifier_list=classifiers)

# mean / median / std of the test metrics per classifier and look-back window
summary = (
    model_results
    .groupby(['classifier','lookback'])[['test_precision','test_recall','test_average_precision']]
    .agg(['mean','median','std'])
    .reset_index()
)

# flatten the two-level column index: ('test_recall', 'mean') -> 'test_recall_mean'
summary.columns = [col[0] if col[1] == '' else '_'.join(col) for col in summary.columns]
summary.sort_values(by='test_average_precision_mean', ascending=False)

In [None]:
prefix = 'prior.mins_60'

X = records_clean

# Feature columns of the 60-minute window, in frame order. Building the list
# through a set would change the column order (and the fitted forest) between runs.
feature_columns_60 = [col for col in X.columns if prefix in col]

X_all = records_clean[feature_columns_60]
y_all = records_clean.fraud.astype(bool)

# identifier / label columns followed by the selected feature columns
last_60_columns = ['request.email','request.created','fraud'] + feature_columns_60

mean_avps = []

# NOTE(review): this split is not used below; the forest is fitted and scored
# on the full dataset.
X_train, X_test, y_train, y_test = split_by_users_rand(X)

# max_features cannot exceed the number of available feature columns
rf = RandomForestClassifier(n_estimators=474, max_depth=15, max_features=min(959, X_all.shape[1]), class_weight='balanced')
rf.fit(X_all,y_all)

preds = rf.predict(X_all)
probs = rf.predict_proba(X_all)

# In-sample metrics: the model is evaluated on the data it was trained on.
print(sklearn.metrics.accuracy_score(y_all, preds))
print(sklearn.metrics.confusion_matrix(y_all, preds))
print(sklearn.metrics.average_precision_score(y_all, probs[:,1]))

## Bayesian Hyperparameter Optimization Using Random Forest

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize

# load_boston is not used in this notebook and no longer exists in
# scikit-learn >= 1.2; keep the name when it is available without letting its
# absence break the cell.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

max_possible_features = 1102

# The hyper-parameters to optimize for the random forest. Each dimension gives
# the search bounds and the name of the corresponding scikit-learn parameter.
space  = [Integer(1, 100, name='max_depth'),
          Integer(1, max_possible_features, name='max_features'),
          Integer(1, 1000, name='n_estimators')
         ]

# This decorator lets the objective function receive the sampled point as
# keyword arguments named after the dimensions in `space`, which is convenient
# for passing them straight to a scikit-learn estimator.
@use_named_args(space)
def mean_average_precision_of_RF(**params):
    """Objective for the random-forest search: negated mean test average precision.

    A class-balanced ``RandomForestClassifier`` configured with ``params`` is
    trained and scored on five random user-level splits of the 60-minute
    look-back features of ``records_clean``. The mean average precision is
    negated because the optimizer minimizes.

    NOTE(review): later cells define functions with this same name; each cell
    uses its own definition immediately, so the cells must be run in order.
    """
    df = records_clean

    prefixes = ['prior.mins_60']

    id_columns = ['fraud','request.email','request.created']

    # Columns of the selected window, in frame order (a set would make the
    # feature order differ between runs).
    feature_columns = [col for col in df.columns if any(prefix in col for prefix in prefixes)]

    X = df[id_columns + feature_columns]

    # The search space allows more features than this window may provide;
    # clamp so the forest is never asked for more features than exist.
    params = dict(params)
    if 'max_features' in params and len(feature_columns) > 0:
        params['max_features'] = int(min(params['max_features'], len(feature_columns)))

    mavps = []

    for _ in range(5):

        model = RandomForestClassifier(class_weight='balanced')
        model.set_params(**params)

        X_train, X_test, y_train, y_test = split_by_users_rand(X)

        model.fit(X_train, y_train)

        probs_test = model.predict_proba(X_test)[:,1]

        mavps.append(sklearn.metrics.average_precision_score(y_test, probs_test))

    return -np.mean(mavps)
                    
# Run the optimizer for 50 evaluations of the objective. The objective returns
# the NEGATED mean average precision, so `res_gp.fun` printed below is negative
# for a useful model and `res_gp.x` lists the values in the order of `space`.
res_gp = gp_minimize(mean_average_precision_of_RF, space, n_calls=50, random_state=0)

print("Best score=%.4f" % res_gp.fun)
print("Hyperparameter Values", res_gp.x)

## Bayesian Hyperparameter Optimization Using AdaBoost (Boosted Decision Trees)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize

# NOTE(review): not referenced by the search space or objective in this cell.
max_possible_features = 1102

# The hyper-parameters to optimize for AdaBoost. Each dimension gives the
# search bounds and the name of the corresponding scikit-learn parameter.
# NOTE(review): no prior is passed for the learning rate, so it is not sampled
# 'log-uniform' as the original comment suggested - confirm which is intended.
space  = [Real(0.001, 10, name='learning_rate'),
          Integer(100, 1000, name='n_estimators')
         ]

# This decorator lets the objective function receive the sampled point as
# keyword arguments named after the dimensions in `space`, which is convenient
# for passing them straight to a scikit-learn estimator.
@use_named_args(space)
def mean_average_precision_of_RF(**params):
    """Objective for the AdaBoost search: negated mean test average precision.

    An ``AdaBoostClassifier`` over a class-balanced decision tree, configured
    with ``params``, is trained and scored on five random user-level splits of
    the 60-minute look-back features of ``records_clean``. The mean average
    precision is negated because the optimizer minimizes.

    NOTE(review): the name says RF but this objective tunes AdaBoost, and it
    replaces the function of the same name defined in an earlier cell.
    """
    df = records_clean

    tree = DecisionTreeClassifier(class_weight='balanced')

    # Passed positionally: the keyword for the weak learner was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, while its position
    # as the first argument is the same in both.
    model = AdaBoostClassifier(tree)

    model.set_params(**params)

    prefixes = ['prior.mins_60']

    id_columns = ['fraud','request.email','request.created']

    # Columns of the selected window, in frame order (a set would make the
    # feature order differ between runs).
    feature_columns = [col for col in df.columns if any(prefix in col for prefix in prefixes)]

    X = df[id_columns + feature_columns]

    mavps = []

    # five independent random user-level splits
    for _ in range(5):

        X_train, X_test, y_train, y_test = split_by_users_rand(X)

        model.fit(X_train, y_train)

        probs_test = model.predict_proba(X_test)[:,1]

        mavps.append(sklearn.metrics.average_precision_score(y_test, probs_test))

    return -np.mean(mavps)
                    
# Run the optimizer for 50 evaluations of the AdaBoost objective defined above.
res_gp_adaboost = gp_minimize(mean_average_precision_of_RF, space, n_calls=50, random_state=0)

# best point (in the order of `space`) and its objective value, i.e. the
# negated mean average precision
print(res_gp_adaboost.x)
print(res_gp_adaboost.fun)

## Bayesian Hyperparameter Optimization Using XGBoost

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize

# load_boston is not used in this notebook and no longer exists in
# scikit-learn >= 1.2; keep the name when it is available without letting its
# absence break the cell.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# NOTE(review): not referenced by the search space or objective in this cell.
max_possible_features = 1102

# The hyper-parameters to optimize for XGBoost. Each dimension gives the
# search bounds and the name of the corresponding estimator parameter.
space  = [Integer(1, 10, name='max_depth'),
          Real(0.5, 1, name='subsample')
         ]

# This decorator lets the objective function receive the sampled point as
# keyword arguments named after the dimensions in `space`, which is convenient
# for passing them straight to the estimator.
@use_named_args(space)
def mean_average_precision_of_RF(**params):
    """Objective for the XGBoost search: negated mean test average precision.

    An ``XGBClassifier`` configured with ``params`` is evaluated with a 3-fold
    ``GroupKFold`` (grouped by request e-mail) on the 60-minute look-back
    features of ``records_clean``; ``scale_pos_weight`` is set per fold to the
    negative/positive ratio of the training part. The mean average precision
    is negated because the optimizer minimizes.

    NOTE(review): the name says RF but this objective tunes XGBoost, and it
    replaces the functions of the same name defined in earlier cells.
    """
    df = records_clean

    prefixes = ['prior.mins_60']

    # Columns of the selected window, in frame order (a set would make the
    # feature order differ between runs).
    feature_columns = [col for col in df.columns if any(prefix in col for prefix in prefixes)]

    X = df[feature_columns].values
    # 0/1 labels so the estimator and the metric see a plain numeric target
    y = df['fraud'].astype(bool).astype(int).values
    emails = df['request.email']

    group_kfold = GroupKFold(n_splits=3)

    mavps = []

    for train_index, test_index in group_kfold.split(X=X, y=y, groups=emails):

        X_train = X[train_index,:]
        X_test = X[test_index,:]
        y_train = y[train_index]
        y_test = y[test_index]

        # weight positives by the negative/positive ratio of this fold
        class_ratio = np.sum(y_train == 0)/np.sum(y_train == 1)

        model = XGBClassifier(scale_pos_weight=class_ratio)
        model.set_params(**params)

        model.fit(X_train, y_train)

        probs_test = model.predict_proba(X_test)[:,1]

        mavps.append(sklearn.metrics.average_precision_score(y_test, probs_test))

    return -np.mean(mavps)
                    
# Run the optimizer for 10 evaluations of the XGBoost objective defined above.
# This reassigns `res_gp`, replacing the random-forest result from the earlier cell.
res_gp = gp_minimize(mean_average_precision_of_RF, space, n_calls=10, random_state=0)

# `res_gp.fun` is the negated mean average precision of the best point found
print("Best score=%.4f" % res_gp.fun)
print("Hyperparameter Values", res_gp.x)