In [216]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import recall_score 
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import f1_score

import itertools
import xgboost


import matplotlib.pyplot as plt

%matplotlib inline

In [217]:
df = pd.read_csv("Shill Bidding Dataset.csv")
df.head(3)

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0


In [218]:
df.shape

(6321, 13)

In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6321 entries, 0 to 6320
Data columns (total 13 columns):
Record_ID                 6321 non-null int64
Auction_ID                6321 non-null int64
Bidder_ID                 6321 non-null object
Bidder_Tendency           6321 non-null float64
Bidding_Ratio             6321 non-null float64
Successive_Outbidding     6321 non-null float64
Last_Bidding              6321 non-null float64
Auction_Bids              6321 non-null float64
Starting_Price_Average    6321 non-null float64
Early_Bidding             6321 non-null float64
Winning_Ratio             6321 non-null float64
Auction_Duration          6321 non-null int64
Class                     6321 non-null int64
dtypes: float64(8), int64(4), object(1)
memory usage: 642.1+ KB


In [220]:
df.iloc[:, -1].value_counts()

0    5646
1     675
Name: Class, dtype: int64

In [221]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [222]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [223]:
categorical_columns = [n for n in df.columns if df[n].dtype.name == 'object']
numerical_columns   = [n for n in df.columns if df[n].dtype.name != 'object']

In [224]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [225]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [226]:
xg_pipeline = Pipeline([
    ('features',feats),
    ('classifier', xgboost.XGBClassifier(random_state = 42)),
])
xg_pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Bidder_ID',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Bidder_ID')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Bidder_ID'))])),
                                                ('Record_ID',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='Record_ID')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('Auction_ID',
                                                 Pipeline(steps=[('selector',
        

In [227]:
y_predict = xg_pipeline.predict(x_test)

In [228]:
y_predict[:5]

array([0, 0, 0, 1, 0])

In [257]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return f1, roc, rec, prc

    
results = np.array(evaluate_results(y_test, y_predict))

Classification results:
f1: 96.85%
roc: 98.94%
recall: 98.40%
precision: 95.35%


In [230]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 169/675 as positives and unlabeling the rest


In [231]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    6152
 1     169
Name: class_test, dtype: int64


In [232]:
mod_data.head(8)

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class,class_test
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0,-1
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0,-1
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0,-1
3,4,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0,-1
4,5,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0,-1
5,8,900,i***e,0.038462,0.111111,0.0,0.016844,0.0,0.0,0.016844,0.8,7,0,-1
6,10,900,m***p,0.4,0.222222,0.0,0.006781,0.0,0.0,0.006774,0.75,7,0,-1
7,12,900,k***a,0.137931,0.444444,1.0,0.768044,0.0,0.0,0.016311,1.0,7,1,-1


In [233]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [234]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
neg_sample.shape, pos_sample.shape

((169, 14), (169, 14))

In [235]:
mod_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6321 entries, 3569 to 627
Data columns (total 14 columns):
Record_ID                 6321 non-null int64
Auction_ID                6321 non-null int64
Bidder_ID                 6321 non-null object
Bidder_Tendency           6321 non-null float64
Bidding_Ratio             6321 non-null float64
Successive_Outbidding     6321 non-null float64
Last_Bidding              6321 non-null float64
Auction_Bids              6321 non-null float64
Starting_Price_Average    6321 non-null float64
Early_Bidding             6321 non-null float64
Winning_Ratio             6321 non-null float64
Auction_Duration          6321 non-null int64
Class                     6321 non-null int64
class_test                6321 non-null int64
dtypes: float64(8), int64(5), object(1)
memory usage: 740.7+ KB


In [236]:
categorical_columns = [n for n in mod_data.columns if mod_data[n].dtype.name == 'object']
numerical_columns   = [n for n in mod_data.columns if mod_data[n].dtype.name != 'object']

In [237]:
xg_final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    xg_final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    xg_final_transformers.append((cont_col, cont_transformer))

In [238]:
xg_final_transformers

[('Bidder_ID',
  Pipeline(steps=[('selector', FeatureSelector(column='Bidder_ID')),
                  ('ohe', OHEEncoder(key='Bidder_ID'))])),
 ('Record_ID', Pipeline(steps=[('selector', NumberSelector(key='Record_ID')),
                  ('standard', StandardScaler())])),
 ('Auction_ID', Pipeline(steps=[('selector', NumberSelector(key='Auction_ID')),
                  ('standard', StandardScaler())])),
 ('Bidder_Tendency',
  Pipeline(steps=[('selector', NumberSelector(key='Bidder_Tendency')),
                  ('standard', StandardScaler())])),
 ('Bidding_Ratio',
  Pipeline(steps=[('selector', NumberSelector(key='Bidding_Ratio')),
                  ('standard', StandardScaler())])),
 ('Successive_Outbidding',
  Pipeline(steps=[('selector', NumberSelector(key='Successive_Outbidding')),
                  ('standard', StandardScaler())])),
 ('Last_Bidding',
  Pipeline(steps=[('selector', NumberSelector(key='Last_Bidding')),
                  ('standard', StandardScaler())])),
 ('Auction_

In [254]:
feats = FeatureUnion(xg_final_transformers)

feature_processing = Pipeline([('feats', feats)])

xg_pipeline = Pipeline([
    ('features',feats),
    ('classifier', xgboost.XGBClassifier(random_state = 42)),
])

xg_pipeline.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-2])
y_pred = xg_pipeline.predict(sample_test.iloc[:,:-2])

PU_results = np.array(evaluate_results(sample_test.iloc[:,-2], y_pred))

Classification results:
f1: 85.19%
roc: 98.26%
recall: 99.60%
precision: 74.43%


In [270]:
cols = ['f1', 'roc', 'rec', 'prc']
rows = ['normal model', 'PU model']

In [271]:
all_res = pd.DataFrame([results, PU_results], rows, cols)

In [272]:
all_res

Unnamed: 0,f1,roc,rec,prc
normal model,0.968504,0.989368,0.984,0.953488
PU model,0.851948,0.982581,0.995951,0.744327
