In [25]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [5]:
# Load the semicolon-delimited dataset. The delimiter is passed by keyword because
# pandas 2.0+ no longer accepts it as a positional argument.
df = pd.read_csv('bank-full.csv', sep=';')

In [7]:
# DataFrame.info() prints its report and returns None, so it is called on its own
# rather than being passed to display(), which rendered a stray "None".
df.info()
display(df.head(), df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


None

In [8]:
# Binarise the target: 1 for 'yes', 0 otherwise. Accepting an already-converted 1 as
# well keeps the cell idempotent - comparing against 'yes' alone would silently reset
# every label to 0 when the cell is run a second time.
df['y'] = df['y'].isin(['yes', 1]).astype('int64')

In [9]:
# Class balance of the binarised target.
df.loc[:, 'y'].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [10]:
# Preview the string-typed (object) columns that still need encoding.
df.select_dtypes(include='object').head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,management,married,tertiary,no,yes,no,unknown,may,unknown
1,technician,single,secondary,no,yes,no,unknown,may,unknown
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown
4,unknown,single,unknown,no,no,no,unknown,may,unknown


In [11]:
# How many rows fall in each month abbreviation.
df.loc[:, "month"].value_counts()

may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: month, dtype: int64

In [12]:
# Month abbreviation -> calendar month number, nested under the column name so the
# whole dict can be handed to DataFrame.replace.
_month_abbreviations = ("jan", "feb", "mar", "apr", "may", "jun",
                        "jul", "aug", "sep", "oct", "nov", "dec")
cleanup_nums = {
    "month": {abbr: number for number, abbr in enumerate(_month_abbreviations, start=1)}
}

In [13]:
# Swap month abbreviations for their numbers. The explicit cast pins the column to
# int64 instead of relying on replace() silently downcasting the object column,
# a behaviour pandas 2.2 deprecates.
df = df.replace(cleanup_nums).astype({"month": "int64"})

In [14]:
# Hand-picked integer codes for the remaining low-cardinality string columns.
# Note that 'divorced' shares a code with 'single', education 'unknown' shares a
# code with 'primary', and poutcome 'other' shares a code with 'unknown'.
categorical_codes = {
    'marital': {'married': 1, 'single': 0, 'divorced': 0},
    'education': {'tertiary': 3, 'secondary': 2, 'primary': 1, 'unknown': 1},
    'default': {'yes': 1, 'no': 0},
    'housing': {'yes': 1, 'no': 0},
    'loan': {'yes': 1, 'no': 0},
    'contact': {'cellular': 2, 'telephone': 1, 'unknown': 0},
    'poutcome': {'failure': 2, 'success': 1, 'other': 0, 'unknown': 0},
}

# One loop instead of seven copy-pasted statements. The explicit int64 cast avoids
# depending on replace() downcasting object columns (deprecated in pandas 2.2) and
# fails loudly if a column still holds a string that has no code.
for column_name, codes in categorical_codes.items():
    df[column_name] = df[column_name].replace(codes).astype('int64')

In [15]:
# Check the frame after encoding.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,1,3,0,2143,1,0,0,5,5,261,1,-1,0,0,0
1,44,technician,0,2,0,29,1,0,0,5,5,151,1,-1,0,0,0
2,33,entrepreneur,1,2,0,2,1,1,0,5,5,76,1,-1,0,0,0
3,47,blue-collar,1,1,0,1506,1,0,0,5,5,92,1,-1,0,0,0
4,33,unknown,0,1,0,1,0,0,0,5,5,198,1,-1,0,0,0


In [16]:
# Convert `job` to an (unordered) categorical so it can be turned into integer codes.
df["job"] = df["job"].astype(pd.CategoricalDtype())

In [18]:
# Inspect the column dtypes after the conversions above.
df.dtypes

age             int64
job          category
marital         int64
education       int64
default         int64
balance         int64
housing         int64
loan            int64
contact         int64
day             int64
month           int64
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome        int64
y               int64
dtype: object

In [19]:
# Replace each job category with its integer category code.
job_codes = df["job"].cat.codes
df["job"] = job_codes

In [20]:
# Widen the job codes to int64.
df["job"] = df["job"].astype(np.int64)

In [21]:
# `train_test_split` is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.

# Features are every column except the last; the target `y` is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [24]:
# Baseline: XGBoost with default settings, fitted on the training split and used to
# predict hard class labels for the held-out split.
model = xgb.XGBClassifier().fit(X_train, y_train)
y_predict = model.predict(X_test)

In [26]:
def evaluate_results(y_test, y_predict):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_predict : array-like
        Predictions to score. Every metric, including ROC AUC, is computed from
        this same argument.

    Returns
    -------
    tuple
        ``(f1, precision, recall, roc)`` in that order.
    """
    scores = (
        f1_score(y_test, y_predict),
        precision_score(y_test, y_predict, average='binary'),
        recall_score(y_test, y_predict, average='binary'),
        roc_auc_score(y_test, y_predict),
    )
    return scores

In [27]:
# Metrics of the baseline model on the held-out split: (f1, precision, recall, roc).
evaluate_results(y_test=y_test, y_predict=y_predict)

(0.5291181364392679,
 0.6060991105463787,
 0.46948818897637795,
 0.7154342651621644)

In [28]:
# Simulate a positive-unlabeled (PU) setting: keep the label on a random 10% of the
# true positives and mark every other row as unlabeled (-1).
mod_data = df.copy()

# Seeded generator: the global np.random.shuffle call used before picked a different
# subset on every run, so the numbers below could not be reproduced.
rng = np.random.default_rng(7)

# Index labels (not positions) of the true positives, so the .loc assignment below
# stays correct even if the frame does not carry a default RangeIndex.
pos_ind = rng.permutation(mod_data.index[mod_data['y'] == 1].to_numpy())
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data['class_test'].value_counts())

# x_data / y_labeled / y_positive are built in the next cell; the duplicate positional
# (iloc) versions that used to sit here were immediately overwritten and are dropped.

Using 529/5289 as positives and unlabeling the rest
target variable:
 -1    44682
 1      529
Name: class_test, dtype: int64


In [29]:
# Split the frame into plain arrays: features, the masked PU label and the true label.
feature_frame = mod_data.drop(columns=['y', 'class_test'])
x_data = feature_frame.values
y_labeled = mod_data['class_test'].values
y_positive = mod_data['y'].values

In [30]:
# Shuffle once (seeded so the split is repeatable), then carve the frame into the
# labelled positives, an equally sized slice of unlabeled rows used as negatives,
# and the remaining unlabeled rows kept for evaluation.
mod_data = mod_data.sample(frac=1, random_state=7)

# Each filter is computed once instead of being re-evaluated for every slice.
pos_sample = mod_data[mod_data['class_test'] == 1]
unlabeled = mod_data[mod_data['class_test'] == -1]
neg_sample = unlabeled[:len(pos_sample)]
sample_test = unlabeled[len(pos_sample):]
print(neg_sample.shape, pos_sample.shape)

# Interleave the two halves so the training rows are not ordered by class.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=7)

(529, 18) (529, 18)


In [31]:
# Train a dedicated classifier on the balanced sample. Previously `model_rns` was
# created but fit/predict were called on `model`, which silently overwrote the
# baseline classifier and left `model_rns` untrained.
# NOTE(review): the training target is the ground-truth `y`, not the masked
# `class_test` label, so hidden positives among the sampled "negatives" are still
# presented as positives - confirm this is intended for the PU experiment.
model_rns = xgb.XGBClassifier()
model_rns.fit(sample_train.drop(['y', 'class_test'], axis=1).values,
              sample_train.loc[:, 'y'].values)
y_predict_rns = model_rns.predict(sample_test.drop(['y', 'class_test'], axis=1).values)

evaluate_results(sample_test.loc[:, 'y'].values, y_predict_rns)

(0.4852871939736346,
 0.33558521894839655,
 0.8761155971100723,
 0.8345898542246688)

In [32]:
# Side-by-side comparison of the two approaches, rounded to two decimals.
metric_names = ['f1', 'precision', 'recall', 'roc']
scores_by_approach = {
    'Simple xgboost': evaluate_results(y_test, y_predict),
    'random negative sampling': evaluate_results(sample_test.loc[:, 'y'].values, y_predict_rns),
}
pd.DataFrame(
    list(scores_by_approach.values()),
    columns=metric_names,
    index=list(scores_by_approach.keys()),
).round(2)

Unnamed: 0,f1,precision,recall,roc
Simple xgboost,0.53,0.61,0.47,0.72
random negative sampling,0.49,0.34,0.88,0.83
