In [1]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import model_selection
import numpy as np

In [2]:
df = pd.read_csv('sea.csv', names = ['feature1','feature2','feature3','label'])

In [3]:
df

Unnamed: 0,feature1,feature2,feature3,label
0,6.677259,5.152133,2.982455,0
1,9.874437,8.817701,4.786266,1
2,7.118725,2.990575,1.964403,0
3,6.128244,8.449696,2.604408,1
4,7.592623,0.859845,5.763160,0
...,...,...,...,...
59995,4.760386,7.501301,5.680291,1
59996,7.873042,7.951943,1.213854,1
59997,8.430418,9.531408,1.034550,0
59998,6.438425,8.759620,6.009597,1


Take the first 15000 and train the model on it

In [4]:

df1  = df[:15000]

In [5]:
df1.label.value_counts()

1    9683
0    5317
Name: label, dtype: int64

In [6]:

X = df1[['feature1', 'feature2', 'feature3']].values
y = df1['label'].values


In [7]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
# fit model no training data
model = XGBClassifier(n_estimators = 500, max_depth=10)
model.fit(X_train, y_train)
print(model)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [8]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
# clf = model.fit(X_train, y_train,  eval_metric = 'logloss')



















In [9]:
cv_results

{'fit_time': array([6.30550694, 5.87707424, 7.93197083, 6.31817031, 5.05764794]),
 'score_time': array([0.08911419, 0.03206587, 0.0473597 , 0.04324794, 0.05784416]),
 'test_accuracy': array([0.85223881, 0.86318408, 0.8681592 , 0.8681592 , 0.85024876]),
 'test_precision_weighted': array([0.85064445, 0.86186032, 0.86707593, 0.86678074, 0.84902774]),
 'test_recall_weighted': array([0.85223881, 0.86318408, 0.8681592 , 0.8681592 , 0.85024876]),
 'test_f1_weighted': array([0.85084073, 0.86170325, 0.86690771, 0.8667476 , 0.84827401]),
 'test_roc_auc': array([0.85529158, 0.85587389, 0.87697703, 0.87996717, 0.85723713])}

In [10]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X_test)

In [11]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 86.99%


Use PSI to check model's current decile distribution

Get the probabilities of class 0 

In [12]:
predictions

array([[0.83737445, 0.16262552],
       [0.007029  , 0.992971  ],
       [0.04399091, 0.9560091 ],
       ...,
       [0.02260154, 0.97739846],
       [0.5994914 , 0.40050858],
       [0.08273053, 0.91726947]], dtype=float32)

In [13]:
predictions.shape

(4950, 2)

The first column in the matrix represents the propbability of being class 0

In [14]:
prob_0 = predictions[:,1]

In [15]:
df_prob_0 = pd.DataFrame(prob_0, columns = ['prob_0'])

In [16]:
df_prob_0

Unnamed: 0,prob_0
0,0.162626
1,0.992971
2,0.956009
3,0.995899
4,0.999719
...,...
4945,0.998658
4946,0.985793
4947,0.977398
4948,0.400509


In [17]:
conditions = [(df_prob_0.prob_0 >= 0.9),
              ((df_prob_0.prob_0 < 0.9) & (df_prob_0.prob_0 >= 0.8)),
              ((df_prob_0.prob_0 < 0.8) & (df_prob_0.prob_0 >= 0.7)),
              ((df_prob_0.prob_0 < 0.7) & (df_prob_0.prob_0 >= 0.6)),
              ((df_prob_0.prob_0 < 0.6) & (df_prob_0.prob_0 >= 0.5)),
              ((df_prob_0.prob_0 < 0.5) & (df_prob_0.prob_0 >= 0.4)),
              ((df_prob_0.prob_0 < 0.4) & (df_prob_0.prob_0 >= 0.3)),
              ((df_prob_0.prob_0 < 0.3) & (df_prob_0.prob_0 >= 0.2)),
              ((df_prob_0.prob_0 < 0.2) & (df_prob_0.prob_0 >= 0.1)),
              (df_prob_0.prob_0 < 0.1)]

choices = np.arange(1,11)

In [18]:
df_prob_0['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0['decile'] = df_prob_0['decile'].astype(int)

df_prob_0

Unnamed: 0,prob_0,decile
0,0.162626,9
1,0.992971,1
2,0.956009,1
3,0.995899,1
4,0.999719,1
...,...,...
4945,0.998658,1
4946,0.985793,1
4947,0.977398,1
4948,0.400509,6


Count number of values in each decile

In [19]:
traininig_distribution = df_prob_0.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
traininig_distribution = traininig_distribution.rename(columns = {'prob_0' : 'train_dist'})

In [20]:
traininig_distribution['train_dist_perc'] = np.round(100*traininig_distribution['train_dist']/traininig_distribution['train_dist'].sum(),1)

In [21]:
traininig_distribution

Unnamed: 0,decile,train_dist,train_dist_perc
0,1,2781,56.2
1,2,287,5.8
2,3,124,2.5
3,4,85,1.7
4,5,66,1.3
5,6,74,1.5
6,7,61,1.2
7,8,81,1.6
8,9,149,3.0
9,10,1242,25.1


Now keeping the model same make predictions on the next 15000 of SEA dataset and compare the decile distribution

In [22]:
df2=df[15000:30000]

In [23]:
df2.label.value_counts()

1    8677
0    6323
Name: label, dtype: int64

take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.

In [24]:

X2 = df2[['feature1', 'feature2', 'feature3']].values
y2 = df2['label'].values

In [25]:
seed = 7
test_size = 0.33
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=test_size, random_state=seed)

In [26]:
# make predictions for test data
# y2_pred = model.predict(X2_test)
y2_pred = model.predict(X2)
# predictions = [round(value) for value in y_pred]
# predictions = model.predict_proba(X2_test)
predictions = model.predict_proba(X2)

In [27]:
# evaluate predictions
# accuracy = accuracy_score(y2_test, y2_pred)
accuracy = accuracy_score(y2, y2_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 81.43%


Accuracy reduced for the changed concept

Check PSI

In [28]:
predictions

array([[0.01052308, 0.9894769 ],
       [0.9891025 , 0.01089752],
       [0.01394427, 0.98605573],
       ...,
       [0.04469311, 0.9553069 ],
       [0.00471586, 0.99528414],
       [0.12247115, 0.87752885]], dtype=float32)

In [29]:
prob_0_2 = predictions[:,1]

In [30]:
df_prob_0_2 = pd.DataFrame(prob_0_2, columns = ['prob_0'])

In [31]:
df_prob_0_2

Unnamed: 0,prob_0
0,0.989477
1,0.010898
2,0.986056
3,0.969428
4,0.977187
...,...
14995,0.901045
14996,0.984326
14997,0.955307
14998,0.995284


In [32]:
conditions = [(df_prob_0_2.prob_0 >= 0.9),
              ((df_prob_0_2.prob_0 < 0.9) & (df_prob_0_2.prob_0 >= 0.8)),
              ((df_prob_0_2.prob_0 < 0.8) & (df_prob_0_2.prob_0 >= 0.7)),
              ((df_prob_0_2.prob_0 < 0.7) & (df_prob_0_2.prob_0 >= 0.6)),
              ((df_prob_0_2.prob_0 < 0.6) & (df_prob_0_2.prob_0 >= 0.5)),
              ((df_prob_0_2.prob_0 < 0.5) & (df_prob_0_2.prob_0 >= 0.4)),
              ((df_prob_0_2.prob_0 < 0.4) & (df_prob_0_2.prob_0 >= 0.3)),
              ((df_prob_0_2.prob_0 < 0.3) & (df_prob_0_2.prob_0 >= 0.2)),
              ((df_prob_0_2.prob_0 < 0.2) & (df_prob_0_2.prob_0 >= 0.1)),
              (df_prob_0_2.prob_0 < 0.1)]

choices = np.arange(1,11)

In [33]:
df_prob_0_2['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_2['decile'] = df_prob_0_2['decile'].astype(int)

df_prob_0_2

Unnamed: 0,prob_0,decile
0,0.989477,1
1,0.010898,10
2,0.986056,1
3,0.969428,1
4,0.977187,1
...,...,...
14995,0.901045,1
14996,0.984326,1
14997,0.955307,1
14998,0.995284,1


In [34]:
serving_distribution = df_prob_0_2.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

In [35]:
serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

In [36]:
serving_distribution

Unnamed: 0,decile,serv_dist,serv_dist_perc
0,1,8491,56.6
1,2,822,5.5
2,3,391,2.6
3,4,270,1.8
4,5,188,1.3
5,6,173,1.2
6,7,214,1.4
7,8,256,1.7
8,9,436,2.9
9,10,3759,25.1


In [37]:
merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

In [38]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc
0,1,2781,56.2,8491,56.6
1,2,287,5.8,822,5.5
2,3,124,2.5,391,2.6
3,4,85,1.7,270,1.8
4,5,66,1.3,188,1.3
5,6,74,1.5,173,1.2
6,7,61,1.2,214,1.4
7,8,81,1.6,256,1.7
8,9,149,3.0,436,2.9
9,10,1242,25.1,3759,25.1


In [39]:
merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

In [40]:
merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

In [41]:
merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)

In [42]:
merged.PSI.sum()

0.001356335703262305

In [43]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc,s-t,ln(s/t),PSI
0,1,2781,56.2,8491,56.6,0.4,0.007092,2.8e-05
1,2,287,5.8,822,5.5,-0.3,-0.05311,0.000159
2,3,124,2.5,391,2.6,0.1,0.039221,3.9e-05
3,4,85,1.7,270,1.8,0.1,0.057158,5.7e-05
4,5,66,1.3,188,1.3,0.0,0.0,0.0
5,6,74,1.5,173,1.2,-0.3,-0.223144,0.000669
6,7,61,1.2,214,1.4,0.2,0.154151,0.000308
7,8,81,1.6,256,1.7,0.1,0.060625,6.1e-05
8,9,149,3.0,436,2.9,-0.1,-0.033902,3.4e-05
9,10,1242,25.1,3759,25.1,0.0,0.0,0.0


In [44]:
df3=df[30000:45000]

##take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.


X3 = df3[['feature1', 'feature2', 'feature3']].values
y3 = df3['label'].values

seed = 7
test_size = 0.33
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=test_size, random_state=seed)

# make predictions for test data
y3_pred = model.predict(X3_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X3_test)

# evaluate predictions
accuracy = accuracy_score(y3_test, y3_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Accuracy reduced for the changed concept



Accuracy: 81.94%


In [45]:
#Check PSI

predictions

prob_0_3 = predictions[:,0]

df_prob_0_3 = pd.DataFrame(prob_0_3, columns = ['prob_0'])

df_prob_0_3

conditions = [(df_prob_0_3.prob_0 >= 0.9),
              ((df_prob_0_3.prob_0 < 0.9) & (df_prob_0_3.prob_0 >= 0.8)),
              ((df_prob_0_3.prob_0 < 0.8) & (df_prob_0_3.prob_0 >= 0.7)),
              ((df_prob_0_3.prob_0 < 0.7) & (df_prob_0_3.prob_0 >= 0.6)),
              ((df_prob_0_3.prob_0 < 0.6) & (df_prob_0_3.prob_0 >= 0.5)),
              ((df_prob_0_3.prob_0 < 0.5) & (df_prob_0_3.prob_0 >= 0.4)),
              ((df_prob_0_3.prob_0 < 0.4) & (df_prob_0_3.prob_0 >= 0.3)),
              ((df_prob_0_3.prob_0 < 0.3) & (df_prob_0_3.prob_0 >= 0.2)),
              ((df_prob_0_3.prob_0 < 0.2) & (df_prob_0_3.prob_0 >= 0.1)),
              (df_prob_0_3.prob_0 < 0.1)]

choices = np.arange(1,11)

df_prob_0_3['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_3['decile'] = df_prob_0_3['decile'].astype(int)

df_prob_0_3

serving_distribution = df_prob_0_3.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

merged

merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)



In [46]:
merged.PSI.sum()


0.5439541765592253