In [2]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import model_selection
import numpy as np

In [3]:
df = pd.read_csv('sea.csv', names = ['feature1','feature2','feature3','label'])

In [4]:
df

Unnamed: 0,feature1,feature2,feature3,label
0,6.677259,5.152133,2.982455,0
1,9.874437,8.817701,4.786266,1
2,7.118725,2.990575,1.964403,0
3,6.128244,8.449696,2.604408,1
4,7.592623,0.859845,5.763160,0
...,...,...,...,...
59995,4.760386,7.501301,5.680291,1
59996,7.873042,7.951943,1.213854,1
59997,8.430418,9.531408,1.034550,0
59998,6.438425,8.759620,6.009597,1


Take the first 15000 and train the model on it

In [5]:

df1  = df[:15000]

In [7]:

X = df1[['feature1', 'feature2', 'feature3']].values
y = df1['label'].values


In [9]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
# fit model no training data
model = RandomForestClassifier(n_estimators = 500, max_depth=10)
model.fit(X_train, y_train)
print(model)

RandomForestClassifier(max_depth=10, n_estimators=500)


In [10]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
# clf = model.fit(X_train, y_train,  eval_metric = 'logloss')

In [11]:
cv_results

{'fit_time': array([3.44389129, 3.41538978, 3.45102859, 3.42449641, 3.42601275]),
 'score_time': array([0.25185347, 0.2479434 , 0.24998403, 0.25154591, 0.24822855]),
 'test_accuracy': array([0.87412935, 0.87711443, 0.89452736, 0.8960199 , 0.88059701]),
 'test_precision_weighted': array([0.87338224, 0.87636283, 0.89497756, 0.89577839, 0.88121108]),
 'test_recall_weighted': array([0.87412935, 0.87711443, 0.89452736, 0.8960199 , 0.88059701]),
 'test_f1_weighted': array([0.87207314, 0.87537241, 0.89282146, 0.89439768, 0.87845975]),
 'test_roc_auc': array([0.85781081, 0.85749883, 0.87181455, 0.89495125, 0.8633027 ])}

In [12]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X_test)

In [13]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.27%


Use PSI to check model's current decile distribution

Get the probabilities of class 0 

In [14]:
predictions

array([[0.85385989, 0.14614011],
       [0.12938216, 0.87061784],
       [0.09351713, 0.90648287],
       ...,
       [0.10114364, 0.89885636],
       [0.15995168, 0.84004832],
       [0.16562687, 0.83437313]])

In [15]:
predictions.shape

(4950, 2)

The first column in the matrix represents the propbability of being class 0

In [19]:
prob_0 = predictions[:,1]

In [20]:
df_prob_0 = pd.DataFrame(prob_0, columns = ['prob_0'])

In [21]:
df_prob_0

Unnamed: 0,prob_0
0,0.146140
1,0.870618
2,0.906483
3,0.881069
4,0.918362
...,...
4945,0.912150
4946,0.793421
4947,0.898856
4948,0.840048


In [22]:
conditions = [(df_prob_0.prob_0 >= 0.9),
              ((df_prob_0.prob_0 < 0.9) & (df_prob_0.prob_0 >= 0.8)),
              ((df_prob_0.prob_0 < 0.8) & (df_prob_0.prob_0 >= 0.7)),
              ((df_prob_0.prob_0 < 0.7) & (df_prob_0.prob_0 >= 0.6)),
              ((df_prob_0.prob_0 < 0.6) & (df_prob_0.prob_0 >= 0.5)),
              ((df_prob_0.prob_0 < 0.5) & (df_prob_0.prob_0 >= 0.4)),
              ((df_prob_0.prob_0 < 0.4) & (df_prob_0.prob_0 >= 0.3)),
              ((df_prob_0.prob_0 < 0.3) & (df_prob_0.prob_0 >= 0.2)),
              ((df_prob_0.prob_0 < 0.2) & (df_prob_0.prob_0 >= 0.1)),
              (df_prob_0.prob_0 < 0.1)]

choices = np.arange(1,11)

In [33]:
df_prob_0['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0['decile'] = df_prob_0['decile'].astype(int)

df_prob_0

Unnamed: 0,prob_0,decile
0,0.146140,9
1,0.870618,2
2,0.906483,1
3,0.881069,2
4,0.918362,1
...,...,...
4945,0.912150,1
4946,0.793421,3
4947,0.898856,2
4948,0.840048,2


Count number of values in each decile

In [34]:
traininig_distribution = df_prob_0.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
traininig_distribution = traininig_distribution.rename(columns = {'prob_0' : 'train_dist'})

In [35]:
traininig_distribution['train_dist_perc'] = np.round(100*traininig_distribution['train_dist']/traininig_distribution['train_dist'].sum(),1)

In [36]:
traininig_distribution

Unnamed: 0,decile,train_dist,train_dist_perc
0,1,774,15.6
1,2,2314,46.7
2,3,176,3.6
3,4,73,1.5
4,5,41,0.8
5,6,52,1.1
6,7,56,1.1
7,8,162,3.3
8,9,1200,24.2
9,10,102,2.1


Now keeping the model same make predictions on the next 15000 of SEA dataset and compare the decile distribution

In [37]:
df2=df[15000:30000]

In [38]:
df2['feature3'] = df2['feature3']+3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
df2.label.value_counts()

1    8677
0    6323
Name: label, dtype: int64

take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.

In [40]:

X2 = df2[['feature1', 'feature2', 'feature3']].values
y2 = df2['label'].values

In [41]:
seed = 7
test_size = 0.33
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=test_size, random_state=seed)

In [42]:
# make predictions for test data
# y2_pred = model.predict(X2_test)
y2_pred = model.predict(X2)
# predictions = [round(value) for value in y_pred]
# predictions = model.predict_proba(X2_test)
predictions = model.predict_proba(X2)

In [43]:
# evaluate predictions
# accuracy = accuracy_score(y2_test, y2_pred)
accuracy = accuracy_score(y2, y2_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 59.39%


Accuracy reduced for the changed concept

Check PSI

In [44]:
predictions

array([[0.2762503 , 0.7237497 ],
       [0.16625255, 0.83374745],
       [0.1029871 , 0.8970129 ],
       ...,
       [0.22376366, 0.77623634],
       [0.32624276, 0.67375724],
       [0.09394687, 0.90605313]])

In [45]:
prob_0_2 = predictions[:,1]

In [46]:
df_prob_0_2 = pd.DataFrame(prob_0_2, columns = ['prob_0'])

In [47]:
df_prob_0_2

Unnamed: 0,prob_0
0,0.723750
1,0.833747
2,0.897013
3,0.683709
4,0.608960
...,...
14995,0.651190
14996,0.814978
14997,0.776236
14998,0.673757


In [48]:
conditions = [(df_prob_0_2.prob_0 >= 0.9),
              ((df_prob_0_2.prob_0 < 0.9) & (df_prob_0_2.prob_0 >= 0.8)),
              ((df_prob_0_2.prob_0 < 0.8) & (df_prob_0_2.prob_0 >= 0.7)),
              ((df_prob_0_2.prob_0 < 0.7) & (df_prob_0_2.prob_0 >= 0.6)),
              ((df_prob_0_2.prob_0 < 0.6) & (df_prob_0_2.prob_0 >= 0.5)),
              ((df_prob_0_2.prob_0 < 0.5) & (df_prob_0_2.prob_0 >= 0.4)),
              ((df_prob_0_2.prob_0 < 0.4) & (df_prob_0_2.prob_0 >= 0.3)),
              ((df_prob_0_2.prob_0 < 0.3) & (df_prob_0_2.prob_0 >= 0.2)),
              ((df_prob_0_2.prob_0 < 0.2) & (df_prob_0_2.prob_0 >= 0.1)),
              (df_prob_0_2.prob_0 < 0.1)]

choices = np.arange(1,11)

In [49]:
df_prob_0_2['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_2['decile'] = df_prob_0_2['decile'].astype(int)

df_prob_0_2

Unnamed: 0,prob_0,decile
0,0.723750,3
1,0.833747,2
2,0.897013,2
3,0.683709,4
4,0.608960,4
...,...,...
14995,0.651190,4
14996,0.814978,2
14997,0.776236,3
14998,0.673757,4


In [50]:
serving_distribution = df_prob_0_2.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

In [51]:
serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

In [52]:
serving_distribution

Unnamed: 0,decile,serv_dist,serv_dist_perc
0,1,1593,10.6
1,2,4631,30.9
2,3,3226,21.5
3,4,4582,30.5
4,5,654,4.4
5,6,54,0.4
6,7,46,0.3
7,8,85,0.6
8,9,126,0.8
9,10,3,0.0


In [53]:
merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

In [54]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc
0,1,774,15.6,1593,10.6
1,2,2314,46.7,4631,30.9
2,3,176,3.6,3226,21.5
3,4,73,1.5,4582,30.5
4,5,41,0.8,654,4.4
5,6,52,1.1,54,0.4
6,7,56,1.1,46,0.3
7,8,162,3.3,85,0.6
8,9,1200,24.2,126,0.8
9,10,102,2.1,3,0.0


In [59]:
merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

In [60]:
merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

In [61]:
merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)

In [62]:
merged.PSI.sum()

inf

In [43]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc,s-t,ln(s/t),PSI
0,1,2781,56.2,8491,56.6,0.4,0.007092,2.8e-05
1,2,287,5.8,822,5.5,-0.3,-0.05311,0.000159
2,3,124,2.5,391,2.6,0.1,0.039221,3.9e-05
3,4,85,1.7,270,1.8,0.1,0.057158,5.7e-05
4,5,66,1.3,188,1.3,0.0,0.0,0.0
5,6,74,1.5,173,1.2,-0.3,-0.223144,0.000669
6,7,61,1.2,214,1.4,0.2,0.154151,0.000308
7,8,81,1.6,256,1.7,0.1,0.060625,6.1e-05
8,9,149,3.0,436,2.9,-0.1,-0.033902,3.4e-05
9,10,1242,25.1,3759,25.1,0.0,0.0,0.0


In [44]:
df3=df[30000:45000]

##take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.


X3 = df3[['feature1', 'feature2', 'feature3']].values
y3 = df3['label'].values

seed = 7
test_size = 0.33
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=test_size, random_state=seed)

# make predictions for test data
y3_pred = model.predict(X3_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X3_test)

# evaluate predictions
accuracy = accuracy_score(y3_test, y3_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Accuracy reduced for the changed concept



Accuracy: 81.94%


In [45]:
#Check PSI

predictions

prob_0_3 = predictions[:,0]

df_prob_0_3 = pd.DataFrame(prob_0_3, columns = ['prob_0'])

df_prob_0_3

conditions = [(df_prob_0_3.prob_0 >= 0.9),
              ((df_prob_0_3.prob_0 < 0.9) & (df_prob_0_3.prob_0 >= 0.8)),
              ((df_prob_0_3.prob_0 < 0.8) & (df_prob_0_3.prob_0 >= 0.7)),
              ((df_prob_0_3.prob_0 < 0.7) & (df_prob_0_3.prob_0 >= 0.6)),
              ((df_prob_0_3.prob_0 < 0.6) & (df_prob_0_3.prob_0 >= 0.5)),
              ((df_prob_0_3.prob_0 < 0.5) & (df_prob_0_3.prob_0 >= 0.4)),
              ((df_prob_0_3.prob_0 < 0.4) & (df_prob_0_3.prob_0 >= 0.3)),
              ((df_prob_0_3.prob_0 < 0.3) & (df_prob_0_3.prob_0 >= 0.2)),
              ((df_prob_0_3.prob_0 < 0.2) & (df_prob_0_3.prob_0 >= 0.1)),
              (df_prob_0_3.prob_0 < 0.1)]

choices = np.arange(1,11)

df_prob_0_3['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_3['decile'] = df_prob_0_3['decile'].astype(int)

df_prob_0_3

serving_distribution = df_prob_0_3.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

merged

merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)



In [46]:
merged.PSI.sum()


0.5439541765592253