In [1]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import model_selection
import numpy as np

In [2]:
df = pd.read_csv('sea.csv', names = ['feature1','feature2','feature3','label'])

Take the first 15000 and train the model on it

In [3]:

df1  = df[:15000]

In [4]:

X = df1[['feature1', 'feature2', 'feature3']].values
y = df1['label'].values


In [5]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
# fit model no training data
model = LogisticRegression()
model.fit(X_train, y_train)
print(model)

LogisticRegression()


In [6]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
# clf = model.fit(X_train, y_train,  eval_metric = 'logloss')

In [7]:
cv_results

{'fit_time': array([0.04979825, 0.0285728 , 0.02445245, 0.0708468 , 0.0285089 ]),
 'score_time': array([0.00538754, 0.00511265, 0.00519085, 0.00530291, 0.0053184 ]),
 'test_accuracy': array([0.8800995 , 0.88159204, 0.89850746, 0.89850746, 0.88059701]),
 'test_precision_weighted': array([0.8798492 , 0.88117885, 0.89987578, 0.89853657, 0.8816454 ]),
 'test_recall_weighted': array([0.8800995 , 0.88159204, 0.89850746, 0.89850746, 0.88059701]),
 'test_f1_weighted': array([0.87792055, 0.87973291, 0.89655439, 0.89680598, 0.87825352]),
 'test_roc_auc': array([0.86039396, 0.86286523, 0.86523345, 0.89128491, 0.86333586])}

In [8]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X_test)

In [9]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.33%


Use PSI to check model's current decile distribution

Get the probabilities of class 0 

In [10]:
predictions

array([[0.81697201, 0.18302799],
       [0.36382155, 0.63617845],
       [0.02920723, 0.97079277],
       ...,
       [0.02318988, 0.97681012],
       [0.36539917, 0.63460083],
       [0.39196588, 0.60803412]])

In [11]:
predictions.shape

(4950, 2)

The first column in the matrix represents the propbability of being class 0

In [12]:
prob_0 = predictions[:,0]

In [13]:
df_prob_0 = pd.DataFrame(prob_0, columns = ['prob_0'])

In [14]:
df_prob_0

Unnamed: 0,prob_0
0,0.816972
1,0.363822
2,0.029207
3,0.184542
4,0.008462
...,...
4945,0.168044
4946,0.416002
4947,0.023190
4948,0.365399


In [15]:
conditions = [(df_prob_0.prob_0 >= 0.9),
              ((df_prob_0.prob_0 < 0.9) & (df_prob_0.prob_0 >= 0.8)),
              ((df_prob_0.prob_0 < 0.8) & (df_prob_0.prob_0 >= 0.7)),
              ((df_prob_0.prob_0 < 0.7) & (df_prob_0.prob_0 >= 0.6)),
              ((df_prob_0.prob_0 < 0.6) & (df_prob_0.prob_0 >= 0.5)),
              ((df_prob_0.prob_0 < 0.5) & (df_prob_0.prob_0 >= 0.4)),
              ((df_prob_0.prob_0 < 0.4) & (df_prob_0.prob_0 >= 0.3)),
              ((df_prob_0.prob_0 < 0.3) & (df_prob_0.prob_0 >= 0.2)),
              ((df_prob_0.prob_0 < 0.2) & (df_prob_0.prob_0 >= 0.1)),
              (df_prob_0.prob_0 < 0.1)]

choices = np.arange(1,11)

In [16]:
df_prob_0['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0['decile'] = df_prob_0['decile'].astype(int)

df_prob_0

Unnamed: 0,prob_0,decile
0,0.816972,2
1,0.363822,7
2,0.029207,10
3,0.184542,9
4,0.008462,10
...,...,...
4945,0.168044,9
4946,0.416002,6
4947,0.023190,10
4948,0.365399,7


Count number of values in each decile

In [17]:
traininig_distribution = df_prob_0.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
traininig_distribution = traininig_distribution.rename(columns = {'prob_0' : 'train_dist'})

In [18]:
traininig_distribution['train_dist_perc'] = np.round(100*traininig_distribution['train_dist']/traininig_distribution['train_dist'].sum(),1)

In [19]:
traininig_distribution

Unnamed: 0,decile,train_dist,train_dist_perc
0,1,245,4.9
1,2,363,7.3
2,3,324,6.5
3,4,309,6.2
4,5,304,6.1
5,6,351,7.1
6,7,456,9.2
7,8,550,11.1
8,9,701,14.2
9,10,1347,27.2


Now keeping the model same make predictions on the next 15000 of SEA dataset and compare the decile distribution

In [20]:
df2=df[15000:30000]

take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.

In [21]:

X2 = df2[['feature1', 'feature2', 'feature3']].values
y2 = df2['label'].values

In [22]:
seed = 7
test_size = 0.33
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=test_size, random_state=seed)

In [23]:
# make predictions for test data
y2_pred = model.predict(X2_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X2_test)

In [24]:
# evaluate predictions
accuracy = accuracy_score(y2_test, y2_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 82.46%


Accuracy reduced for the changed concept

Check PSI

In [25]:
predictions

array([[0.30432516, 0.69567484],
       [0.32305185, 0.67694815],
       [0.56700166, 0.43299834],
       ...,
       [0.18609494, 0.81390506],
       [0.21191409, 0.78808591],
       [0.45835339, 0.54164661]])

In [26]:
prob_0_2 = predictions[:,0]

In [27]:
df_prob_0_2 = pd.DataFrame(prob_0_2, columns = ['prob_0'])

In [28]:
df_prob_0_2

Unnamed: 0,prob_0
0,0.304325
1,0.323052
2,0.567002
3,0.355391
4,0.008870
...,...
4945,0.009271
4946,0.951822
4947,0.186095
4948,0.211914


In [29]:
conditions = [(df_prob_0_2.prob_0 >= 0.9),
              ((df_prob_0_2.prob_0 < 0.9) & (df_prob_0_2.prob_0 >= 0.8)),
              ((df_prob_0_2.prob_0 < 0.8) & (df_prob_0_2.prob_0 >= 0.7)),
              ((df_prob_0_2.prob_0 < 0.7) & (df_prob_0_2.prob_0 >= 0.6)),
              ((df_prob_0_2.prob_0 < 0.6) & (df_prob_0_2.prob_0 >= 0.5)),
              ((df_prob_0_2.prob_0 < 0.5) & (df_prob_0_2.prob_0 >= 0.4)),
              ((df_prob_0_2.prob_0 < 0.4) & (df_prob_0_2.prob_0 >= 0.3)),
              ((df_prob_0_2.prob_0 < 0.3) & (df_prob_0_2.prob_0 >= 0.2)),
              ((df_prob_0_2.prob_0 < 0.2) & (df_prob_0_2.prob_0 >= 0.1)),
              (df_prob_0_2.prob_0 < 0.1)]

choices = np.arange(1,11)

In [30]:
df_prob_0_2['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_2['decile'] = df_prob_0_2['decile'].astype(int)

df_prob_0_2

Unnamed: 0,prob_0,decile
0,0.304325,7
1,0.323052,7
2,0.567002,5
3,0.355391,7
4,0.008870,10
...,...,...
4945,0.009271,10
4946,0.951822,1
4947,0.186095,9
4948,0.211914,8


In [31]:
serving_distribution = df_prob_0_2.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

In [32]:
serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

In [33]:
merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

In [34]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc
0,1,245,4.9,269,5.4
1,2,363,7.3,343,6.9
2,3,324,6.5,299,6.0
3,4,309,6.2,300,6.1
4,5,304,6.1,310,6.3
5,6,351,7.1,362,7.3
6,7,456,9.2,438,8.8
7,8,550,11.1,561,11.3
8,9,701,14.2,712,14.4
9,10,1347,27.2,1356,27.4


In [35]:
merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

In [36]:
merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

In [37]:
merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)

In [38]:
merged.PSI.sum()

0.0015039322495797943

In [39]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc,s-t,ln(s/t),PSI
0,1,245,4.9,269,5.4,0.5,0.097164,0.000486
1,2,363,7.3,343,6.9,-0.4,-0.056353,0.000225
2,3,324,6.5,299,6.0,-0.5,-0.080043,0.0004
3,4,309,6.2,300,6.1,-0.1,-0.016261,1.6e-05
4,5,304,6.1,310,6.3,0.2,0.032261,6.5e-05
5,6,351,7.1,362,7.3,0.2,0.02778,5.6e-05
6,7,456,9.2,438,8.8,-0.4,-0.044452,0.000178
7,8,550,11.1,561,11.3,0.2,0.017858,3.6e-05
8,9,701,14.2,712,14.4,0.2,0.013986,2.8e-05
9,10,1347,27.2,1356,27.4,0.2,0.007326,1.5e-05


In [40]:
df3=df[30000:45000]

##take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.


X3 = df3[['feature1', 'feature2', 'feature3']].values
y3 = df3['label'].values

seed = 7
test_size = 0.33
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=test_size, random_state=seed)

# make predictions for test data
y3_pred = model.predict(X3_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X3_test)

# evaluate predictions
accuracy = accuracy_score(y3_test, y3_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Accuracy reduced for the changed concept



Accuracy: 84.20%


In [41]:
#Check PSI

predictions

prob_0_3 = predictions[:,0]

df_prob_0_3 = pd.DataFrame(prob_0_3, columns = ['prob_0'])

df_prob_0_3

conditions = [(df_prob_0_3.prob_0 >= 0.9),
              ((df_prob_0_3.prob_0 < 0.9) & (df_prob_0_3.prob_0 >= 0.8)),
              ((df_prob_0_3.prob_0 < 0.8) & (df_prob_0_3.prob_0 >= 0.7)),
              ((df_prob_0_3.prob_0 < 0.7) & (df_prob_0_3.prob_0 >= 0.6)),
              ((df_prob_0_3.prob_0 < 0.6) & (df_prob_0_3.prob_0 >= 0.5)),
              ((df_prob_0_3.prob_0 < 0.5) & (df_prob_0_3.prob_0 >= 0.4)),
              ((df_prob_0_3.prob_0 < 0.4) & (df_prob_0_3.prob_0 >= 0.3)),
              ((df_prob_0_3.prob_0 < 0.3) & (df_prob_0_3.prob_0 >= 0.2)),
              ((df_prob_0_3.prob_0 < 0.2) & (df_prob_0_3.prob_0 >= 0.1)),
              (df_prob_0_3.prob_0 < 0.1)]

choices = np.arange(1,11)

df_prob_0_3['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_3['decile'] = df_prob_0_3['decile'].astype(int)

df_prob_0_3

serving_distribution = df_prob_0_3.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

merged

merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)



In [42]:
merged.PSI.sum()


0.0018929509395053928