In [1]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import model_selection
import numpy as np

In [2]:
df = pd.read_csv('sea.csv', names = ['feature1','feature2','feature3','label'])

Take the first 15000 and train the model on it

In [4]:
df1  = df[:15000]

In [5]:
X = df1[['feature1', 'feature2', 'feature3']].values
y = df1['label'].values

In [6]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
# fit model no training data
model = LogisticRegression()
model.fit(X_train, y_train)
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [7]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
# clf = model.fit(X_train, y_train,  eval_metric = 'logloss')

In [9]:
cv_results

{'fit_time': array([0.01299739, 0.00999975, 0.01200008, 0.01099491, 0.01200151]),
 'score_time': array([0.00599861, 0.00500035, 0.00500083, 0.00499988, 0.00500059]),
 'test_accuracy': array([0.8800995 , 0.88159204, 0.89850746, 0.89850746, 0.88059701]),
 'test_precision_weighted': array([0.8798492 , 0.88117885, 0.89987578, 0.89853657, 0.8816454 ]),
 'test_recall_weighted': array([0.8800995 , 0.88159204, 0.89850746, 0.89850746, 0.88059701]),
 'test_f1_weighted': array([0.87792055, 0.87973291, 0.89655439, 0.89680598, 0.87825352]),
 'test_roc_auc': array([0.86039396, 0.86286523, 0.86523345, 0.89128491, 0.86333586])}

In [10]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X_test)

In [11]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.33%


Use PSI to check model's current decile distribution

Get the probabilities of class 0 

In [12]:
predictions

array([[0.81697201, 0.18302799],
       [0.36382155, 0.63617845],
       [0.02920723, 0.97079277],
       ...,
       [0.02318988, 0.97681012],
       [0.36539917, 0.63460083],
       [0.39196588, 0.60803412]])

In [13]:
predictions.shape

(4950, 2)

The first column in the matrix represents the propbability of being class 0

In [14]:
prob_0 = predictions[:,0]

In [15]:
df_prob_0 = pd.DataFrame(prob_0, columns = ['prob_0'])

In [16]:
df_prob_0

Unnamed: 0,prob_0
0,0.816972
1,0.363822
2,0.029207
3,0.184542
4,0.008462
...,...
4945,0.168044
4946,0.416002
4947,0.023190
4948,0.365399


In [17]:
conditions = [(df_prob_0.prob_0 >= 0.9),
              ((df_prob_0.prob_0 < 0.9) & (df_prob_0.prob_0 >= 0.8)),
              ((df_prob_0.prob_0 < 0.8) & (df_prob_0.prob_0 >= 0.7)),
              ((df_prob_0.prob_0 < 0.7) & (df_prob_0.prob_0 >= 0.6)),
              ((df_prob_0.prob_0 < 0.6) & (df_prob_0.prob_0 >= 0.5)),
              ((df_prob_0.prob_0 < 0.5) & (df_prob_0.prob_0 >= 0.4)),
              ((df_prob_0.prob_0 < 0.4) & (df_prob_0.prob_0 >= 0.3)),
              ((df_prob_0.prob_0 < 0.3) & (df_prob_0.prob_0 >= 0.2)),
              ((df_prob_0.prob_0 < 0.2) & (df_prob_0.prob_0 >= 0.1)),
              (df_prob_0.prob_0 < 0.1)]

choices = np.arange(1,11)

In [18]:
df_prob_0['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0['decile'] = df_prob_0['decile'].astype(int)

df_prob_0

Unnamed: 0,prob_0,decile
0,0.816972,2
1,0.363822,7
2,0.029207,10
3,0.184542,9
4,0.008462,10
...,...,...
4945,0.168044,9
4946,0.416002,6
4947,0.023190,10
4948,0.365399,7


Count number of values in each decile

In [19]:
traininig_distribution = df_prob_0.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
traininig_distribution = traininig_distribution.rename(columns = {'prob_0' : 'train_dist'})

In [20]:
traininig_distribution['train_dist_perc'] = np.round(100*traininig_distribution['train_dist']/traininig_distribution['train_dist'].sum(),1)

In [21]:
traininig_distribution

Unnamed: 0,decile,train_dist,train_dist_perc
0,1,245,4.9
1,2,363,7.3
2,3,324,6.5
3,4,309,6.2
4,5,304,6.1
5,6,351,7.1
6,7,456,9.2
7,8,550,11.1
8,9,701,14.2
9,10,1347,27.2


Now keeping the model same make predictions on the next 15000 of SEA dataset and compare the decile distribution

In [22]:
df2=df[15000:30000]

In [23]:
df2['feature3'] = df2['feature3'] + 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.

In [24]:

X2 = df2[['feature1', 'feature2', 'feature3']].values
y2 = df2['label'].values

In [25]:
seed = 7
test_size = 0.33
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=test_size, random_state=seed)

In [26]:
# make predictions for test data
y2_pred = model.predict(X2_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X2_test)

In [27]:
# evaluate predictions
accuracy = accuracy_score(y2_test, y2_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 67.56%


Accuracy reduced for the changed concept

Check PSI

In [28]:
predictions

array([[0.09696916, 0.90303084],
       [0.10485944, 0.89514056],
       [0.2432488 , 0.7567512 ],
       ...,
       [0.05314284, 0.94685716],
       [0.06191929, 0.93808071],
       [0.17199534, 0.82800466]])

In [29]:
prob_0_2 = predictions[:,0]

In [31]:
df_prob_0_2 = pd.DataFrame(prob_0_2, columns = ['prob_0'])

In [32]:
df_prob_0_2

Unnamed: 0,prob_0
0,0.096969
1,0.104859
2,0.243249
3,0.119202
4,0.002192
...,...
4945,0.002292
4946,0.829049
4947,0.053143
4948,0.061919


In [33]:
conditions = [(df_prob_0_2.prob_0 >= 0.9),
              ((df_prob_0_2.prob_0 < 0.9) & (df_prob_0_2.prob_0 >= 0.8)),
              ((df_prob_0_2.prob_0 < 0.8) & (df_prob_0_2.prob_0 >= 0.7)),
              ((df_prob_0_2.prob_0 < 0.7) & (df_prob_0_2.prob_0 >= 0.6)),
              ((df_prob_0_2.prob_0 < 0.6) & (df_prob_0_2.prob_0 >= 0.5)),
              ((df_prob_0_2.prob_0 < 0.5) & (df_prob_0_2.prob_0 >= 0.4)),
              ((df_prob_0_2.prob_0 < 0.4) & (df_prob_0_2.prob_0 >= 0.3)),
              ((df_prob_0_2.prob_0 < 0.3) & (df_prob_0_2.prob_0 >= 0.2)),
              ((df_prob_0_2.prob_0 < 0.2) & (df_prob_0_2.prob_0 >= 0.1)),
              (df_prob_0_2.prob_0 < 0.1)]

choices = np.arange(1,11)

In [34]:
df_prob_0_2['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_2['decile'] = df_prob_0_2['decile'].astype(int)

df_prob_0_2

Unnamed: 0,prob_0,decile
0,0.096969,10
1,0.104859,9
2,0.243249,8
3,0.119202,9
4,0.002192,10
...,...,...
4945,0.002292,10
4946,0.829049,2
4947,0.053143,10
4948,0.061919,10


In [35]:
serving_distribution = df_prob_0_2.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

In [36]:
serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

In [37]:
merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

In [38]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc
0,2,363,7.3,98,2.0
1,3,324,6.5,153,3.1
2,4,309,6.2,180,3.6
3,5,304,6.1,170,3.4
4,6,351,7.1,228,4.6
5,7,456,9.2,274,5.5
6,8,550,11.1,405,8.2
7,9,701,14.2,761,15.4
8,10,1347,27.2,2681,54.2


In [39]:
merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

In [40]:
merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

In [41]:
merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)

In [42]:
merged.PSI.sum()

0.34950610209076305

In [43]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc,s-t,ln(s/t),PSI
0,2,363,7.3,98,2.0,-5.3,-1.294727,0.068621
1,3,324,6.5,153,3.1,-3.4,-0.7404,0.025174
2,4,309,6.2,180,3.6,-2.6,-0.543615,0.014134
3,5,304,6.1,170,3.4,-2.7,-0.584513,0.015782
4,6,351,7.1,228,4.6,-2.5,-0.434038,0.010851
5,7,456,9.2,274,5.5,-3.7,-0.514455,0.019035
6,8,550,11.1,405,8.2,-2.9,-0.302811,0.008782
7,9,701,14.2,761,15.4,1.2,0.081126,0.000974
8,10,1347,27.2,2681,54.2,27.0,0.689464,0.186155


In [44]:
df3=df[30000:45000]

##take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.


X3 = df3[['feature1', 'feature2', 'feature3']].values
y3 = df3['label'].values

seed = 7
test_size = 0.33
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=test_size, random_state=seed)

# make predictions for test data
y3_pred = model.predict(X3_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X3_test)

# evaluate predictions
accuracy = accuracy_score(y3_test, y3_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Accuracy reduced for the changed concept



Accuracy: 84.20%


In [41]:
#Check PSI

predictions

prob_0_3 = predictions[:,0]

df_prob_0_3 = pd.DataFrame(prob_0_3, columns = ['prob_0'])

df_prob_0_3

conditions = [(df_prob_0_3.prob_0 >= 0.9),
              ((df_prob_0_3.prob_0 < 0.9) & (df_prob_0_3.prob_0 >= 0.8)),
              ((df_prob_0_3.prob_0 < 0.8) & (df_prob_0_3.prob_0 >= 0.7)),
              ((df_prob_0_3.prob_0 < 0.7) & (df_prob_0_3.prob_0 >= 0.6)),
              ((df_prob_0_3.prob_0 < 0.6) & (df_prob_0_3.prob_0 >= 0.5)),
              ((df_prob_0_3.prob_0 < 0.5) & (df_prob_0_3.prob_0 >= 0.4)),
              ((df_prob_0_3.prob_0 < 0.4) & (df_prob_0_3.prob_0 >= 0.3)),
              ((df_prob_0_3.prob_0 < 0.3) & (df_prob_0_3.prob_0 >= 0.2)),
              ((df_prob_0_3.prob_0 < 0.2) & (df_prob_0_3.prob_0 >= 0.1)),
              (df_prob_0_3.prob_0 < 0.1)]

choices = np.arange(1,11)

df_prob_0_3['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_3['decile'] = df_prob_0_3['decile'].astype(int)

df_prob_0_3

serving_distribution = df_prob_0_3.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

merged

merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)



In [42]:
merged.PSI.sum()


0.0018929509395053928