In [1]:
!pip install xgboost



In [2]:
from numpy import loadtxt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn import model_selection
import numpy as np


In [3]:
df = pd.read_csv('sea.csv', names = ['feature1','feature2','feature3','label'])

In [4]:
df

Unnamed: 0,feature1,feature2,feature3,label
0,6.677259,5.152133,2.982455,0
1,9.874437,8.817701,4.786266,1
2,7.118725,2.990575,1.964403,0
3,6.128244,8.449696,2.604408,1
4,7.592623,0.859845,5.763160,0
...,...,...,...,...
59995,4.760386,7.501301,5.680291,1
59996,7.873042,7.951943,1.213854,1
59997,8.430418,9.531408,1.034550,0
59998,6.438425,8.759620,6.009597,1


Take the first 15000 and train the model on it

In [5]:
df1  = df[:15000]

In [6]:
df1.label.value_counts()

1    9683
0    5317
Name: label, dtype: int64

In [7]:
X = df1[['feature1', 'feature2', 'feature3']].values
y = df1['label'].values

In [8]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

Scale the training set

In [28]:
# scaler = MinMaxScaler()
# scaler.fit(X_train)
# scaler.fit(X_test)

# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)  # separate scaling, because train set is supposed to be unseen

MinMaxScaler(copy=True, feature_range=(0, 1))

In [9]:
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
# fit model no training data
model = SVC(kernel = 'linear' , C = 0.25, probability = True)
model.fit(X_train, y_train)
print(model)

SVC(C=0.25, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


In [10]:
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
# clf = model.fit(X_train, y_train,  eval_metric = 'logloss')

In [11]:
cv_results

{'fit_time': array([4.55133605, 4.19502139, 4.14180875, 4.43121886, 4.44985533]),
 'score_time': array([0.07303882, 0.07105374, 0.07104015, 0.07604194, 0.07702112]),
 'test_accuracy': array([0.87761194, 0.88109453, 0.89751244, 0.89701493, 0.87711443]),
 'test_precision_weighted': array([0.87761973, 0.88082777, 0.89895962, 0.8972768 , 0.87854109]),
 'test_recall_weighted': array([0.87761194, 0.88109453, 0.89751244, 0.89701493, 0.87711443]),
 'test_f1_weighted': array([0.87512771, 0.87909543, 0.89549407, 0.89511736, 0.87445699]),
 'test_roc_auc': array([0.86044641, 0.86290765, 0.86525393, 0.8912684 , 0.86352304])}

In [12]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X_test)

In [13]:
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.15%


Use PSI to check model's current decile distribution

Get the probabilities of class 0 

In [14]:
predictions

array([[0.81745522, 0.18254478],
       [0.35948897, 0.64051103],
       [0.02902763, 0.97097237],
       ...,
       [0.02341872, 0.97658128],
       [0.36016359, 0.63983641],
       [0.39229622, 0.60770378]])

In [15]:
predictions.shape

(4950, 2)

The first column in the matrix represents the propbability of being class 0

In [16]:
prob_0 = predictions[:,1]

In [17]:
df_prob_0 = pd.DataFrame(prob_0, columns = ['prob_0'])

In [18]:
df_prob_0

Unnamed: 0,prob_0
0,0.182545
1,0.640511
2,0.970972
3,0.814667
4,0.991508
...,...
4945,0.833124
4946,0.585348
4947,0.976581
4948,0.639836


In [19]:
conditions = [(df_prob_0.prob_0 >= 0.9),
              ((df_prob_0.prob_0 < 0.9) & (df_prob_0.prob_0 >= 0.8)),
              ((df_prob_0.prob_0 < 0.8) & (df_prob_0.prob_0 >= 0.7)),
              ((df_prob_0.prob_0 < 0.7) & (df_prob_0.prob_0 >= 0.6)),
              ((df_prob_0.prob_0 < 0.6) & (df_prob_0.prob_0 >= 0.5)),
              ((df_prob_0.prob_0 < 0.5) & (df_prob_0.prob_0 >= 0.4)),
              ((df_prob_0.prob_0 < 0.4) & (df_prob_0.prob_0 >= 0.3)),
              ((df_prob_0.prob_0 < 0.3) & (df_prob_0.prob_0 >= 0.2)),
              ((df_prob_0.prob_0 < 0.2) & (df_prob_0.prob_0 >= 0.1)),
              (df_prob_0.prob_0 < 0.1)]

choices = np.arange(1,11)

In [20]:
df_prob_0['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0['decile'] = df_prob_0['decile'].astype(int)

df_prob_0

Unnamed: 0,prob_0,decile
0,0.182545,9
1,0.640511,4
2,0.970972,1
3,0.814667,2
4,0.991508,1
...,...,...
4945,0.833124,2
4946,0.585348,5
4947,0.976581,1
4948,0.639836,4


Count number of values in each decile

In [21]:
traininig_distribution = df_prob_0.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
traininig_distribution = traininig_distribution.rename(columns = {'prob_0' : 'train_dist'})

In [22]:
traininig_distribution['train_dist_perc'] = np.round(100*traininig_distribution['train_dist']/traininig_distribution['train_dist'].sum(),1)

In [23]:
traininig_distribution

Unnamed: 0,decile,train_dist,train_dist_perc
0,1,1353,27.3
1,2,690,13.9
2,3,554,11.2
3,4,466,9.4
4,5,357,7.2
5,6,292,5.9
6,7,307,6.2
7,8,323,6.5
8,9,367,7.4
9,10,241,4.9


Now keeping the model same make predictions on the next 15000 of SEA dataset and compare the decile distribution

In [24]:
df2=df[15000:30000]

Manually alter the feature

In [25]:
df2['feature3'] = df2['feature3']+3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
df2.label.value_counts()

1    8677
0    6323
Name: label, dtype: int64

take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.

In [27]:

X2 = df2[['feature1', 'feature2', 'feature3']].values
y2 = df2['label'].values

Scale the new data

In [88]:
# scaler = MinMaxScaler()
# scaler.fit(X2)
# X2 = scaler.transform(X2)


In [25]:
# seed = 7
# test_size = 0.33
# X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=test_size, random_state=seed)

In [28]:
# make predictions for test data
# y2_pred = model.predict(X2_test)
y2_pred = model.predict(X2)
# predictions = [round(value) for value in y_pred]
# predictions = model.predict_proba(X2_test)
predictions = model.predict_proba(X2)

In [29]:
# evaluate predictions
# accuracy = accuracy_score(y2_test, y2_pred)
accuracy = accuracy_score(y2, y2_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 67.36%


Accuracy reduced for the changed concept

Check PSI

In [30]:
predictions

array([[0.03640733, 0.96359267],
       [0.59656556, 0.40343444],
       [0.17975353, 0.82024647],
       ...,
       [0.1184366 , 0.8815634 ],
       [0.09011267, 0.90988733],
       [0.16700118, 0.83299882]])

In [31]:
prob_0_2 = predictions[:,1]

In [32]:
df_prob_0_2 = pd.DataFrame(prob_0_2, columns = ['prob_0'])

In [33]:
df_prob_0_2

Unnamed: 0,prob_0
0,0.963593
1,0.403434
2,0.820246
3,0.890622
4,0.957887
...,...
14995,0.992813
14996,0.828882
14997,0.881563
14998,0.909887


In [34]:
conditions = [(df_prob_0_2.prob_0 >= 0.9),
              ((df_prob_0_2.prob_0 < 0.9) & (df_prob_0_2.prob_0 >= 0.8)),
              ((df_prob_0_2.prob_0 < 0.8) & (df_prob_0_2.prob_0 >= 0.7)),
              ((df_prob_0_2.prob_0 < 0.7) & (df_prob_0_2.prob_0 >= 0.6)),
              ((df_prob_0_2.prob_0 < 0.6) & (df_prob_0_2.prob_0 >= 0.5)),
              ((df_prob_0_2.prob_0 < 0.5) & (df_prob_0_2.prob_0 >= 0.4)),
              ((df_prob_0_2.prob_0 < 0.4) & (df_prob_0_2.prob_0 >= 0.3)),
              ((df_prob_0_2.prob_0 < 0.3) & (df_prob_0_2.prob_0 >= 0.2)),
              ((df_prob_0_2.prob_0 < 0.2) & (df_prob_0_2.prob_0 >= 0.1)),
              (df_prob_0_2.prob_0 < 0.1)]

choices = np.arange(1,11)

In [35]:
df_prob_0_2['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_2['decile'] = df_prob_0_2['decile'].astype(int)

df_prob_0_2

Unnamed: 0,prob_0,decile
0,0.963593,1
1,0.403434,6
2,0.820246,2
3,0.890622,2
4,0.957887,1
...,...,...
14995,0.992813,1
14996,0.828882,2
14997,0.881563,2
14998,0.909887,1


In [36]:
serving_distribution = df_prob_0_2.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

In [37]:
serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

In [38]:
serving_distribution

Unnamed: 0,decile,serv_dist,serv_dist_perc
0,1,8046,53.6
1,2,2374,15.8
2,3,1223,8.2
3,4,865,5.8
4,5,727,4.8
5,6,503,3.4
6,7,534,3.6
7,8,445,3.0
8,9,281,1.9
9,10,2,0.0


In [39]:
merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

In [40]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc
0,1,1353,27.3,8046,53.6
1,2,690,13.9,2374,15.8
2,3,554,11.2,1223,8.2
3,4,466,9.4,865,5.8
4,5,357,7.2,727,4.8
5,6,292,5.9,503,3.4
6,7,307,6.2,534,3.6
7,8,323,6.5,445,3.0
8,9,367,7.4,281,1.9
9,10,241,4.9,2,0.0


In [41]:
merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

In [42]:
merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [43]:
merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)

In [44]:
merged.PSI.sum()

inf

In [82]:
merged

Unnamed: 0,decile,train_dist,train_dist_perc,serv_dist,serv_dist_perc,s-t,ln(s/t),PSI
0,1,1361,27.5,4151,27.7,0.2,0.007246,1.4e-05
1,2,693,14.0,2113,14.1,0.1,0.007117,7e-06
2,3,549,11.1,1648,11.0,-0.1,-0.00905,9e-06
3,4,460,9.3,1366,9.1,-0.2,-0.02174,4.3e-05
4,5,357,7.2,1142,7.6,0.4,0.054067,0.000216
5,6,289,5.8,906,6.0,0.2,0.033902,6.8e-05
6,7,309,6.2,896,6.0,-0.2,-0.03279,6.6e-05
7,8,318,6.4,942,6.3,-0.1,-0.015748,1.6e-05
8,9,365,7.4,1035,6.9,-0.5,-0.069959,0.00035
9,10,249,5.0,801,5.3,0.3,0.058269,0.000175


In [44]:
df3=df[30000:45000]

##take the same number of samples we had in X test (4950), although this isn't required because we take % of samples in the range.


X3 = df3[['feature1', 'feature2', 'feature3']].values
y3 = df3['label'].values

seed = 7
test_size = 0.33
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=test_size, random_state=seed)

# make predictions for test data
y3_pred = model.predict(X3_test)
# predictions = [round(value) for value in y_pred]
predictions = model.predict_proba(X3_test)

# evaluate predictions
accuracy = accuracy_score(y3_test, y3_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Accuracy reduced for the changed concept



Accuracy: 81.94%


In [45]:
#Check PSI

predictions

prob_0_3 = predictions[:,0]

df_prob_0_3 = pd.DataFrame(prob_0_3, columns = ['prob_0'])

df_prob_0_3

conditions = [(df_prob_0_3.prob_0 >= 0.9),
              ((df_prob_0_3.prob_0 < 0.9) & (df_prob_0_3.prob_0 >= 0.8)),
              ((df_prob_0_3.prob_0 < 0.8) & (df_prob_0_3.prob_0 >= 0.7)),
              ((df_prob_0_3.prob_0 < 0.7) & (df_prob_0_3.prob_0 >= 0.6)),
              ((df_prob_0_3.prob_0 < 0.6) & (df_prob_0_3.prob_0 >= 0.5)),
              ((df_prob_0_3.prob_0 < 0.5) & (df_prob_0_3.prob_0 >= 0.4)),
              ((df_prob_0_3.prob_0 < 0.4) & (df_prob_0_3.prob_0 >= 0.3)),
              ((df_prob_0_3.prob_0 < 0.3) & (df_prob_0_3.prob_0 >= 0.2)),
              ((df_prob_0_3.prob_0 < 0.2) & (df_prob_0_3.prob_0 >= 0.1)),
              (df_prob_0_3.prob_0 < 0.1)]

choices = np.arange(1,11)

df_prob_0_3['decile'] = np.select(conditions, choices, default = 'none')

df_prob_0_3['decile'] = df_prob_0_3['decile'].astype(int)

df_prob_0_3

serving_distribution = df_prob_0_3.groupby('decile')[['prob_0']].count().reset_index().sort_values(by = 'decile')
serving_distribution = serving_distribution.rename(columns = {'prob_0' : 'serv_dist'})

serving_distribution['serv_dist_perc'] = np.round(100*serving_distribution['serv_dist']/serving_distribution['serv_dist'].sum(),1)

merged = pd.merge(traininig_distribution, serving_distribution, on='decile', how ='inner')

merged

merged['s-t'] = merged['serv_dist_perc'] - merged['train_dist_perc']

merged['ln(s/t)'] = np.log(merged['serv_dist_perc']/merged['train_dist_perc'])

merged['PSI'] = merged['ln(s/t)'] * (merged['s-t']/100)



In [46]:
merged.PSI.sum()


0.5439541765592253