# Problem 1 
## Predicting Customer Churn 

This Problem has been solved in the following journal article using SOED: 

[Optimum profit-driven churn decision making: innovative artificial neural networks in telecom industry](classifications[https://link.springer.com/article/10.1007/s00521-020-04850-6)

We will be using kagglehub to import the sample data. Make sure to install it:

`pip install kagglehub`


In [6]:
import kagglehub
import pandas as pd
import os

# Download latest version
path = kagglehub.dataset_download("royjafari/customer-churn")

print("Path to dataset files:", path)

customer_df = pd.read_csv(path+'/Customer Churn.csv')
customer_df

Path to dataset files: /Users/royjafari/.cache/kagglehub/datasets/royjafari/customer-churn/versions/1


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,FN,FP,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.640,177.8760,69.7640,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,41.4315,60.0000,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.520,1382.8680,203.6520,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.020,216.0180,74.0020,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,131.2245,64.5805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.980,649.7820,122.1980,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.210,235.0890,76.1210,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.320,252.2880,78.0320,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.640,969.8760,157.7640,0


In [7]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from soed import SOEDClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

In [8]:
customer_df.columns

Index(['Call  Failure', 'Complains', 'Subscription  Length', 'Charge  Amount',
       'Seconds of Use', 'Frequency of use', 'Frequency of SMS',
       'Distinct Called Numbers', 'Age Group', 'Tariff Plan', 'Status', 'Age',
       'Customer Value', 'FN', 'FP', 'Churn'],
      dtype='object')

In [9]:
customer_df['Customer Value'] = customer_df['Customer Value'].replace({0:1}).sort_values()

In [10]:
X = customer_df[['Call  Failure', 'Complains', 'Subscription  Length', 'Charge  Amount',
       'Seconds of Use', 'Frequency of use', 'Frequency of SMS',
       'Distinct Called Numbers', 'Age Group', 'Tariff Plan', 'Status', 'Age',
       'Customer Value']]
y = customer_df['Churn']
c = customer_df[['FN','FP']]

c.loc[:,:] = np.where(c==0.0,1,c) 

c.loc[:,'FN'] = np.where(y==0,0,X['Customer Value'])
c.loc[:,'FP'] = np.where(y==1,0,2.0)
c.join(y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c.loc[:,:] = np.where(c==0.0,1,c)


Unnamed: 0,FN,FP,Churn
0,0.00,2.0,0
1,0.00,2.0,0
2,0.00,2.0,0
3,0.00,2.0,0
4,0.00,2.0,0
...,...,...,...
3145,0.00,2.0,0
3146,0.00,2.0,0
3147,0.00,2.0,0
3148,0.00,2.0,0


### Standardizing Data

In [11]:
X  = (X - X.mean())/X.std()

In [12]:
X.describe()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
count,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0
mean,-5.413659000000001e-17,-3.834675e-17,1.939894e-16,-6.315935e-17,-1.218073e-16,8.346058000000001e-17,-1.691768e-17,-6.541505e-17,-9.022765e-17,1.951173e-16,2.086514e-16,1.624098e-16,1.8609450000000002e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.050118,-0.2877847,-3.44573,-0.6198636,-1.065402,-1.209835,-0.6519646,-1.365475,-2.045848,-0.2903628,-0.5745708,-1.8116,-0.9091595
25%,-0.9124506,-0.2877847,-0.2964845,-0.6198636,-0.7339868,-0.7395609,-0.5985066,-0.784665,-0.9254686,-0.2903628,-0.5745708,-0.6792377,-0.6909657
50%,-0.2241137,-0.2877847,0.2867091,-0.6198636,-0.3531424,-0.2692866,-0.4648615,-0.1457741,0.1949104,-0.2903628,-0.5745708,-0.1130565,-0.4691402
75%,0.6018905,-0.2877847,0.6366253,0.03756749,0.477807,0.4448335,0.1231769,0.6092788,0.1949104,-0.2903628,-0.5745708,-0.1130565,0.613903
max,3.905907,3.473717,1.686374,5.954448,3.005673,3.231644,3.998885,4.268381,2.435668,3.442874,1.739877,2.717849,3.277253


### Split Data

In [13]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [14]:
X_train

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
1793,-1.050118,-0.287785,-1.112956,-0.619864,-0.415078,-0.269287,-0.651965,-0.320017,0.194910,-0.290363,-0.574571,-0.113056,-0.695688
39,-0.499448,-0.287785,-0.763039,-0.619864,-0.691883,-0.530550,-0.384674,2.584032,1.315289,-0.290363,-0.574571,1.585487,-0.688308
770,-0.361781,-0.287785,-0.063207,-0.619864,1.665720,1.367964,-0.625236,1.306251,0.194910,-0.290363,-0.574571,-0.113056,0.010649
1172,1.014893,-0.287785,-0.413123,-0.619864,-0.924856,-0.547968,-0.616326,-0.610422,-0.925469,-0.290363,1.739877,-0.679238,-0.821612
2174,1.840897,-0.287785,0.869903,1.352430,0.407474,0.653844,-0.313397,0.260793,0.194910,-0.290363,-0.574571,-0.113056,-0.130402
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3010,-0.361781,-0.287785,0.753264,-0.619864,0.247871,0.357746,-0.651965,-0.552341,0.194910,-0.290363,-0.574571,-0.113056,-0.477574
2145,1.840897,-0.287785,-1.579510,0.694999,0.530155,1.350547,0.167725,1.190089,-0.925469,3.442874,-0.574571,-0.679238,0.485534
2239,-0.224114,-0.287785,-0.763039,0.037567,-0.633520,-0.478297,-0.366855,2.584032,1.315289,-0.290363,-0.574571,1.585487,-0.666644
1206,-1.050118,-0.287785,0.869903,-0.619864,-0.556815,-0.600220,1.914021,-0.203855,0.194910,-0.290363,-0.574571,-0.113056,1.485143


### Model Trainig

In [15]:
soed = SOEDClassifier(mlp_max_iter=10000,som_x=20,som_y=20,som_sigma=20)
soed.fit(X_train.values,y_train)

version 1.0.7
Model training complete.


<soed.SOEDClassifier at 0x1079ac290>

### Model Evaluation

In [16]:
y_proba = soed.predict_proba(X_test.values)
y_pred = soed.predict(X_test.values)

recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_proba[:,1])

performance = {'recall':recall,'precision':precision,'accuracy':accuracy,'auc':auc}

In [17]:
print(performance)

{'recall': 0.5578512396694215, 'precision': 0.9375, 'accuracy': 0.9263024142312579, 'auc': np.float64(0.9567263544536272)}


### Model Comparison

In [18]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','KNN','MLP','SOED'] )
report_df

Unnamed: 0,DT,KNN,MLP,SOED
repeat1,,,,
repeat2,,,,
repeat3,,,,
repeat4,,,,
repeat5,,,,


In [33]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.5))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=15,som_y=15,som_sigma=20)
    soed.fit(X_train.values,y_train)
    y_proba = soed.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','SOED'] = auc

    #dt
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train.values,y_train)
    y_proba = dt.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','DT'] = auc

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    y_proba = mlp.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','MLP'] = auc

    #knn
    knn = KNeighborsClassifier()
    knn.fit(X_train.values,y_train)
    y_proba = knn.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','KNN'] = auc
    

report_df.loc['Average'] = report_df.mean()

1
version 1.0.7
Model training complete.
2
version 1.0.7
Model training complete.
3
version 1.0.7
Model training complete.
4
version 1.0.7
Model training complete.
5
version 1.0.7
Model training complete.


In [34]:
report_df

Unnamed: 0,DT,MLP,SOED,KNN
repeat1,0.916234,0.985414,0.956414,0.949541
repeat2,0.901782,0.983465,0.951065,0.95788
repeat3,0.876708,0.984569,0.931556,0.949321
repeat4,0.900472,0.980169,0.958522,0.960968
repeat5,0.908884,0.984519,0.960512,0.965695
Average,544.876847,104.115856,392.690678,0.956681


We can see that SOED is performing worse than MLP and KNN. SOED will show its superiority in the decision-making problem. 

# Problem 2
## Deciding Customer Loyalty actions

The FN and FP columns in this dataset are calculated based on the actual customers, making this a perfect example for the SEOD algorithm. Let's use it.


In [35]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

c_train = c.loc[train_index,:]
c_test = c.loc[test_index,:]

In [36]:
soed = SOEDClassifier(mlp_max_iter=10000,som_x=15,som_y=15,som_sigma=20)
soed.fit(X_train.values,y_train,c_train.values)

version 1.0.7
Model training complete.


<soed.SOEDClassifier at 0x11d665d90>

In [37]:
y_decide = soed.decide(X_test.values)

In [38]:
df = pd.DataFrame(np.column_stack((y_test,y_decide,c_test)),columns = ['Reality','Decision','FN','FP'])
df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),df.FN,0)
df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),df.FP,0)
df['cost'] = df.FP_cost + df.FN_cost
df.sort_values('cost')

Unnamed: 0,Reality,Decision,FN,FP,FN_cost,FP_cost,cost
0,0.0,0.0,0.000,2.0,0.000,0.0,0.000
1020,0.0,0.0,0.000,2.0,0.000,0.0,0.000
1019,0.0,0.0,0.000,2.0,0.000,0.0,0.000
1018,0.0,0.0,0.000,2.0,0.000,0.0,0.000
1017,0.0,0.0,0.000,2.0,0.000,0.0,0.000
...,...,...,...,...,...,...,...
467,1.0,0.0,220.280,0.0,220.280,0.0,220.280
843,1.0,0.0,288.630,0.0,288.630,0.0,288.630
562,1.0,0.0,306.765,0.0,306.765,0.0,306.765
831,1.0,0.0,431.910,0.0,431.910,0.0,431.910


In [39]:
total_cost = df.cost.sum()
print(f'total cost is {total_cost}.')

total cost is 3379.045.


### Comparing Models

In [40]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,,,
repeat2,,,
repeat3,,,
repeat4,,,
repeat5,,,


In [41]:
def calc_cost(y_reality,y_decide,c):
    #print(y_reality.shape, y_decide.shape, c.shape)
    df = pd.DataFrame(np.column_stack((y_reality,y_decide,c)),columns = ['Reality','Decision','FN','FP'])
    df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),df.FN,0)
    df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),df.FP,0)
    df['cost'] = df.FP_cost + df.FN_cost
    return df.cost.sum()

In [42]:
def get_cost_minimizing_threshold(y_reality,y_prob,c):
    candidate_df = pd.DataFrame(index=range(99),columns = ['thresh','cost'])
    candidate_df.thresh = np.linspace(0.01,0.99,99)
    candidate_df = candidate_df.set_index('thresh')
    
    for t in candidate_df.index.tolist():
        y_decide = np.where(y_prob>t,1,0)
        candidate_df.loc[t,'cost'] = calc_cost(y_reality,y_decide,c)
    return candidate_df[candidate_df.cost == candidate_df.cost.min()].index[0]

In [43]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    
    y_train = y[train_index].values
    y_test = y[test_index].values

    c_train = c.loc[train_index,:].values
    c_test = c.loc[test_index,:].values

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=15,som_y=15,som_sigma=20)
    soed.fit(X_train.values,y_train,c_train)
    y_decide = soed.decide(X_test.values)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','SOED'] = total_cost

    #dt
    dt = DecisionTreeClassifier()
    dt.fit(X_train.values,y_train)

    y_prob = dt.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob,c_train)
    print(f'dt threshod= {thresh}')
    y_prob = dt.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','DT'] = total_cost

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    
    y_prob = mlp.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob,c_train)
    print(f'mlp threshod= {thresh}')
    y_prob = mlp.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','MLP'] = total_cost


report_df.loc['Average'] = report_df.mean()

1
version 1.0.7
Model training complete.
dt threshod= 0.01
mlp threshod= 0.03
2
version 1.0.7
Model training complete.
dt threshod= 0.25
mlp threshod= 0.09
3
version 1.0.7
Model training complete.
dt threshod= 0.25
mlp threshod= 0.09999999999999999
4
version 1.0.7
Model training complete.
dt threshod= 0.2
mlp threshod= 0.02
5
version 1.0.7
Model training complete.
dt threshod= 0.25
mlp threshod= 0.060000000000000005


In [44]:
report_df

Unnamed: 0,DT,MLP,SOED
repeat1,2388.45,511.15,1546.12
repeat2,2154.87,846.73,1327.07
repeat3,4970.805,1463.68,2514.53
repeat4,2762.805,376.0,1304.86
repeat5,3992.45,1177.255,3155.91
Average,3253.876,874.963,1969.698


MLP performs better than SOED in this example. 