# Problem 1 
## Predicting Customer Churn 

This Problem has been solved in the following journal article using SOED: 

[Optimum profit-driven churn decision making: innovative artificial neural networks in telecom industry](classifications[https://link.springer.com/article/10.1007/s00521-020-04850-6)

We will be using kagglehub to import the sample data. Make sure to install it:

`pip install kagglehub`


In [1]:
import kagglehub
import pandas as pd
import os

# Download latest version
path = kagglehub.dataset_download("royjafari/customer-churn")

print("Path to dataset files:", path)

customer_df = pd.read_csv(path+'/Customer Churn.csv')
customer_df

Path to dataset files: /Users/royjafari/.cache/kagglehub/datasets/royjafari/customer-churn/versions/1


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,FN,FP,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.640,177.8760,69.7640,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,41.4315,60.0000,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.520,1382.8680,203.6520,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.020,216.0180,74.0020,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,131.2245,64.5805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.980,649.7820,122.1980,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.210,235.0890,76.1210,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.320,252.2880,78.0320,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.640,969.8760,157.7640,0


In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from soed import SOEDClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score

In [3]:
customer_df.columns

Index(['Call  Failure', 'Complains', 'Subscription  Length', 'Charge  Amount',
       'Seconds of Use', 'Frequency of use', 'Frequency of SMS',
       'Distinct Called Numbers', 'Age Group', 'Tariff Plan', 'Status', 'Age',
       'Customer Value', 'FN', 'FP', 'Churn'],
      dtype='object')

In [4]:
X = customer_df[['Call  Failure', 'Complains', 'Subscription  Length', 'Charge  Amount',
       'Seconds of Use', 'Frequency of use', 'Frequency of SMS',
       'Distinct Called Numbers', 'Age Group', 'Tariff Plan', 'Status', 'Age',
       'Customer Value']]
y = customer_df['Churn']
c = customer_df[['FN','FP']]

c.loc[:,:] = np.where(c==0.0,1,c) 

c.loc[:,'FN'] = np.where(y==0,0,c['FN'])
c.loc[:,'FP'] = np.where(y==1,0,c['FP'])
c.join(y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c.loc[:,:] = np.where(c==0.0,1,c)


Unnamed: 0,FN,FP,Churn
0,0.000,69.7640,0
1,0.000,60.0000,0
2,0.000,203.6520,0
3,0.000,74.0020,0
4,0.000,64.5805,0
...,...,...,...
3145,0.000,122.1980,0
3146,0.000,76.1210,0
3147,0.000,78.0320,0
3148,0.000,157.7640,0


### Standardizing Data

In [5]:
X  = (X - X.mean())/X.std()

In [6]:
X.describe()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
count,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0
mean,-5.413659000000001e-17,-3.834675e-17,1.939894e-16,-6.315935e-17,-1.218073e-16,8.346058000000001e-17,-1.691768e-17,-6.541505e-17,-9.022765e-17,1.951173e-16,2.086514e-16,1.624098e-16,-2.3684760000000003e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.050118,-0.2877847,-3.44573,-0.6198636,-1.065402,-1.209835,-0.6519646,-1.365475,-2.045848,-0.2903628,-0.5745708,-1.8116,-0.9109456
25%,-0.9124506,-0.2877847,-0.2964845,-0.6198636,-0.7339868,-0.7395609,-0.5985066,-0.784665,-0.9254686,-0.2903628,-0.5745708,-0.6792377,-0.6908337
50%,-0.2241137,-0.2877847,0.2867091,-0.6198636,-0.3531424,-0.2692866,-0.4648615,-0.1457741,0.1949104,-0.2903628,-0.5745708,-0.1130565,-0.4690245
75%,0.6018905,-0.2877847,0.6366253,0.03756749,0.477807,0.4448335,0.1231769,0.6092788,0.1949104,-0.2903628,-0.5745708,-0.1130565,0.6139388
max,3.905907,3.473717,1.686374,5.954448,3.005673,3.231644,3.998885,4.268381,2.435668,3.442874,1.739877,2.717849,3.277092


### PCA Transformation

In [7]:
pca = PCA(n_components=7)  # Reduce to 7 dimensions
X_pca = pd.DataFrame(pca.fit_transform(X),columns = [f'PC{i}' for i in range(1,8)])

In [8]:
X_pca.describe()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
count,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0,3150.0
mean,-9.022765e-17,9.022765e-17,4.511382e-18,5.413659000000001e-17,6.315935e-17,9.022765e-18,-2.2556910000000002e-17
std,2.00439,1.522968,1.280813,1.103711,1.087488,0.9220609,0.7588733
min,-3.325235,-3.121093,-3.970532,-3.747675,-1.922514,-3.716034,-2.528226
25%,-1.556172,-0.9242746,-0.7731999,-0.5450013,-0.7441285,-0.6039311,-0.5565574
50%,-0.2405522,0.03744838,-0.1777119,0.2159625,-0.1681747,-0.1410306,-0.02693406
75%,1.271031,0.6473389,0.8877168,0.7506682,0.2748976,0.5051575,0.5707125
max,7.562679,5.576174,4.404271,2.305927,5.351531,3.369517,2.588053


### Split Data

In [9]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

In [10]:
X_train

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
2629,-1.433843,1.839518,1.250129,0.075247,-0.291420,-0.649771,-0.419943
577,-0.913075,1.869311,0.506416,1.027903,0.115009,-0.151462,1.360416
268,-2.428379,-0.662301,-0.964325,1.195649,0.121757,-1.251463,0.251629
3031,-2.795738,1.773628,0.933672,0.638523,0.040246,-0.579793,1.073808
1940,-2.025368,-0.538293,-1.220939,-0.195372,0.327741,-0.436260,0.056066
...,...,...,...,...,...,...,...
3147,0.405740,0.786892,0.020433,-1.716238,0.321670,-0.326112,-1.778992
2329,-1.295223,1.836308,1.227097,-0.265603,-0.345131,-0.386319,-0.355519
566,-2.234450,-0.874258,-0.699657,0.764326,-0.127754,-0.742062,0.576186
1201,-2.774218,-0.770983,-0.745294,0.916024,-0.049533,-1.114359,0.366115


### Model Trainig

In [11]:
soed = SOEDClassifier(mlp_max_iter=10000,som_x=20,som_y=20,som_sigma=20,som_input_len=X_train.shape[1])
soed.fit(X_train.values,y_train)

ValueError: operands could not be broadcast together with shapes (1575,2) (1575,) 

### Model Evaluation

In [None]:
y_proba = soed.predict_proba(X_test.values)
y_pred = soed.predict(X_test.values)

recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
auc = roc_auc_score(y_test,y_proba[:,1])

performance = {'recall':recall,'precision':precision,'accuracy':accuracy,'auc':auc}

In [None]:
print(performance)

### Model Comparison

In [None]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','KNN','MLP','SOED'] )
report_df

In [None]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.5))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index]
    y_test = y[test_index]

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=15,som_y=15,som_sigma=20,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train)
    y_proba = soed.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','SOED'] = auc

    #dt
    dt = DecisionTreeClassifier(max_depth=10)
    dt.fit(X_train.values,y_train)
    y_proba = dt.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','DT'] = auc

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    y_proba = mlp.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','MLP'] = auc

    #knn
    knn = KNeighborsClassifier()
    knn.fit(X_train.values,y_train)
    y_proba = knn.predict_proba(X_test.values)
    auc = roc_auc_score(y_test,y_proba[:,1])
    report_df.loc[f'repeat{loop_i}','KNN'] = auc
    

report_df.loc['Average'] = report_df.mean()

In [None]:
report_df

We can see that SOED is performing worse than MLP and KNN. SOED will show its superiority in the decision-making problem. 

# Problem 2
## Deciding Customer Loyalty actions

The FN and FP columns in this dataset are calculated based on the actual customers, making this a perfect example for the SEOD algorithm. Let's use it.


In [None]:
random_index = np.random.permutation(X.shape[0])

i = int(round(X.shape[0]*0.5))
train_index = random_index[:i]
test_index = random_index[i+1:]

X_train = X_pca.iloc[train_index]
X_test = X_pca.iloc[test_index]

y_train = y[train_index]
y_test = y[test_index]

c_train = c.loc[train_index,:]
c_test = c.loc[test_index,:]

In [None]:
soed = SOEDClassifier(mlp_max_iter=10000,som_x=20,som_y=20,som_sigma=20,som_input_len=X_train.shape[1])
soed.fit(X_train.values,y_train,c_train.values)

In [None]:
y_decide = soed.decide(X_test.values)

In [None]:
df = pd.DataFrame(np.column_stack((y_test,y_decide,c_test)),columns = ['Reality','Decision','FN','FP'])
df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),df.FN,0)
df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),df.FP,0)
df['cost'] = df.FP_cost + df.FN_cost
df.sort_values('cost')

In [None]:
total_cost = df.cost.sum()
print(f'total cost is {total_cost}.')

### Comparing Models

In [None]:
report_df = pd.DataFrame(index = [f'repeat{i}' for i in range(1,6)],
                         columns = ['DT','MLP','SOED'] )
report_df

In [None]:
def calc_cost(y_reality,y_decide,c):
    #print(y_reality.shape, y_decide.shape, c.shape)
    df = pd.DataFrame(np.column_stack((y_reality,y_decide,c)),columns = ['Reality','Decision','FN','FP'])
    df['FN_cost'] = np.where((df.Decision==0) & (df.Reality==1),df.FN,0)
    df['FP_cost'] = np.where((df.Decision==1) & (df.Reality==0),df.FP,0)
    df['cost'] = df.FP_cost + df.FN_cost
    return df.cost.sum()

In [None]:
def get_cost_minimizing_threshold(y_reality,y_prob,c):
    candidate_df = pd.DataFrame(index=range(99),columns = ['thresh','cost'])
    candidate_df.thresh = np.linspace(0.01,0.99,99)
    candidate_df = candidate_df.set_index('thresh')
    
    for t in candidate_df.index.tolist():
        y_decide = np.where(y_prob>t,1,0)
        candidate_df.loc[t,'cost'] = calc_cost(y_reality,y_decide,c)
    return candidate_df[candidate_df.cost == candidate_df.cost.min()].index[0]

In [None]:
for loop_i in range(1,6):
    print(loop_i)
    random_index = np.random.permutation(X.shape[0])

    i = int(round(X.shape[0]*0.7))
    train_index = random_index[:i]
    test_index = random_index[i+1:]
    
    X_train = X_pca.iloc[train_index]
    X_test = X_pca.iloc[test_index]
    
    y_train = y[train_index].values
    y_test = y[test_index].values

    c_train = c.loc[train_index,:].values
    c_test = c.loc[test_index,:].values

    #soed
    soed = SOEDClassifier(mlp_max_iter=10000,som_x=20,som_y=20,som_sigma=20,som_input_len=X_train.shape[1])
    soed.fit(X_train.values,y_train,c_train)
    y_decide = soed.decide(X_test.values)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','SOED'] = total_cost

    #dt
    dt = DecisionTreeClassifier()
    dt.fit(X_train.values,y_train)

    y_prob = dt.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob,c_train)
    print(f'dt threshod= {thresh}')
    y_prob = dt.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','DT'] = total_cost

    #mlp
    mlp = MLPClassifier(max_iter=10000)
    mlp.fit(X_train.values,y_train)
    
    y_prob = mlp.predict_proba(X_train.values)[:,1]
    thresh = get_cost_minimizing_threshold(y_train,y_prob,c_train)
    print(f'mlp threshod= {thresh}')
    y_prob = mlp.predict_proba(X_test.values)[:,1]
    y_decide = np.where(y_prob>thresh,1,0)
    total_cost= calc_cost(y_test,y_decide,c_test)
    report_df.loc[f'repeat{loop_i}','MLP'] = total_cost


report_df.loc['Average'] = report_df.mean()

In [None]:
report_df

We can see that SOED will incur significantly less cost than MLP or DT.