In [5]:
#from audioop import mul
#from multiprocessing.context import assert_spawning
import os
#from statistics import mode
import timeit
#from bitarray import test

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
#from sympy import N


print("---Reading input file to pandas Dataframe---")
# dataset path
path = 'data'
file_name = 'churnsimulateddata.csv'
file = os.path.join(path, file_name)
print(file)
# read data
df = pd.read_csv(file)
print(f'Shape of original data: {df.shape}')

print("---Select features---")
#feature_names = ['Age','Tenure','PSYTE_Segment','Total_score','Trnx_count','num_products', 'Churn_risk']
feature_names = ['Age','PSYTE_Segment','Total_score','Churn_risk']
#feature_names = ['PSYTE_Segment','Total_score','Churn_risk']

selected_df = df[feature_names]

selected_df['Churn_risk'] = selected_df.Churn_risk.astype("category").cat.codes

print(selected_df['Churn_risk'].unique())

selected_df = selected_df.dropna()
#selected_df = selected_df.drop(selected_df[(selected_df.Churn_risk != 0) or (selected_df.Churn_risk != 1) (selected_df.Churn_risk != 2)].index)
#print(selected_df['Churn_risk'].unique())


print('---Simulating clients based on geographical locations of the banks---') 
geo_split = 'T'
if geo_split:
    selected_df_v2 = selected_df.sample(frac=1)
    clients_data = []
    for i in range(0, 60, 10):
        clients_data.append(selected_df_v2[(selected_df_v2.PSYTE_Segment >= i) & (selected_df_v2.PSYTE_Segment < i+10)]) 
    
else:
    n_clients = 10
    clients_data = np.array_split(selected_df.sample(frac=1), n_clients)
    
print(f'Number of {np.size(clients_data)} clients. ')
X_train = []
X_test = []
y_train = []
y_test = []
for i, client_data in enumerate(clients_data):
    X = client_data.drop(columns=['Churn_risk','PSYTE_Segment'])
    print(X.columns)
    y = client_data['Churn_risk']
    _X_train, _X_test, _y_train, _y_test = train_test_split(
    X, y, test_size=0.2, random_state=42) 
    
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(random_state=27, ratio=1.0)
    _X_train, _y_train = sm.fit_sample(_X_train, _y_train)

    X_train.append(_X_train)
    X_test.append(_X_test)
    y_train.append(_y_train)
    y_test.append(_y_test)


class LR_ScikitModel():
    def __init__(self):
        self.name = 'LR'

    def fit(self, X_train, X_test, y_train, y_test):

        clf = LogisticRegression(multi_class='ovr', max_iter=1000)
        starttime = timeit.default_timer()
        #Train the model using the training sets y_pred=clf.predict(X_test)
        clf.fit(X_train, y_train)
        model_params = clf.get_params() 
        training_time = timeit.default_timer() - starttime
        print("The training time is :", training_time)
        #starttime = timeit.default_timer()
        y_pred=clf.predict(X_test)
        #precison = metrics.precision_score(y_test, y_pred, average='weighted')
        #print('Precison: ', precison)
        #recall = metrics.recall_score(y_test, y_pred, average='weighted')
        #print('Recall: ', recall)
        #f1 = metrics.f1_score(self.y_test, self.y_pred, average='weighted')
        #print('F1: ', f1)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy: ', accuracy)
        print(classification_report(y_test, y_pred, zero_division=0))
        #print('Intercept')
        #print(clf.intercept_)
        #print('Coefficients')
        #print(clf.coef_)
        
        #testing_time = timeit.default_timer() - starttime
        #print("The testing time is :", testing_time)
        return clf.intercept_, clf.coef_, accuracy


print('---Training local models at local clients---')
#Training Local model
intercept_l = []
coef_l = []
accuracy_l = []
for i in range(np.size(clients_data)):
    print(f'client No {i}')
    model = LR_ScikitModel()
    intercept, coef, accuracy =  model.fit(X_train[i], X_test[i], y_train[i], y_test[i])
    intercept_l.append(intercept)
    coef_l.append(coef)
    accuracy_l.append(accuracy)
#print(intercept_l)
#print(coef_l)
#print(accuracy_l)


print('---Aggregating at the aggregation server---')
#averaged the local weights
#print(np.sum(intercept_l,axis=0))
#print(np.sum(coef_l,axis=0))   # axis1=3 becasue there is 3 classes


print('---Constructing model---')
#averaged the local weights
global_intercept = np.sum(intercept_l,axis=0)
global_coef = np.sum(coef_l,axis=0) 
#print(np.sum(intercept_l,axis=0))
#print(np.sum(coef_l,axis=0))   # axis1=3 becasue there is 3 classes


print('---Testing on local clients---')
#
#print(np.sum(intercept_l,axis=0))
#print(np.sum(coef_l,axis=0))   # axis1=3 becasue there is 3 classes



def multiclass_LogisticFunction(X, W, b):
    '''
    Logistics Regression function
    Input: 
        X: input data in form of a matrix with size (n_samples, n_features)
        W: Weight or logistics coefficient matrix with size (n_classes, n_features)
        b: bias or intercept vector with size (n_classes)  
        ref: https://github.com/bamtak/machine-learning-implemetation-python/blob/master/Multi%20Class%20Logistic%20Regression.ipynb
    '''

    def softmax(z):
        prob = np.exp(z) / np.sum(np.exp(z), axis=1).reshape(-1,1)
        return prob
            
    def predict_(X, W, b):

        assert np.shape(X)[1] == np.shape(W)[1]   
        assert np.shape(W)[0] == np.shape(b)[0]   

        pre_vals = np.dot(X, W.T) + b
        return softmax(pre_vals)
    
    probability = predict_(X, W, b)
    max_prob = np.amax(probability, axis=1, keepdims=True)
    #print(np.shape(max_prob))
    label = np.argmax(probability, axis=1)

    return label



print('---Testing global model on local testing data---')
for i in range(np.size(clients_data)):
    print(f'client No {i}')
    model = LR_ScikitModel()

    label =  multiclass_LogisticFunction(X_test[i], np.array(global_coef), np.array(global_intercept))
    print(classification_report(y_test[i], label, zero_division=0))

    


---Reading input file to pandas Dataframe---
data/churnsimulateddata.csv
Shape of original data: (706693, 19)
---Select features---
[1 2 0]
---Simulating clients based on geographical locations of the banks---
Number of 6 clients. 
Index(['Age', 'Total_score'], dtype='object')
Index(['Age', 'Total_score'], dtype='object')
Index(['Age', 'Total_score'], dtype='object')
Index(['Age', 'Total_score'], dtype='object')
Index(['Age', 'Total_score'], dtype='object')
Index(['Age', 'Total_score'], dtype='object')
---Training local models at local clients---
client No 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['Churn_risk'] = selected_df.Churn_risk.astype("category").cat.codes
  return array(a, dtype, copy=False, order=order)


The training time is : 0.4515747089999991
Accuracy:  0.7164094843808807
              precision    recall  f1-score   support

           0       0.60      0.46      0.52      4762
           1       0.79      0.93      0.86     19263
           2       0.49      0.36      0.42      7859

    accuracy                           0.72     31884
   macro avg       0.63      0.58      0.60     31884
weighted avg       0.69      0.72      0.70     31884

client No 1
The training time is : 0.5688301250000052
Accuracy:  0.725025541352072
              precision    recall  f1-score   support

           0       0.81      0.35      0.49      5610
           1       0.79      0.91      0.85     23709
           2       0.53      0.52      0.53     10812

    accuracy                           0.73     40131
   macro avg       0.71      0.59      0.62     40131
weighted avg       0.72      0.73      0.71     40131

client No 2
The training time is : 0.3663896250000107
Accuracy:  0.6952164009111618

  return array(a, dtype, copy=False, order=order)


In [8]:
pip install -U imbalanced-learn

Could not fetch URL https://pypi.org/simple/imbalanced-learn/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/imbalanced-learn/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)'))) - skipping
[31mERROR: Could not find a version that satisfies the requirement imbalanced-learn (from versions: none)[0m
[31mERROR: No matching distribution found for imbalanced-learn[0m
Could not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/pip/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)'))) - skipping
Note: you may need to restart the kernel 