In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold, cross_val_score
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from random import random, seed
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from rpy2 import robjects

sns.set(rc={'figure.figsize':(11.7,8.27)})

In [6]:
# Load the full dataset used by every later cell.
# NOTE(review): "/data.csv" is an absolute path at the filesystem root — confirm this is intended
# rather than a path relative to the notebook.
data = pd.read_csv("/data.csv")

In [7]:
# Time-based split on the application date column ("basvuru_tarihi").
# The previous cutoff "2022-02-31" is not a real calendar date; it only worked because the
# comparison is a plain string comparison. "2022-03-01" expresses the same boundary.
# NOTE(review): assumes basvuru_tarihi holds ISO-formatted (YYYY-MM-DD...) strings — confirm.
split_cutoff = "2022-03-01"
data_train = data[data["basvuru_tarihi"] < split_cutoff]
# ">=" is the exact complement of the train mask, so no dated row can fall outside both sets.
data_test = data[data["basvuru_tarihi"] >= split_cutoff]

In [8]:
# Columns removed from both the train and the test frame before modelling.
# Re-running this cell without re-creating the frames raises KeyError, because the
# columns are already gone.
cols_drop = [
    "basvuru_tarihi",
    "vade_tarihi",
    "is_rounded_price",
    "com_nakdirisk_avg",
    "com_gayrinakdirisk_avg",
    "com_nakdilimit_avg",
    "com_gayrinakdilimit_avg",
    "com_avg_diff_gayrinakdi_limit_risk",
    "com_max_diff_gayrinakdi_limit_risk",
    "com_avg_diff_nakdi_limit_risk",
    "com_max_diff_nakdi_limit_risk",
    "com_max_gayrinakdi_limit",
    "com_max_gayrinakdi_risk",
    "com_max_nakdi_limit",
    "com_max_nakdi_risk",
    "quantile_ind_last_3_months_0_count",
    "quantile_ind_last_3_months_1_count",
    "quantile_ind_last_3_months_2_count",
    "quantile_ind_last_3_months_3_count",
    "quantile_ind_last_3_months_4_count",
    "quantile_ind_last_3_months_5_count",
    "quantile_ind_last_3_months_6_count",
    "quantile_ind_02_03_tk_last_6_month_perf_max",
    "quantile_ind_02_03_tk_last_3_month_perf_max",
    "quantile_ind_23_26_last_6_month_perf_max",
    "quantile_ind_23_26_last_3_month_perf_max",
    "yearly_rate",
    "individual_id",
    "ihtiyac_rate",
    "vade_ay",
    "vade_sonu_ref",
    "vade_sonu",
]

data_train = data_train.drop(columns=cols_drop)
data_test = data_test.drop(columns=cols_drop)

In [9]:
# Snapshot the train/test frames as they are at this point. Later cells add columns to
# data_train and data_test; these copies are not modified by those assignments.
data_train_copy = data_train.copy()
data_test_copy = data_test.copy()

In [None]:
def predict_with_cluster(kmeans_model,res_df,data,cols,scaler=None):
    """Score rows with the per-cluster price model of the cluster each row falls into.

    Parameters
    ----------
    kmeans_model : fitted clustering model exposing ``predict``.
    res_df : DataFrame with a ``label`` column (cluster id) and a ``model`` column holding
        a fitted classifier (exposing ``predict_proba``) for that cluster.
    data : DataFrame containing ``cols`` and ``yearly_rate_ref``. It is copied, never mutated.
    cols : columns fed to the clustering model.
    scaler : optional already-fitted scaler exposing ``transform``. Pass the scaler that was
        fitted on the training data so that new rows are mapped into the same feature space
        the clusters were learned in. When omitted, a new MinMaxScaler is fitted on ``data``
        itself (the previous behaviour), which rescales by the min/max of the scored data.

    Returns
    -------
    numpy array with the predicted probability of class 1 per row, in the row order of
    ``data``. Rows whose cluster has no entry in ``res_df`` keep the value 0.0.
    """
    data = data.copy()
    if scaler is None:
        data_clust = MinMaxScaler().fit_transform(data[cols])
    else:
        data_clust = scaler.transform(data[cols])

    data["test_labels"] = kmeans_model.predict(data_clust)
    # Float column from the start: probabilities are written into it below.
    data["preds"] = 0.0
    for c in res_df["label"].values:
        in_cluster = data["test_labels"] == c
        if not in_cluster.any():
            continue
        model = res_df.loc[res_df["label"]==c,"model"].values[0]
        # The per-cluster models take the price column as their only feature.
        price = data.loc[in_cluster,"yearly_rate_ref"].values.reshape(-1,1)
        data.loc[in_cluster,"preds"] = np.asarray(model.predict_proba(price))[:, 1]
    return data["preds"].values

## Non-segmented Logistic Regression

In [None]:
# Baseline: a single logistic regression on all features (no segmentation),
# evaluated with 5-fold stratified cross-validated ROC AUC.
# Features and target are both read from the training snapshot so they always come from
# the same frame, even after other cells have added columns to data_train.
X = data_train_copy.drop(["event_id","completed"],axis=1)
y = data_train_copy["completed"]
skf = StratifiedKFold(n_splits=5)
log_reg = LogisticRegression(C=0.1,solver="liblinear")
cv = cross_val_score(log_reg,X,y,scoring="roc_auc",cv=skf)
# cross_val_score works on clones; fit the estimator itself so later cells can use it.
log_reg.fit(X,y)
cv.mean()

In [None]:
# First test row kept as a one-row DataFrame (the 0:1 slice keeps it two-dimensional),
# with the same two columns removed as for training; the cell shows its class probabilities.
customer = data_test.iloc[0:1,].drop(["event_id","completed"],axis=1)
log_reg.predict_proba(customer)

In [None]:
# NOTE(review): give_price_reg is defined in a later cell of this notebook, so on a fresh kernel
# run top-to-bottom this call raises NameError — confirm the intended cell order.
# 0.81533405 is passed as the target probability. TODO confirm where this value comes from.
give_price_reg(log_reg,0.81533405,customer)

In [None]:
# Show the yearly_rate_ref value of the example customer row.
customer["yearly_rate_ref"]

In [None]:
# Largest yearly_rate_ref value in the training frame.
data_train["yearly_rate_ref"].max()

## LGB Loop

In [15]:
def train_loop(df, holdout, num_folds, useful_features, target, params, num_boost_round, verbose_eval, early_stopping_rounds, feval, sample_ratio):
    """Train one LightGBM booster per stratified fold and collect out-of-fold predictions.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    holdout : optional DataFrame with the same columns. When given, it is added as a second
        validation set and, after every fold, scored with the average of the predictions
        of the fold models trained so far.
    num_folds : number of StratifiedKFold splits (shuffled, random_state=2019).
    useful_features : feature column names.
    target : name of the target column.
    params : LightGBM parameter dict. ``params['metric']`` equal to 'auc' or 'rmse' selects
        the score that is printed; any other value prints no score.
    num_boost_round, verbose_eval, early_stopping_rounds, feval : forwarded to ``lgb.train``.
    sample_ratio : None, or a number. When set, the training part of each fold keeps all rows
        with target == 1 plus a random sample of int(sum(target) * sample_ratio) rows with
        target == 0 (capped at the number of such rows available).

    Returns
    -------
    (clfs, oof_predictions, feature_importance): the fitted boosters, an array of
    out-of-fold predictions aligned with ``df``, and a DataFrame of gain importances
    with columns importance / feature / fold.
    """
    # Imported locally: the 'rmse' branches below use it and the notebook's import cell
    # does not bring it into scope (they previously raised NameError).
    from sklearn.metrics import mean_squared_error

    metric = params.get('metric')
    kfold = StratifiedKFold(n_splits = num_folds, shuffle=True, random_state = 2019)
    oof_predictions = np.zeros((df.shape[0]))
    holdout_predictions = []
    feature_importance = pd.DataFrame()
    clfs = []
    print(useful_features)
    for fold, (train_index, valid_index) in enumerate(kfold.split(df[useful_features], df[target])):
        print("### Fold", fold+1, "###")
        if sample_ratio is not None:
            fold_train = df.iloc[train_index]
            positives = fold_train[fold_train[target]==1]
            negatives = fold_train[fold_train[target]==0]
            # Never ask for more negatives than exist (sample() would raise), and seed the
            # draw so repeated runs give the same training sets.
            sample_size = min(int(fold_train[target].sum()*sample_ratio), len(negatives))
            sampled = pd.concat([negatives.sample(sample_size, random_state=2019), positives], axis=0).reset_index(drop=True)
            y_train = sampled[target]
            x_train = sampled[useful_features]
        else:
            x_train = df[useful_features].iloc[train_index].copy()
            y_train = df[target].iloc[train_index]

        x_valid = df[useful_features].iloc[valid_index].copy()
        y_valid = df[target].iloc[valid_index]

        print("Train shape:",x_train.shape, "Target average:", y_train.mean(),"\n",
              "Valid shape:",x_valid.shape, "Target average:", y_valid.mean(),"\n")

        tr_data = lgb.Dataset(x_train, label=y_train)
        valid_sets = [lgb.Dataset(x_valid, label=y_valid)]
        if holdout is not None:
            valid_sets.append(lgb.Dataset(holdout[useful_features], label=holdout[target]))

        # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments were removed
        # from lgb.train in LightGBM 4.0 in favour of callbacks — confirm the installed version.
        estimator = lgb.train(params,tr_data,valid_sets = valid_sets,num_boost_round=num_boost_round,
                              verbose_eval = verbose_eval, early_stopping_rounds=early_stopping_rounds, feval=feval)

        clfs.append(estimator)
        oof_pred = estimator.predict(x_valid)
        oof_predictions[valid_index] = oof_pred

        if holdout is not None:
            holdout_predictions.append(estimator.predict(holdout[useful_features]))

        if metric == 'auc':
            oof_score = roc_auc_score(y_valid,oof_pred)
            print(f"Fold {fold+1} AUC Score", oof_score,"\n")
            if holdout is not None:
                print(f"Validation AUC: {roc_auc_score(holdout[target],np.mean(holdout_predictions,axis=0))}","\n")

        elif metric == 'rmse':
            oof_score = mean_squared_error(y_valid,oof_pred)**0.5
            print(f"Fold {fold+1} RMSE Score", oof_score,"\n")
            if holdout is not None:
                print(f"Validation RMSE: {mean_squared_error(holdout[target],np.mean(holdout_predictions,axis=0))**0.5}","\n")

        imp = pd.DataFrame(sorted(zip(estimator.feature_importance(importance_type='gain'),x_train.columns)), columns=["importance","feature"])
        imp["fold"] = fold
        feature_importance = pd.concat([feature_importance, imp], axis=0)

    if metric == 'auc':
        oof_score = roc_auc_score(df[target],oof_predictions)
        print(f"Out of Fold AUC Score", oof_score)

    elif metric == 'rmse':
        oof_score = mean_squared_error(df[target],oof_predictions)**0.5
        print(f"Out of Fold RMSE Score", oof_score)

    return clfs, oof_predictions, feature_importance

def lgb_f1_score(y_hat, data):
    """Custom evaluation function for ``lgb.train``: F1 of the rounded predictions.

    ``y_hat`` holds the model outputs and ``data`` the dataset being evaluated (its labels
    are read with ``get_label``). Returns the triple (metric name, value, True), where
    the last element marks the metric as higher-is-better.
    """
    labels = data.get_label()
    hard_predictions = np.round(y_hat)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

In [None]:
# LightGBM settings for the binary "completed" model.
# 'metric' is the string 'None'; train_loop only prints AUC/RMSE scores when it is 'auc' or
# 'rmse', so no score lines are printed for this run (presumably so that only the custom
# F1 feval is tracked — confirm).
lgb_params = {
    'objective':'binary',
    'boosting_type':'gbdt',
    'metric':'None',
    'nthread':6,
    'learning_rate':0.005,
    'tree_learner':'serial',
    'num_leaves': 2**7,
    'min_data_in_leaf': 100,
    'max_depth':8,
    'max_bin':255,
    'subsample_freq':1,
    'feature_fraction': 0.75,
    'subsample':0.75,
    'verbose':-100,
    'seed': 492,
}

# Training frame without the id and the price column; the features are every remaining
# column except the target.
lgb_frame = data_train.drop(["event_id","yearly_rate_ref"],axis=1)
lgb_features = lgb_frame.columns.drop("completed")

models, oof_predictions, feature_importance = train_loop(
    df=lgb_frame,
    holdout=None,
    num_folds=3,
    useful_features=lgb_features,
    target='completed',
    params=lgb_params,
    num_boost_round=2000,
    verbose_eval=100,
    early_stopping_rounds=200,
    feval=lgb_f1_score,
    sample_ratio=None,
)

In [17]:
# Average the gain importance of each feature over the folds, rank descending and keep
# the names (column 0, "feature") of the first 16 rows.
mean_importance = feature_importance.groupby("feature")["importance"].mean().reset_index()
ranked_importance = mean_importance.sort_values("importance",ascending=False)
feat_imp_loop = ranked_importance.iloc[0:16,0]

In [18]:
# Persist the selected feature names (written together with the Series index).
feat_imp_loop.to_csv("important_features.csv")

In [None]:
# Display the same top-16 ranking together with the mean importance values.
feature_importance.groupby("feature")["importance"].mean().reset_index().sort_values("importance",ascending=False).iloc[0:16,]

In [None]:
# Elbow curve for K-means on the min-max scaled top LightGBM features:
# within-cluster sum of squares (inertia) for k = 1..20.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy[feat_imp_loop])
cluster_counts = range(1, 21)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0,n_init=50).fit(data_clust)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# K-means segmentation (k=5) on the min-max scaled top LightGBM features of the training snapshot.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy[feat_imp_loop])
kmeans_loop = KMeans(n_clusters=5, init='k-means++', random_state=0,n_init=50)
kmeans_loop.fit(data_clust)
labels_loop = kmeans_loop.predict(data_clust)
# Adds a column to data_train; data_train_copy does not receive it.
data_train["labels_loop"] = labels_loop
# Silhouette score of this assignment is the cell output.
silhouette_score(data_clust,labels_loop)

In [None]:
# One price-only logistic regression per "labels_loop" cluster. Each row of the result stores
# the cluster label, the fitted intercept and price coefficient, the fitted model object and
# the mean 10-fold cross-validated AUC and accuracy.
res_frame_loop = pd.DataFrame({"label":[],"intercept":[],"price_coef":[],"model":[],"auc":[],"accuracy":[]})
for c in data_train["labels_loop"].unique():
    in_cluster = data_train["labels_loop"]==c
    X = data_train.loc[in_cluster,"yearly_rate_ref"]
    y = data_train.loc[in_cluster,"completed"]
    price_2d = X.values.reshape(-1,1)  # scikit-learn expects a 2-D feature matrix
    log_reg = LogisticRegression(fit_intercept=True)
    auc = cross_val_score(log_reg,price_2d,y,cv=10,scoring="roc_auc").mean()
    accuracy = cross_val_score(log_reg,price_2d,y,cv=10,scoring="accuracy").mean()
    log_reg.fit(price_2d,y)
    next_row = len(res_frame_loop.index)
    res_frame_loop.loc[next_row] = [c,log_reg.intercept_[0],log_reg.coef_[0,0],log_reg,auc,accuracy]
res_frame_loop

In [None]:
# Per-cluster summary: mean of completed, mean yearly_rate_ref and number of rows.
data_train.groupby(['labels_loop',]).agg({"completed":"mean",
                                          "yearly_rate_ref":"mean",
                                          "event_id":"count"})

In [None]:
# Test-set ROC AUC of the segmented model built on the LightGBM-feature clusters.
preds_loop_test = predict_with_cluster(kmeans_loop,res_frame_loop,data_test_copy,feat_imp_loop)
roc_auc_score(data_test["completed"],preds_loop_test)

## Unsupervised Cluster

In [None]:
# Elbow curve for the "unsupervised" segmentation.
# The clustering matrix now matches the one the final model is fitted on in the next cell:
# the target "completed" is excluded as well as the price (it was previously left in, so
# the curve was computed on different inputs than the model it is meant to tune), and
# n_init=50 is used like in the other K-means fits of this notebook.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy.drop(["completed","yearly_rate_ref"],axis=1))
wcss = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0,n_init=50)
    kmeans.fit(data_clust)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 21), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# K-means segmentation (k=6) on every column of the training snapshot except the target
# and the price, after min-max scaling.
# NOTE(review): event_id is not dropped here, so it is part of the clustering input — confirm.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy.drop(["completed","yearly_rate_ref"],axis=1))
kmeans_unsp = KMeans(n_clusters=6, init='k-means++', random_state=0,n_init=50)
kmeans_unsp.fit(data_clust)
labels_unsp = kmeans_unsp.predict(data_clust)
# Adds a column to data_train; data_train_copy does not receive it.
data_train["labels_unsp"] = labels_unsp
# Silhouette score of this assignment is the cell output.
silhouette_score(data_clust,labels_unsp)

In [None]:
# One unpenalised, price-only logistic regression per "labels_unsp" cluster, stored with its
# coefficients and mean 10-fold cross-validated AUC / accuracy.
# NOTE(review): newer scikit-learn releases expect penalty=None instead of the string "none" —
# confirm against the installed version.
res_frame_unsp = pd.DataFrame({"label":[],"intercept":[],"price_coef":[],"model":[],"auc":[],"accuracy":[]})
for c in data_train["labels_unsp"].unique():
    in_cluster = data_train["labels_unsp"]==c
    X = data_train.loc[in_cluster,"yearly_rate_ref"]
    y = data_train.loc[in_cluster,"completed"]
    price_2d = X.values.reshape(-1,1)  # scikit-learn expects a 2-D feature matrix
    log_reg = LogisticRegression(fit_intercept=True,penalty="none")
    log_reg.fit(price_2d,y)
    cv_auc = cross_val_score(log_reg,price_2d,y,cv=10,scoring="roc_auc").mean()
    cv_accuracy = cross_val_score(log_reg,price_2d,y,cv=10,scoring="accuracy").mean()
    next_row = len(res_frame_unsp.index)
    res_frame_unsp.loc[next_row] = [c,log_reg.intercept_[0],log_reg.coef_[0,0],log_reg,cv_auc,cv_accuracy]
res_frame_unsp

In [None]:
# Per-cluster summary: mean of completed, mean yearly_rate_ref and number of rows.
data_train.groupby(['labels_unsp',]).agg({"completed":"mean",
                                          "yearly_rate_ref":"mean",
                                          "event_id":"count"})

In [None]:
# Test-set ROC AUC of the segmented model built on the unsupervised clusters; the clustering
# columns are the same ones the K-means model was fitted on.
cols_unsp = data_train_copy.drop(["completed","yearly_rate_ref"],axis=1).columns.values
preds_unsp_test = predict_with_cluster(kmeans_unsp,res_frame_unsp,data_test_copy,cols_unsp)
roc_auc_score(data_test["completed"],preds_unsp_test)

## MOB Features Clusters

In [29]:
# The clustering columns for this section are the column names (header) of features.csv.
mob_cols = list(pd.read_csv("features.csv").columns)

In [None]:
# Elbow curve for K-means on the min-max scaled MOB feature columns:
# within-cluster sum of squares (inertia) for k = 1..20.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy[mob_cols])
cluster_counts = range(1, 21)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0,n_init=50).fit(data_clust)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# K-means segmentation (k=6) on the min-max scaled MOB feature columns of the training snapshot.
scaler = MinMaxScaler()
data_clust = scaler.fit_transform(data_train_copy[mob_cols])
kmeans_mob = KMeans(n_clusters=6, init='k-means++', random_state=0,n_init=50)
kmeans_mob.fit(data_clust)
labels_mob_cluster = kmeans_mob.predict(data_clust)
# Adds a column to data_train; data_train_copy does not receive it.
data_train["labels_mob_cluster"] = labels_mob_cluster
# Silhouette score of this assignment is the cell output.
silhouette_score(data_clust,labels_mob_cluster)

In [None]:
# One price-only logistic regression per "labels_mob_cluster" cluster, stored with its
# coefficients and mean 10-fold cross-validated AUC / accuracy.
res_frame_mob_cluster = pd.DataFrame({"label":[],"intercept":[],"price_coef":[],"model":[],"auc":[],"accuracy":[]})
for c in data_train["labels_mob_cluster"].unique():
    in_cluster = data_train["labels_mob_cluster"]==c
    X = data_train.loc[in_cluster,"yearly_rate_ref"]
    y = data_train.loc[in_cluster,"completed"]
    price_2d = X.values.reshape(-1,1)  # scikit-learn expects a 2-D feature matrix
    log_reg = LogisticRegression(fit_intercept=True)
    log_reg.fit(price_2d,y)
    cv_auc = cross_val_score(log_reg,price_2d,y,cv=10,scoring="roc_auc").mean()
    cv_accuracy = cross_val_score(log_reg,price_2d,y,cv=10,scoring="accuracy").mean()
    next_row = len(res_frame_mob_cluster.index)
    res_frame_mob_cluster.loc[next_row] = [c,log_reg.intercept_[0],log_reg.coef_[0,0],log_reg,cv_auc,cv_accuracy]
res_frame_mob_cluster

In [None]:
# Per-cluster summary: mean of completed, mean yearly_rate_ref and number of rows.
data_train.groupby(['labels_mob_cluster',]).agg({"completed":"mean",
                                          "yearly_rate_ref":"mean",
                                          "event_id":"count"})

In [None]:
# Test-set ROC AUC of the segmented model built on the MOB-feature clusters.
# The per-cluster models must be the ones fitted on the kmeans_mob labels
# (res_frame_mob_cluster); the models in res_frame_loop belong to a different clustering,
# whose label ids do not refer to the same segments.
preds_mob_test = predict_with_cluster(kmeans_mob,res_frame_mob_cluster,data_test_copy,mob_cols)
roc_auc_score(data_test["completed"],preds_mob_test)

# New Simulations

### MODEL BASED TREE

In [116]:
def give_nodes_MOB(customers_all,train_data):
    """Fit a model-based tree in R on ``train_data`` and score ``customers_all`` with it.

    Both frames are handed to R through CSV files in the working directory. The R script
    fits ``glmtree(completed ~ yearly_rate_ref | .-yearly_rate_ref)`` with a binomial logit
    link (minsize 500, maxdepth 8) and writes four CSV files, which are read back here.

    Returns a list of four DataFrames, in this order:
      [0] node_table   - terminal node predicted for each row of ``customers_all``
      [1] result_coefs - intercept / price_coef per node, with the node id in ``node``
      [2] prob_table   - predicted response probability per row
      [3] prob_table02 - predicted response probability per row with yearly_rate_ref set to -0.2
    """
    # to_csv writes the index as an unnamed first column; the R side removes it again ("V1").
    train_data.to_csv("train_data_tmp.csv")
    customers_all.to_csv("customers_all.csv")

    robjects.r('''
    library(data.table)
    library(partykit)
    data_train = fread("train_data_tmp.csv",encoding='UTF-8')
    data_train = data_train[,-c("V1")]
    model = glmtree(completed ~ yearly_rate_ref | .-yearly_rate_ref, 
                    data = data_train,family=binomial(link='logit'),
                    minsize = 500, maxdepth = 8)
    result_coefs = coef(model)
    colnames(result_coefs) = c("intercept","price_coef")
    nodes = as.numeric(rownames(result_coefs))
    result_coefs = data.table(result_coefs)
    result_coefs$node = nodes
    fwrite(result_coefs,file="result_coefs.csv")
    
    customers_all = fread("customers_all.csv")
    customers_all = customers_all[,-c("V1")]
    customer_nodes = predict(model,customers_all,type="node")
    node_table = data.table("nodes"=customer_nodes)
    fwrite(node_table,file="node_table.csv")
    
    customer_probs = predict(model,customers_all,type="response")
    prob_table = data.table("probs"=customer_probs)
    fwrite(prob_table,file="prob_table.csv")
    
    customers02 = copy(customers_all)
    customers02$yearly_rate_ref = -0.2
    customer_probs02 = predict(model,customers02,type="response")
    prob_table02 = data.table("probs"=customer_probs02)
    fwrite(prob_table02,file="prob_table02.csv")
    ''')

    # Read the R outputs back in the order callers index them.
    output_files = ("node_table.csv","result_coefs.csv","prob_table.csv","prob_table02.csv")
    return [pd.read_csv(path) for path in output_files]

In [82]:
def give_price_mbt(result_coefs,prob,node):
    """Invert a node's logistic price model to find the price giving acceptance probability ``prob``.

    Parameters
    ----------
    result_coefs : DataFrame with columns ``node``, ``intercept`` and ``price_coef``.
    prob : target probability, strictly between 0 and 1.
    node : node id to look up in ``result_coefs``.

    Returns
    -------
    The price solving ``logit(prob) = intercept + price_coef * price``, or the string
    ``"tarfin"`` when the price coefficient is not negative. A positive coefficient was
    already reported this way; a coefficient of exactly zero is now reported the same way
    instead of dividing by zero and returning inf/nan.

    Raises IndexError when ``node`` has no row in ``result_coefs``.
    """
    node_rows = result_coefs.loc[result_coefs["node"]==node]
    intercept = node_rows["intercept"].values[0]
    price_coef = node_rows["price_coef"].values[0]
    # Check the sign before inverting: with a zero coefficient no price can be solved for.
    if price_coef >= 0:
        return "tarfin"
    return (np.log(prob/(1-prob))-intercept)/price_coef

In [166]:
# Simulation settings: number of replications, and the share of the reference amount
# (multiplied into `target` in a later cell) that the simulations try to collect.
numb_of_rep = 5
target_percent = 1 # 1 means 100%

In [167]:
# Rescale farmer_paid_amount linearly so that the smallest amount becomes 1 and the range
# (max - min) is preserved.
min_scale = 1
max_scale = data_test['farmer_paid_amount'].max() - data_test['farmer_paid_amount'].min() + 1

# feature_range is passed as a tuple: recent scikit-learn releases validate parameter types
# and reject a list here.
scale = MinMaxScaler(feature_range=(min_scale,max_scale))
# fit_transform returns an (n, 1) array; flatten it before storing it as a single column.
data_test["paid_amount"] = scale.fit_transform(data_test[["farmer_paid_amount"]]).ravel()
# Collection target: total scaled amount of the completed test rows, times target_percent.
target = sum(data_test.loc[data_test['completed']== 1, 'paid_amount'])*target_percent
n_d = len(data_test)
# Average scaled amount per test row.
lambda_d = sum(data_test['paid_amount'])/n_d

In [168]:
# Fit the model-based tree on the pristine training snapshot. By this point data_train has
# gained the cluster-label columns (labels_loop, labels_unsp, labels_mob_cluster), which the
# test frame does not contain, and the R formula uses every column other than the price
# as a candidate partitioning variable.
parameters_from_R = give_nodes_MOB(data_test.drop("event_id", axis=1), data_train_copy.drop("event_id", axis=1))
node_table, coef_table, prob_table, prob_table02 = parameters_from_R

In [169]:
# Empty result table for the model-based-tree simulation; the simulation cell adds one row
# per offered customer.
mbt_result_columns = [
    "replication",
    "iteration",
    "event_id",
    "paid_amount",
    "prob",
    "offered_rate_ref",
    "offered_rate",
    "is_accepted",
    "cum_collected",
    "collected/target",
    "per_profit",
]
results_mbt_simulation = pd.DataFrame({column: [] for column in mbt_result_columns})

In [None]:
# Sequential pricing simulation with the model-based tree.
# For each replication the test customers are visited in order; each gets the price whose
# modelled acceptance probability matches the pace needed to reach `target`, and acceptance
# is drawn at random with that probability.
seed(492)

# Per-customer inputs are identical in every replication, so they are looked up once.
# The first row of `data` per event_id is used, like the previous per-row lookup did.
mbt_first_by_event = data.drop_duplicates('event_id').set_index('event_id')
mbt_event_ids = data_test['event_id'].values
mbt_paid_amounts = data_test['paid_amount'].values
mbt_ihtiyac_rates = data_test['event_id'].map(mbt_first_by_event['ihtiyac_rate']).values
mbt_reference_rates = data_test['event_id'].map(mbt_first_by_event['yearly_rate_ref']).values
mbt_nodes = node_table.iloc[:, 0].values
mbt_model_probs = prob_table.iloc[:, 0].values
mbt_floor_probs = prob_table02.iloc[:, 0].values

mbt_rows = []
for rep in range(numb_of_rep):
    collected = 0
    flag = 0
    for i in range(n_d):
        # Amount still needed, spread over the customers left.
        # NOTE(review): with a zero-based i the number of customers left is n_d - i; the "+ 1"
        # looks like a leftover from one-based indexing — confirm before changing.
        lambda_r = (target-collected)/(n_d-i+1)
        # Target acceptance probability, kept inside [0.05, 0.95].
        prob = min(max(lambda_r/lambda_d, 0.05), 0.95)

        # Looked up for every customer: the logged offered_rate needs it whether or not the
        # offer is accepted (previously it was only set on acceptance, so rejected rows
        # raised NameError or reused the previous customer's value).
        ihtiyac_rate = mbt_ihtiyac_rates[i]
        paid_amount = mbt_paid_amounts[i]

        offeredRate = give_price_mbt(coef_table, prob, mbt_nodes[i])
        if isinstance(offeredRate, str) and offeredRate == "tarfin":
            # No usable price model for this node: offer the reference rate and use the
            # tree's own predicted probability for it.
            offeredRate = mbt_reference_rates[i]
            prob = mbt_model_probs[i]

        if offeredRate < -0.2:
            # Floor the offer at -0.2 and use the probability predicted at that floor.
            offeredRate = -0.2
            prob = mbt_floor_probs[i]

        offered_rate = (offeredRate*ihtiyac_rate + ihtiyac_rate)/100
        if random() <= prob:
            collected = collected + paid_amount
            is_accepted = 1
            profit = offered_rate*paid_amount
        else:
            is_accepted = 0
            profit = 0

        mbt_rows.append({"replication" : rep+1,
                         "iteration" : i,
                         "event_id" : mbt_event_ids[i],
                         "paid_amount" : paid_amount,
                         "prob" : prob,
                         "offered_rate_ref" : offeredRate,
                         "offered_rate" : offered_rate,
                         "is_accepted" : is_accepted,
                         "cum_collected" : collected,
                         "collected/target" : collected/target,
                         "per_profit" : profit})
        if collected >= target:
            print("early stopping for rep", rep+1, "at the customer indice of", i)
            break
        if flag == 0 and collected/target > 0.95:
            print("95% of the target has been reached for rep", rep+1, "at the customer indice of", i)
            flag = 1

# Build the frame once instead of appending row by row (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every row).
results_mbt_simulation = pd.concat([results_mbt_simulation, pd.DataFrame(mbt_rows)], ignore_index=True)

### LOGISTIC REGRESSION

In [None]:
# Refit the non-segmented logistic regression used by the simulations below (cells in between
# reuse the names X, y and log_reg for per-cluster models).
# Features and target are both read from the training snapshot so they always come from
# the same frame.
X = data_train_copy.drop(["event_id","completed"],axis=1)
y = data_train_copy["completed"]
skf = StratifiedKFold(n_splits=5)
log_reg = LogisticRegression(C=0.1,solver="liblinear")
cv = cross_val_score(log_reg,X,y,scoring="roc_auc",cv=skf)
# cross_val_score works on clones; fit the estimator itself so later cells can use it.
log_reg.fit(X,y)
cv.mean()

In [171]:
def give_price_reg(log_reg,prob,customer=None):
    """Invert a fitted logistic regression to find the price giving acceptance probability ``prob``.

    Parameters
    ----------
    log_reg : fitted binary model exposing ``coef_`` (shape (1, n_features)) and ``intercept_``.
    prob : target probability, strictly between 0 and 1.
    customer : one-row DataFrame whose columns are in the same order as the features the
        model was fitted on and include ``yearly_rate_ref``. Coefficients are matched to
        columns by position. May be omitted for a model whose only feature is the price.

    Returns
    -------
    The ``yearly_rate_ref`` value solving ``logit(prob) = b0 + price_coef * price``, where
    ``b0`` is the intercept plus the contribution of every non-price feature of ``customer``.

    Raises ValueError when ``customer`` is omitted but the model has more than one feature.
    """
    coefs = np.asarray(log_reg.coef_)[0]
    intercept = log_reg.intercept_[0]
    if customer is None:
        if coefs.shape[0] != 1:
            raise ValueError("customer is required for a model with more than one feature")
        price_coef = coefs[0]
        b0 = intercept
    else:
        is_price = np.asarray(customer.columns == "yearly_rate_ref")
        price_coef = coefs[is_price][0]
        other_values = customer.iloc[0,:].drop(["yearly_rate_ref"])
        # Everything except the price is fixed for this customer and folds into the constant term.
        b0 = (other_values*coefs[~is_price]).sum() + intercept
    return (np.log(prob/(1-prob))-b0)/price_coef

In [172]:
# Empty result table for the logistic-regression simulation; the simulation cell adds one row
# per offered customer.
logreg_result_columns = [
    "replication",
    "iteration",
    "event_id",
    "paid_amount",
    "prob",
    "offered_rate_ref",
    "offered_rate",
    "is_accepted",
    "cum_collected",
    "collected/target",
    "per_profit",
]
results_logreg_simulation = pd.DataFrame({column: [] for column in logreg_result_columns})

In [None]:
# Sequential pricing simulation with the non-segmented logistic regression.
# Same procedure as the model-based-tree simulation; only the pricing model differs.
seed(492)

# Per-customer inputs are identical in every replication, so they are looked up once.
# The first row of `data` per event_id is used, like the previous per-row lookup did.
logreg_first_by_event = data.drop_duplicates('event_id').set_index('event_id')
logreg_event_ids = data_test['event_id'].values
logreg_paid_amounts = data_test['paid_amount'].values
logreg_ihtiyac_rates = data_test['event_id'].map(logreg_first_by_event['ihtiyac_rate']).values
logreg_features = data_test.drop(["event_id","completed", "paid_amount"],axis=1)

logreg_rows = []
for rep in range(numb_of_rep):
    collected = 0
    flag = 0
    for i in range(n_d):
        # Amount still needed, spread over the customers left.
        # NOTE(review): with a zero-based i the number of customers left is n_d - i; the "+ 1"
        # looks like a leftover from one-based indexing — confirm before changing.
        lambda_r = (target-collected)/(n_d-i+1)
        # Target acceptance probability, kept inside [0.05, 0.95].
        prob = min(max(lambda_r/lambda_d, 0.05), 0.95)

        # Looked up for every customer: the logged offered_rate needs it whether or not the
        # offer is accepted (previously it was only set on acceptance).
        ihtiyac_rate = logreg_ihtiyac_rates[i]
        paid_amount = logreg_paid_amounts[i]

        customer_i = logreg_features.iloc[i:(i+1)]
        # This call is the model-specific step: each simulation uses its own pricing function.
        offeredRate = give_price_reg(log_reg, prob, customer_i)

        if offeredRate < -0.2:
            # Floor the offer at -0.2 and take the acceptance probability at that floor from
            # the logistic regression itself (prob_table02 holds the tree model's
            # probabilities and does not belong to this model).
            offeredRate = -0.2
            floor_customer = customer_i.copy()
            floor_customer["yearly_rate_ref"] = -0.2
            prob = log_reg.predict_proba(floor_customer)[0, 1]

        offered_rate = (offeredRate*ihtiyac_rate + ihtiyac_rate)/100
        if random() <= prob:
            collected = collected + paid_amount
            is_accepted = 1
            profit = offered_rate*paid_amount
        else:
            is_accepted = 0
            profit = 0

        logreg_rows.append({"replication" : rep+1,
                            "iteration" : i,
                            "event_id" : logreg_event_ids[i],
                            "paid_amount" : paid_amount,
                            "prob" : prob,
                            "offered_rate_ref" : offeredRate,
                            "offered_rate" : offered_rate,
                            "is_accepted" : is_accepted,
                            "cum_collected" : collected,
                            "collected/target" : collected/target,
                            "per_profit" : profit})
        if collected >= target:
            print("early stopping for rep", rep+1, "at the customer indice of", i)
            break
        if flag == 0 and collected/target > 0.95:
            print("95% of the target has been reached for rep", rep+1, "at the customer indice of", i)
            flag = 1

# Build the frame once instead of appending row by row (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every row).
results_logreg_simulation = pd.concat([results_logreg_simulation, pd.DataFrame(logreg_rows)], ignore_index=True)

In [None]:
# Sum, over the completed test rows, of (yearly_rate / 100) * paid_amount. yearly_rate is read
# from the raw `data` frame (it was dropped from data_test); the two Series are multiplied
# by index alignment.
sum(data[data["event_id"].isin(data_test[data_test["completed"]==1]["event_id"])]["yearly_rate"]/100*data_test[data_test["completed"]==1]["paid_amount"])

In [None]:
# Total simulated profit of replication 1 (model-based tree).
results_mbt_simulation[results_mbt_simulation["replication"]==1]['per_profit'].sum()

In [None]:
# Total simulated profit of replication 1 (logistic regression).
results_logreg_simulation[results_logreg_simulation["replication"]==1]['per_profit'].sum()

In [None]:
# Summary statistics of the logistic-regression simulation log.
results_logreg_simulation.describe()

In [None]:
# Per-replication summary of the model-based-tree simulation, in this order: percentage of the
# target collected, acceptance rate, lowest offered reference rate, mean offered rate, total profit.
print(results_mbt_simulation.groupby('replication').cum_collected.max()/target*100)
print(results_mbt_simulation.groupby('replication').is_accepted.mean())
print(results_mbt_simulation.groupby('replication').offered_rate_ref.min())
print(results_mbt_simulation.groupby('replication').offered_rate.mean())
print(results_mbt_simulation.groupby('replication').per_profit.sum())

In [None]:
# Per-replication summary of the logistic-regression simulation, in this order: percentage of the
# target collected, acceptance rate, lowest offered reference rate, mean offered rate, total profit.
print(results_logreg_simulation.groupby('replication').cum_collected.max()/target*100)
print(results_logreg_simulation.groupby('replication').is_accepted.mean())
print(results_logreg_simulation.groupby('replication').offered_rate_ref.min())
print(results_logreg_simulation.groupby('replication').offered_rate.mean())
print(results_logreg_simulation.groupby('replication').per_profit.sum())

In [None]:
# Rows logged with iteration == 1 (the second customer, since iterations start at 0), one per replication.
results_logreg_simulation[results_logreg_simulation["iteration"]==1]

In [None]:
# Column means per iteration, averaged over the replications that reached that iteration.
results_mbt_simulation.groupby(by="iteration").mean()

In [None]:
# Sequential pricing simulation with one price-only logistic regression per customer segment.
# NOTE(review): model_0 ... model_4 and the "labels" column of data_test_copy are not created in
# any visible cell of this notebook — confirm which cell defines them.
# NOTE(review): label 5 is mapped to model_4, exactly as in the previous copy-pasted branches —
# confirm whether a dedicated model for segment 5 was intended.
segment_models = {0: model_0, 1: model_1, 2: model_2, 3: model_3, 4: model_4, 5: model_4}

results_2_columns = ["replication", "iteration", "event_id", "paid_amount", "prob",
                     "offered_rate_ref", "offered_rate", "is_accepted", "cum_collected",
                     "collected/target", "per_profit"]

# Seeded like the other simulations so the run is reproducible.
seed(492)

# Per-customer inputs are identical in every replication, so they are looked up once.
# The first row of `data` per event_id is used, like the previous per-row lookup did.
seg_first_by_event = data.drop_duplicates('event_id').set_index('event_id')
seg_event_ids = data_test['event_id'].values
seg_paid_amounts = data_test['paid_amount'].values
seg_ihtiyac_rates = data_test['event_id'].map(seg_first_by_event['ihtiyac_rate']).values
seg_labels = data_test_copy["labels"].values
seg_reference_rates = data_test_copy["yearly_rate_ref"].values

seg_rows = []
for rep in range(numb_of_rep):
    collected = 0
    flag = 0
    for i in range(n_d):
        # Amount still needed, spread over the customers left.
        # NOTE(review): with a zero-based i the number of customers left is n_d - i; the "+ 1"
        # looks like a leftover from one-based indexing — confirm before changing.
        lambda_r = (target-collected)/(n_d-i+1)
        # Target acceptance probability, kept inside [0.05, 0.95].
        prob = min(max(lambda_r/lambda_d, 0.05), 0.95)

        # Looked up for every customer: the logged offered_rate needs it whether or not the
        # offer is accepted (previously it was only set on acceptance).
        ihtiyac_rate = seg_ihtiyac_rates[i]
        paid_amount = seg_paid_amounts[i]

        # An unknown label raises KeyError instead of silently reusing the previous offer.
        model = segment_models[seg_labels[i]]
        price_coef = model.coef_[0][0]
        if price_coef > 0:
            # Positive price coefficient: offer the reference rate.
            offeredRate = seg_reference_rates[i]
        else:
            # Invert the single-feature model directly (the previous two-argument call to
            # give_price_reg did not match that function's three-argument signature).
            offeredRate = (np.log(prob/(1-prob)) - model.intercept_[0])/price_coef
            if offeredRate < -0.2:
                # Floor the offer at -0.2 and use the segment model's probability at the floor.
                offeredRate = -0.2
                prob = model.predict_proba(np.array(-0.2).reshape(-1,1))[:,1][0]

        offered_rate = (offeredRate*ihtiyac_rate + ihtiyac_rate)/100
        if random() <= prob:
            collected = collected + paid_amount
            is_accepted = 1
            profit = offered_rate*paid_amount
        else:
            is_accepted = 0
            profit = 0

        seg_rows.append({"replication" : rep+1,
                         "iteration" : i,
                         "event_id" : seg_event_ids[i],
                         "paid_amount" : paid_amount,
                         "prob" : prob,
                         "offered_rate_ref" : offeredRate,
                         "offered_rate" : offered_rate,
                         "is_accepted" : is_accepted,
                         "cum_collected" : collected,
                         "collected/target" : collected/target,
                         "per_profit" : profit})
        if collected >= target:
            print("early stopping for rep", rep+1, "at the customer indice of", i)
            break
        if flag == 0 and collected/target > 0.95:
            print("95% of the target has been reached for rep", rep+1, "at the customer indice of", i)
            flag = 1

# Build the frame once instead of appending row by row (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every row).
results_2 = pd.DataFrame(seg_rows, columns=results_2_columns)

## Model Validation Logistic Regression

In [96]:
# Validation of the logistic regression: every test customer is offered the reference rate
# and accepts with the model's predicted probability; collected amount and profit are
# accumulated per replication.
seed(492)
logreg_validation_columns = ["replication", "iteration", "event_id", "paid_amount", "prob",
                             "offered_rate", "is_accepted", "cum_collected", "cum_profit"]

# None of these depend on the replication, so they are computed once for all test rows.
# The first row of `data` per event_id is used, like the previous per-row lookup did.
val_first_by_event = data.drop_duplicates('event_id').set_index('event_id')
val_event_ids = data_test['event_id'].values
val_paid_amounts = data_test['paid_amount'].values
val_ihtiyac_rates = data_test['event_id'].map(val_first_by_event['ihtiyac_rate']).values
val_reference_rates = data_test['yearly_rate_ref'].values
val_probs = log_reg.predict_proba(data_test.drop(columns=["event_id","completed","paid_amount"]))[:,1]

logreg_validation_rows = []
# NOTE: replications are numbered from 0 here, unlike the simulation tables (rep+1).
for rep in range(numb_of_rep):
    collected = 0
    profit = 0
    for i in range(n_d):
        prob = val_probs[i]
        offeredRate = val_reference_rates[i]
        interest = (offeredRate*val_ihtiyac_rates[i] + val_ihtiyac_rates[i])/100
        if random() <= prob:
            collected = collected + val_paid_amounts[i]
            profit = profit + val_paid_amounts[i]*interest
            is_accepted = 1
        else:
            is_accepted = 0

        logreg_validation_rows.append({"replication" : rep,
                                       "iteration" : i,
                                       "event_id" : val_event_ids[i],
                                       "paid_amount" : val_paid_amounts[i],
                                       "prob" : prob,
                                       "offered_rate" : offeredRate,
                                       "is_accepted" : is_accepted,
                                       "cum_collected" : collected,
                                       "cum_profit" : profit})

# Build the frame once instead of appending row by row (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every row).
results_logreg_validation = pd.DataFrame(logreg_validation_rows, columns=logreg_validation_columns)

In [None]:
# Display the logistic-regression validation log.
results_logreg_validation

In [None]:
# Summary of the logistic-regression validation: percentage of the target collected and
# acceptance rate per replication, overall mean offered rate, and final profit per replication.
print(results_logreg_validation.groupby('replication').cum_collected.max()/target*100)
print(results_logreg_validation.groupby('replication').is_accepted.mean())
print(results_logreg_validation['offered_rate'].mean())
print(results_logreg_validation.groupby('replication').cum_profit.max())

## Model Validation Mob

In [79]:
# Validation of the model-based tree: every test customer is offered the reference rate and
# accepts with the probability the tree predicted (prob_table); collected amount and profit
# are accumulated per replication.
seed(492)
mob_validation_columns = ["replication", "iteration", "event_id", "paid_amount", "prob",
                          "offered_rate", "is_accepted", "cum_collected", "cum_profit"]

# None of these depend on the replication, so they are looked up once for all test rows.
# The first row of `data` per event_id is used, like the previous per-row lookup did.
mobval_first_by_event = data.drop_duplicates('event_id').set_index('event_id')
mobval_event_ids = data_test['event_id'].values
mobval_paid_amounts = data_test['paid_amount'].values
mobval_ihtiyac_rates = data_test['event_id'].map(mobval_first_by_event['ihtiyac_rate']).values
mobval_reference_rates = data_test['yearly_rate_ref'].values
mobval_probs = prob_table.iloc[:, 0].values

mob_validation_rows = []
# NOTE: replications are numbered from 0 here, unlike the simulation tables (rep+1).
for rep in range(numb_of_rep):
    collected = 0
    profit = 0
    for i in range(n_d):
        prob = mobval_probs[i]
        offeredRate = mobval_reference_rates[i]
        interest = (offeredRate*mobval_ihtiyac_rates[i] + mobval_ihtiyac_rates[i])/100
        if random() <= prob:
            collected = collected + mobval_paid_amounts[i]
            profit = profit + mobval_paid_amounts[i]*interest
            is_accepted = 1
        else:
            is_accepted = 0

        mob_validation_rows.append({"replication" : rep,
                                    "iteration" : i,
                                    "event_id" : mobval_event_ids[i],
                                    "paid_amount" : mobval_paid_amounts[i],
                                    "prob" : prob,
                                    "offered_rate" : offeredRate,
                                    "is_accepted" : is_accepted,
                                    "cum_collected" : collected,
                                    "cum_profit" : profit})

# Build the frame once instead of appending row by row (DataFrame.append no longer exists in
# pandas 2.x and copied the whole frame on every row).
results_mob_validation = pd.DataFrame(mob_validation_rows, columns=mob_validation_columns)

In [None]:
# Summary of the model-based-tree validation: percentage of the target collected and
# acceptance rate per replication, overall mean offered rate, and final profit per replication.
print(results_mob_validation.groupby('replication').cum_collected.max()/target*100)
print(results_mob_validation.groupby('replication').is_accepted.mean())
print(results_mob_validation['offered_rate'].mean())
print(results_mob_validation.groupby('replication').cum_profit.max())

In [None]:
# Display the model-based-tree validation log.
results_mob_validation