# Creating Pipeline

In [1]:
# getting device info
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 16491937646458356270, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 2626237046526420509
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 11362407259280330709
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15695549568
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 5103077543275652948
 physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"]

In [2]:
# loading library
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.metrics import log_loss

import tensorflow as tf
from tensorflow.keras.models import load_model

In [None]:
# loading raw train dataset
train_data = pd.read_csv('train.csv')

In [None]:
def preprocessing(data):
    '''
    This dataframe takes pandas dataframe as input, perform preprocessing(like nan removal, outlier removal) etc.
    and then return pandas dataframe as output
    
    data : pandas dataframe
    return : pandas dataframe and numpy array
    '''
    
    # taking all registration_init_time feature and storing them to registration_time
    # but before going to do that, just impute median date in place of nan
    data['registration_init_time'] = data['registration_init_time'].fillna(20150204.0)
    # getting registration_init_time array and store them to registration_time list
    registration_time = data['registration_init_time'].values
    
    # imputing 0 in place of nan values in the city column
    data['city'] = data['city'].fillna(0)

    # removing outliers
    data['bd'] = data['bd'].apply(lambda x: x if (x < 72.0) and (x > 0.0) else np.nan)
    # imputing 28 as age instead of nan
    data['bd'] = data['bd'].fillna(28.0)

    # replacing male with 1 in gender
    data['gender'] = data['gender'].replace(to_replace='male', value=1)
    # replacing male with 2 in gender
    data['gender'] = data['gender'].replace(to_replace='female', value=2)
    # replacing nan with 0 in gender
    data['gender'] = data['gender'].fillna(0)

    # replace 0 instead of nan in registered_via
    data['registered_via'] = data['registered_via'].fillna(0)

    # filling median date in place of nan in the df
    data['registration_init_time'] = data['registration_init_time'].fillna(20150204.0)
    # converting float date to datetime
    data['registration_init_time'] = pd.to_datetime(data['registration_init_time'], format='%Y%m%d')

    # imputing 0 in place of nan value in payment_method_id
    data['payment_method_id'] = data['payment_method_id'].fillna(0)

    # removing outliers
    data['payment_plan_days'] = data['payment_plan_days'].apply(lambda x: x if (x <= 30.0) else np.nan)
    # imputing 30 in place of nan in payment_plan_days
    data['payment_plan_days'] = data['payment_plan_days'].fillna(30.0)

    # removing outliers
    data['plan_list_price'] = data['plan_list_price'].apply(lambda x: x if (x <= 180.0) else np.nan)
    # imputing 149 in place of nan in plan_list_price
    data['plan_list_price'] = data['plan_list_price'].fillna(149.0)

    # removing outliers
    data['actual_amount_paid'] = data['actual_amount_paid'].apply(lambda x: x if (x <= 180.0) else np.nan)
    # imputing 149 in place of nan in actual_amount_paid
    data['actual_amount_paid'] = data['actual_amount_paid'].fillna(149.0)

    # imputing 2 in place of nan values in is_auto_renew
    data['is_auto_renew'] = data['is_auto_renew'].fillna(2)

    # filling median date in place of nan in the df
    data['transaction_date'] = data['transaction_date'].fillna(20170316.0)
    # converting float date to datetime
    data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%Y%m%d')

    # filling median date in place of nan in the df
    data['membership_expire_date'] = data['membership_expire_date'].fillna(20170419.0)
    # converting float date to datetime
    data['membership_expire_date'] = pd.to_datetime(data['membership_expire_date'], format='%Y%m%d')

    # imputing 2 in place of nan values in is_cancel
    data['is_cancel'] = data['is_cancel'].fillna(2)

    # filling median date in place of nan in the df
    data['date'] = data['date'].fillna(20170316.0)
    # converting float date to datetime
    data['date'] = pd.to_datetime(data['date'], format='%Y%m%d')

    # removing outliers
    data['num_25'] = data['num_25'].apply(lambda x: x if (x <= 15.0) else np.nan)
    # now I can impute 2 instead of nan in num_25
    data['num_25'] = data['num_25'].fillna(2.0)

    # removing outliers
    data['num_50'] = data['num_50'].apply(lambda x: x if (x <= 4.0) else np.nan)
    # now I can impute 0 instead of nan in num_50
    data['num_50'] = data['num_50'].fillna(0)

    # removing outliers
    data['num_75'] = data['num_75'].apply(lambda x: x if (x <= 3.0) else np.nan)
    # now I can impute 0 instead of nan in num_75
    data['num_75'] = data['num_75'].fillna(0)

    # removing outliers
    data['num_985'] = data['num_985'].apply(lambda x: x if (x <= 3.0) else np.nan)
    # now I can impute 0 instead of nan in num_985
    data['num_985'] = data['num_985'].fillna(0)

    # removing outliers
    data['num_100'] = data['num_100'].apply(lambda x: x if (x <= 74.0) else np.nan)
    # now I can impute 14 instead of nan in num_100
    data['num_100'] = data['num_100'].fillna(14.0)

    # removing outliers
    data['num_unq'] = data['num_unq'].apply(lambda x: x if (x <= 68.0) else np.nan)
    # now I can impute 16 instead of nan in num_unq
    data['num_unq'] = data['num_unq'].fillna(16.0)

    # removing outliers
    data['total_secs'] = data['total_secs'].apply(lambda x: x if (x <= 19167.549700000025) else np.nan)
    # now I can impute 3880.765 instead of nan in total_secs
    data['total_secs'] = data['total_secs'].fillna(3880.765)
    
    return data, registration_time

In [None]:
def featurization(data):
    '''
    This function takes pandas dataframe as input, create features and then return pandas dataframe as output
    
    data : input pandas dataframe
    return : pandas dataframe
    '''
    
    
    # Feature - 1 (getting weekday from date)
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data['day_of_the_week'] = data['date'].dt.day_name().values
    
    # Feature - 2 (checking for the weekend)
    data['is_weekend'] = data['day_of_the_week'].apply(lambda x: 1 if (x == 'Saturday') or (x == 'Sunday') else 0)
    
    # Feature - 3 (checking for the weekday)
    data['is_weekday'] = data['is_weekend'].apply(lambda x: 1 if (x == 0) else 0)
    
    # Feature - 4 to 12 (sum based features)
    # Feature - 13 to 19 (mean based features)
    # Feature - 20 to 27 (standard deviation based features)
    # Feature - 28 (nunique based feature)
    # Feature - 29 and 30 (min and max based features)
    # Feature - 31 to 33 (mean based features for transaction)
    # Feature - 34 (transaction count)
    # Feature - 35 (transaction date max)
    # Feature - 36 (membership expiry date max)
    # Feature - 37 (membership expiry date count)
    def std(x):
        '''
        Finding standard deviation using numpy, to avoid getting nan values
        
        x : a numpy array
        return : standard deviation using numpy
        '''
        return np.std(x)

    
    # grouping them together for train data
    temp_df = data.groupby('msno').agg(num_25_sum=('num_25', 'sum'),
                                    num_50_sum=('num_50', 'sum'),
                                    num_75_sum=('num_75', 'sum'),
                                    num_985_sum=('num_985', 'sum'),
                                    num_100_sum=('num_100', 'sum'),
                                    num_unq_sum=('num_unq', 'sum'),
                                    total_secs_sum=('total_secs', 'sum'),
                                    is_weekend_sum=('is_weekend', 'sum'),
                                    is_weekday_sum=('is_weekday', 'sum'),
                                    num_25_mean=('num_25', 'mean'),
                                    num_50_mean=('num_50', 'mean'),
                                    num_75_mean=('num_75', 'mean'),
                                    num_985_mean=('num_985', 'mean'),
                                    num_100_mean=('num_100', 'mean'),
                                    num_unq_mean=('num_unq', 'mean'),
                                    total_secs_mean=('total_secs', 'mean'),
                                    num_25_std=('num_25', std),
                                    num_50_std=('num_50', std),
                                    num_75_std=('num_75', std),
                                    num_985_std=('num_985', std),
                                    num_100_std=('num_100', std),
                                    num_unq_std=('num_unq', std),
                                    total_secs_std=('total_secs', std),
                                    active_days=('date', 'nunique'),
                                    date_min=('date', 'min'),
                                    date_max=('date', 'max'),
                                    payment_plan_days_mean=('payment_plan_days', 'mean'),
                                    plan_list_price_mean=('plan_list_price', 'mean'),
                                    actual_amount_paid_mean=('actual_amount_paid', 'mean'),
                                    transaction_date_count=('transaction_date', 'nunique'),
                                    transaction_date_max=('transaction_date', 'max'),
                                    membership_expire_date_max=('membership_expire_date', 'max'),
                                    membership_expire_count=('membership_expire_date', 'nunique'))
    # merging them with the train dataset
    data = pd.merge(data, temp_df, on='msno', how='left')
    
    # Feature - 38 (activity period)
    data['date_min'] = pd.to_datetime(data['date_min'], errors='coerce')
    data['date_max'] = pd.to_datetime(data['date_max'], errors='coerce')
    
    data['activity_period'] = (data['date_max'] - data['date_min']).dt.days + 1
    
    # Feature - 39 (inactive days)
    data['inactive_days'] = data['date'].nunique() - data['active_days']
    
    # Feature - 40 (rare behaviour)
    # 0 (for not rare user) and 1 (for rare user)
    data['is_rare'] = data['active_days'].apply(lambda x: 0 if (x > 1) else 1)
    
    # Feature - 41 (average time per day)
    data['avg_time_perday'] = data['total_secs_sum'] / data['active_days']

    # Feature - 42 (unique tracks played per day)
    data['unq_track_perday'] = round(data['num_unq_sum'] / data['active_days'])

    # Feature - 43 (tracks played till 25% length per day)
    data['till_25_perday'] = round(data['num_25_sum'] / data['active_days'])


    # Feature - 44 (tracks played till 50% length per day)
    data['till_50_perday'] = round(data['num_50_sum'] / data['active_days'])


    # Feature - 45 (tracks played till 75% length per day)
    data['till_75_perday'] = round(data['num_75_sum'] / data['active_days'])


    # Feature - 46 (tracks played till 98.5% length per day)
    data['till_985_perday'] = round(data['num_985_sum'] / data['active_days'])


    # Feature - 47 (tracks played till 100% length per day)
    data['till_full_perday'] = round(data['num_100_sum'] / data['active_days'])
    
    # Feature - 48 (discount)
    data['discount'] = data['plan_list_price'] - data['actual_amount_paid']    
    # since there can be several values which are less than 0 for various reasons
    # so it's always good to set a lower limit, here I can set 0
    data['discount'] = data['discount'].clip(lower=0)
    
    # Feature - 49 (is_discount)
    data['is_discount'] = data['discount'].apply(lambda x: 1 if (x > 0) else 0)
    
    # Feature - 50 (days since final login)
    # Threshold that I set here is the last date, which is 31-03-2017
    data['days_since_last_login'] = (pd.to_datetime(data['date'].unique().max(), errors='coerce') - data['date_max']).dt.days
    
    # Feature - 51 (days left)
    data['membership_expire_date_max'] = pd.to_datetime(data['membership_expire_date_max'], errors='coerce')
    data['days_left'] = (data['membership_expire_date_max'] - pd.to_datetime(data['date'].unique().max(), errors='coerce')).dt.days
    
    # Since there are many negative values so I can set them to 0
    data['days_left'] = data['days_left'].clip(lower=0)
    
    # Feature - 52 (Loyality range)
    data['transaction_date_max'] = pd.to_datetime(data['transaction_date_max'], errors='coerce')
    data['registration_init_time'] = pd.to_datetime(data['registration_init_time'], errors='coerce')
    data['layality_range'] = (data['transaction_date_max'] - data['registration_init_time']).dt.days
    
    # Feature - 53 (price per day)
    data['Perday_price'] = data['actual_amount_paid'] / data['payment_plan_days']
    
    # Feature - 54 (days since final transaction)
    data['days_since_final_transaction'] = (pd.to_datetime(data['date'].unique().max(), errors='coerce') - data['transaction_date_max']).dt.days
    
    return data

In [None]:
def prepare_data(data, registration_date):
    '''
    This function is used to prepare final dataset before modeling, which typically removes non-useful features
    adding registration date features and removing duplicate rows
    
    data : input pandas dataframe
    registration_date : numpy array of registration date
    return : pandas dataframe
    '''
    
    # removing all non_useful features first
    data = data.drop(['registration_init_time',
                                  'transaction_date', 
                                  'membership_expire_date', 
                                  'date', 
                                  'num_25',
                                  'num_50',
                                  'num_75',
                                  'num_985',
                                  'num_100',
                                  'num_unq',
                                  'total_secs',
                                  'is_weekend',
                                  'is_weekday',
                                  'day_of_the_week', 
                                  'date_min', 
                                  'date_max', 
                                  'transaction_date_max', 
                                  'membership_expire_date_max'], axis=1, inplace=False)
    
    # creating new feature, and adding it to train dataset
    data['registration_init_time'] = registration_date
    
    # removing all duplicate rows for a particular id
    data = data.drop_duplicates('msno', keep='first', inplace=False)
    
    # replacing all nan and infinity to 0
    data = data.replace(np.nan, 0, inplace=False)
    data = data.replace(np.inf, 0, inplace=False)
    
    return data

In [None]:
def normalize(data):
    '''
    This function is used to normalize the already prepared dataframe
    
    data : input pandas dataframe
    return : normalized pandas dataframe
    '''
    
    result1 = data.copy()
    for feature_name in data.columns:
        if (str(feature_name) != str('msno') and str(feature_name)!=str('is_churn')):
            max_value = data[feature_name].max()
            min_value = data[feature_name].min()
            result1[feature_name] = (data[feature_name] - min_value) / (max_value - min_value)
    return result1

In [None]:
def main_workload(dataset):
    '''
    This function creates a final dataframe after all of the preprocessing, featurization, prparation and normalization.
    
    dataset : input pandas dataframe
    return : normlized pandas dataframe
    '''
    
    # preprocessing
    preprocessed_data, registration_date = preprocessing(dataset)
    print('Preprocessing done.')

    # Featurization
    featured_data = featurization(preprocessed_data)
    print('Featurization done.')

    # Dataset preparation
    prepared_data = prepare_data(featured_data, registration_date)
    print('Data prepared.')

    # Normalization
    normalized_dataframe = normalize(prepared_data)
    print('Normalization done.')

    return normalized_dataframe

In [None]:
# function calling
final_dataset = main_workload(train_data)
final_dataset.head()

Preprocessing done.
Featurization done.
Data prepared.
Normalization done.


Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,...,till_985_perday,till_full_perday,discount,is_discount,days_since_last_login,days_left,layality_range,Perday_price,days_since_final_transaction,registration_init_time
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,0.227273,0.385714,0.5,0.230769,0.0,1.0,0.827778,0.827778,...,0.0,0.006611,0.0,0.0,0.066667,0.008155,0.253862,0.0,0.018315,0.698681
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,0.590909,0.271429,0.5,0.230769,0.878049,1.0,1.0,1.0,...,0.0,0.001202,0.0,0.0,0.366667,0.004721,0.252818,0.0,0.02442,0.698681
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,0.590909,0.242857,0.5,0.230769,0.414634,1.0,0.0,0.0,...,0.013889,0.00601,0.0,0.0,0.0,0.032618,0.25261,0.0,0.020757,0.698712
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.827778,0.827778,...,0.0,0.042067,0.0,0.0,0.5,0.121459,0.153445,0.0,0.58486,0.766983
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,0.590909,0.485714,1.0,0.538462,1.0,1.0,0.55,0.55,...,0.013889,0.078726,0.0,0.0,0.0,0.073391,0.246973,0.0,0.018315,0.767106


In [None]:
# saving csv to disk
final_dataset.to_csv('normalized_train.csv', index=False)

## Pipeline Functions

In [3]:
# loading train file
train_data = pd.read_csv('drive/My Drive/KKBox/normalized_train.csv')

### 1. To predict probabilities

In [4]:
def final_fun_1(data):
    '''
    This function is used to take single or multiple observations, and predict probabilities for them
    
    data : single or multiple observations from a pandas dataframe
    return : probabilities of belonging for each of the observation
    '''
    
    
    # first removing id and class label from those observations(single or multiple)
    data = data.drop(['msno', 'is_churn'], axis=1, inplace=False)
    
    # converting this mini dataset to numpy array
    data = data.values

    # loading best model from disk
    model = load_model('drive/My Drive/KKBox/best_model_2.h5')
    
    # getting predictions for data
    pred = model.predict(data)
    # converting 2d array to 1d using flatten
    pred = pred.flatten()
    
    return pred

### 2. To predict log-loss

In [5]:
def final_fun_2(data, labels):
    '''
    This function is used to take single or multiple observations and class labels, and predict log-loss for them
    
    data : single or multiple observations from a pandas dataframe
    labels : numpy array of ground truth values
    return : probabilities of belonging for each of the observation
    '''
    
    if not data.shape[0] == 1:

      # first removing id and class label from those observations(single or multiple)
      data = data.drop(['msno', 'is_churn'], axis=1, inplace=False)
    
      # converting this mini dataset to numpy array
      data = data.values

      # loading best model from disk
      model = load_model('drive/My Drive/KKBox/best_model_2.h5')
    
      # getting predictions for data
      pred = model.predict(data)
      # converting 2d array to 1d using flatten
      pred = pred.flatten()
      # multiplying 0.6 with probabilities
      pred = pred * 0.6
    
      return log_loss(labels, pred)

    else:

      arr = []
      pred_arr = []
    
      # first removing id and class label from those observations(single or multiple)
      data = data.drop(['msno', 'is_churn'], axis=1, inplace=False)
      
      # converting this mini dataset to numpy array
      data = data.values

      # loading best model from disk
      model = load_model('drive/My Drive/KKBox/best_model_2.h5')
      
      # getting predictions for data
      pred = model.predict(data)
      # converting 2d array to 1d using flatten
      pred = pred.flatten()
      # multiplying 0.6 with probabilities
      pred = pred * 0.6

      arr.append(1- pred[0])
      arr.append(pred[0])

      pred_arr.append(arr)

      y_true = []
      y_true.append(labels[0])
    
      return log_loss(y_true, pred_arr, labels=[0,1])

## Pipeline testing

### 1. For multiple points

In [10]:
# taking 10 points from dataset randomly
sampled_train = train_data.sample(10)
sampled_train

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,total_secs_sum,is_weekend_sum,is_weekday_sum,num_25_mean,num_50_mean,num_75_mean,num_985_mean,num_100_mean,num_unq_mean,total_secs_mean,num_25_std,num_50_std,num_75_std,num_985_std,num_100_std,num_unq_std,total_secs_std,active_days,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,transaction_date_count,membership_expire_count,activity_period,inactive_days,is_rare,avg_time_perday,unq_track_perday,till_25_perday,till_50_perday,till_75_perday,till_985_perday,till_full_perday,discount,is_discount,days_since_last_login,days_left,layality_range,Perday_price,days_since_final_transaction,registration_init_time
115030,bbJnbmcd0CjvWwzabG3mq+9j82LjZMMdN+G7PifNZpc=,0,0.181818,0.285714,1.0,0.230769,0.97561,1.0,0.827778,0.827778,0.5,0.0,0.00496,0.006607,0.003779,0.004135,0.003151,0.004904,0.003474,0.006731,0.005491,0.258333,0.2605,0.1805,0.208333,0.164378,0.319701,0.176149,0.518267,0.586,0.429333,0.464,0.197622,0.541493,0.188677,0.766667,1.0,0.827778,0.827778,0.0,0.0,0.9,0.233333,0.0,0.003771,0.005947,0.004902,0.006329,0.006993,0.006944,0.003606,0.0,0.0,0.033333,0.003863,0.291858,0.0,0.025641,0.694031
23633,7SIEeNY8vqefQXD63hckbp+yw9DbgWxy/+gBxEKOj+8=,1,0.227273,0.471429,1.0,0.692308,0.0,1.0,0.827778,0.827778,1.0,1.0,0.000107,0.0,0.0,0.0,0.000151,0.000137,0.000167,0.0,0.000323,0.133333,0.0,0.0,0.0,0.189189,0.223881,0.202927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.827778,0.827778,0.0,0.0,0.0,1.0,1.0,0.004344,0.004248,0.002451,0.0,0.0,0.0,0.004207,0.0,0.0,0.5,0.008155,0.064092,0.0,0.018315,0.924634
865141,kg8MqVtlkp5No622fE8JGafYNO7scp8xXfDo4XYfbvo=,0,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.55,0.55,0.5,0.0,0.000107,0.0,0.0,0.0,0.000151,0.000137,0.000167,0.0,0.000323,0.133333,0.0,0.0,0.0,0.189189,0.223881,0.202927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.55,0.55,0.0,0.0,0.0,1.0,1.0,0.004344,0.004248,0.002451,0.0,0.0,0.0,0.004207,0.0,0.0,0.5,0.004721,0.477035,0.0,0.025641,0.536419
198718,iRztE2JWfGhkeY3Ya8px8a/KKYMThXfKmUkJaCnTTDQ=,0,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.55,0.55,0.5,0.0,0.00048,0.001057,0.000581,0.000827,0.000183,0.000475,0.000266,0.001923,0.000646,0.15,0.25,0.166667,0.25,0.057432,0.182836,0.081013,0.197267,0.6125,0.577333,0.552667,0.124324,0.308358,0.157989,0.1,1.0,0.55,0.55,0.0,0.0,0.833333,0.9,0.0,0.001734,0.003398,0.002451,0.006329,0.0,0.006944,0.001202,0.0,0.0,0.033333,0.012876,0.290397,0.0,0.0,0.694784
856068,g1+rceVOy5cb6eK1oKR11sfkRcYDhMb9AvrFFi2d5F8=,0,0.181818,0.4,1.0,0.692308,0.97561,1.0,0.827778,0.827778,0.5,0.0,0.007253,0.0037,0.002907,0.003308,0.009475,0.007424,0.009166,0.005769,0.00969,0.2518,0.09725,0.0926,0.1111,0.329595,0.322537,0.309831,0.4344,0.339,0.372,0.314267,0.565405,0.569552,0.504974,0.566667,1.0,0.827778,0.827778,0.020833,0.004831,0.933333,0.433333,0.0,0.013265,0.012461,0.009804,0.006329,0.006993,0.006944,0.014724,0.0,0.0,0.033333,0.017597,0.530898,0.0,0.017094,0.462582
509546,Gbm5d67OcD4jKlT6/+XnrwsrBh5aSbPcQ2E6VTqJtKQ=,0,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.55,0.55,0.5,0.0,0.00112,0.000264,0.000581,0.000827,0.000453,0.00052,0.000496,0.001923,0.001938,0.175,0.03125,0.083333,0.125,0.070946,0.093284,0.075417,0.382667,0.1654,0.288667,0.322733,0.080811,0.109433,0.083545,0.233333,1.0,0.55,0.55,0.0,0.0,0.933333,0.766667,0.0,0.001614,0.001699,0.003676,0.0,0.0,0.0,0.001502,0.0,0.0,0.0,0.008155,0.128184,0.0,0.014652,0.849314
217235,HyoPCvRnSvAaTvDUz6SRRUmBzBOz5WXyJ3UA8kl8WRA=,0,0.181818,0.3,0.5,0.538462,1.0,1.0,0.827778,0.827778,0.5,0.0,0.000107,0.0,0.0,0.0,0.000151,0.000137,0.000167,0.0,0.000323,0.133333,0.0,0.0,0.0,0.189189,0.223881,0.202927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.827778,0.827778,0.0,0.0,0.0,1.0,1.0,0.004344,0.004248,0.002451,0.0,0.0,0.0,0.004207,0.0,0.0,0.5,0.008584,0.141127,0.0,0.013431,0.847776
324545,TGNK68ikup3Z2QQVX7NJ5eQf9V3MHDcV0HIr6KBAeGU=,0,0.0,0.385714,0.0,0.0,1.0,1.0,0.55,0.55,0.5,0.0,0.000107,0.0,0.0,0.0,0.000151,0.000137,0.000167,0.0,0.000323,0.133333,0.0,0.0,0.0,0.189189,0.223881,0.202927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.55,0.55,0.0,0.0,0.0,1.0,1.0,0.004344,0.004248,0.002451,0.0,0.0,0.0,0.004207,0.0,0.0,0.5,0.008584,0.16952,0.0,0.013431,0.844579
282074,6L+0wuryBoeWL9lnCKpHI6SSgBeFzHUhVjqdjQQtgOk=,0,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.827778,0.827778,0.5,0.0,0.007573,0.014799,0.010756,0.007718,0.00532,0.006913,0.005715,0.007692,0.007429,0.305467,0.45175,0.397667,0.3011,0.215,0.35,0.224476,0.439867,0.6275,0.642933,0.684667,0.250432,0.517612,0.259894,1.0,1.0,0.827778,0.827778,0.0,0.0,1.0,0.0,0.0,0.004802,0.006514,0.006127,0.012658,0.006993,0.006944,0.004808,0.0,0.0,0.0,0.006867,0.50501,0.0,0.018315,0.465657
737412,G7p0U1TYODDlWLk5UH4g4HJjWAxaSwhr9zE0bFNsScs=,0,0.590909,0.485714,1.0,0.692308,0.95122,1.0,0.827778,0.827778,0.5,0.0,0.002133,0.000529,0.002907,0.000551,0.002007,0.002548,0.002129,0.005769,0.001938,0.222267,0.04165,0.277833,0.055533,0.209459,0.333284,0.216003,0.294667,0.1863,0.711333,0.2484,0.379865,0.355224,0.331217,0.166667,1.0,0.827778,0.827778,0.020833,0.004831,0.766667,0.833333,0.0,0.009243,0.013027,0.008578,0.0,0.013986,0.0,0.009315,0.0,0.0,0.133333,0.025751,0.455741,0.0,0.0,0.539516


In [11]:
# pipeline function 1 call
final_fun_1(sampled_train)

array([0.0238806 , 0.9137967 , 0.04614686, 0.02403701, 0.02416257,
       0.02166689, 0.03438076, 0.03916668, 0.02161474, 0.02969514],
      dtype=float32)

In [12]:
# pipeline function 2 call
final_fun_2(sampled_train, sampled_train['is_churn'].values)

0.07613558461889625

### 2. For single point

In [13]:
# taking a point from dataset randomly
sampled_train = train_data.sample(1)
sampled_train

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,total_secs_sum,is_weekend_sum,is_weekday_sum,num_25_mean,num_50_mean,num_75_mean,num_985_mean,num_100_mean,num_unq_mean,total_secs_mean,num_25_std,num_50_std,num_75_std,num_985_std,num_100_std,num_unq_std,total_secs_std,active_days,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,transaction_date_count,membership_expire_count,activity_period,inactive_days,is_rare,avg_time_perday,unq_track_perday,till_25_perday,till_50_perday,till_75_perday,till_985_perday,till_full_perday,discount,is_discount,days_since_last_login,days_left,layality_range,Perday_price,days_since_final_transaction,registration_init_time
441796,gicrsFZ5qw/ycMrHUtDUaSZLRV5YBZNHF0wI+4KRdCE=,0,0.045455,0.385714,0.0,0.538462,1.0,1.0,0.827778,0.827778,0.5,0.0,0.003093,0.006342,0.004651,0.005237,0.006626,0.006136,0.007633,0.005769,0.005491,0.168067,0.261,0.232,0.275333,0.360811,0.42194,0.403974,0.484667,0.4773,0.635067,0.6416,0.445405,0.513433,0.533757,0.733333,1.0,0.827778,0.827778,0.0,0.0,1.0,0.266667,0.0,0.008645,0.00793,0.003676,0.006329,0.006993,0.006944,0.008113,0.0,0.0,0.0,0.001288,0.280585,0.0,0.034188,0.69546


In [14]:
# pipeline function 1 call
final_fun_1(sampled_train)



array([0.02294197], dtype=float32)

In [15]:
# pipeline function 2 call
final_fun_2(sampled_train, sampled_train['is_churn'].values)



0.023209237631301262