In [None]:
import numpy as np
import pandas as pd

# Build the undersampled training file: keep every target == 0 row and a random
# draw of 2000 target == 1 rows, then write the result for the Spark cells to read.
df = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv',index_col=0)

zeros_index = df.index[df.target == 0]
ones_index = df.index[df.target == 1]

# Seeded generator so that a fresh "Restart & Run All" writes the same file every time.
rng = np.random.default_rng(42)

final_list = list(zeros_index)
final_list.extend(rng.choice(ones_index, 2000, replace=False))
df = df.loc[final_list,:]
# index=False: the index column read via index_col=0 is not written to the output file.
df.to_csv('undersampled.csv',index=False)
del zeros_index,ones_index

In [None]:
# Note: hclip and lclip aren't used in the data_query, but they are still passed

In [None]:
# Run below commands in google colab
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# install findspark 
!pip install -q findspark

In [None]:
import os
# Point the Spark tooling at the Java 8 runtime and at the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "./spark-3.1.1-bin-hadoop3.2",
})

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook.
import findspark
# Keep this call ahead of the pyspark import below: findspark is what makes the
# Spark installation referenced by SPARK_HOME importable from this kernel.
findspark.init()
from pyspark.sql import SparkSession
# getOrCreate() returns the already-running session if one exists.
spark = SparkSession.builder.appName('Santander Fast Sample').getOrCreate()

In [None]:
# Load the undersampled CSV into a Spark DataFrame; the first row is the header
# and the column types are inferred from the data.
data_customer = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('./undersampled.csv')
)
# print(data_customer.columns)  # uncomment to inspect the column names

In [None]:
# Getting lclip and hclip before beginning sampling

# Template of the percentile query. The table is qualified with its schema via
# {db}; previously `db` was passed to format() but never used, so the rendered
# query disagreed with `perc_query_completed` below.
perc_query = '''
select
    percentile_approx({column}, array(0.95, 0.01)) as perc
from {db}.{data_table}
where {date_column} = '{ing_date}'
and {segment_column} = "{segment}"
and model_name= '{model_name}'
'''.format(column='frm_dnb_employee_site_cnt', db='ws_mkt_dst',
            data_table='tb_sb_gbt_meta_purch_mdls_ads_uid015_v004', ing_date='2021-01-01',
            segment_column='lcm_segment_sb_chnl',
            date_column='selection_date',
            segment='development', model_name='sb_metapurchase_sb_chnl_30d')

# frm_dnb_employee_site_cnt is used by Bucketizer to create distinct bins in case of use of fast_sampler.py
# ws_mkt_dst.tb_sb_gbt_meta_purch_mdls_ads_uid015_v004 is the Master ADS for UK. This is the main database for UK Metapurchase
# lcm_segment_sb_chnl='development': choose any one of the LCM segments
# sb_metapurchase_sb_chnl_30d: the model name tells us we are training a Metapurchase model which predicts for 30 days
perc_query_completed = '''
select percentile_approx(frm_dnb_employee_site_cnt, array(0.95, 0.01)) as perc 
from ws_mkt_dst.tb_sb_gbt_meta_purch_mdls_ads_uid015_v004
where selection_date='2021-01-01'
and
lcm_segment_sb_chnl='development'
and
model_name='sb_metapurchase_sb_chnl_30d'
'''

# Stand-in for the query result: [95th percentile, 1st percentile],
# i.e. spark2array(pquery, 'perc')[0] in the original code.
output = [30.0,0.0]

# The percentile order in the query is (0.95, 0.01), so the high clip comes first.
hclip, lclip = list(map(int, output)) # hclip, lclip = list(map(int, spark2array(pquery, 'perc')[0]))
print(hclip,lclip)

In [None]:
# Walk-through of the sampling function, up to the point where it calls
#   transformed, c_vals = compute_clusters(df_zero, n, clips, k)
# Reference signature:
#   sampling(data_query, clips, k, tgt_column, sample_column, rid_name, segment, date, type,
#            split, gamma, segment_column, date_column, force=False)
k = 10
tgt_column = 'target'
split = 0.95

# Nulls are replaced with k + 1 before the data is split by class.
data_customer = data_customer.fillna(k + 1)

df_zero = data_customer.filter(f'{tgt_column} = 0')
df_ones = data_customer.filter(f'{tgt_column} = 1')
n = df_zero.count()
n_ones = df_ones.count()

# Fraction of the majority class to keep so that it makes up `split` (0.95) of the final sample.
z_perc = (n_ones * split) / ((1 - split) * n)
print(z_perc)

In [None]:
# Entering the compute_clusters sub-function:
#   compute_clusters(df, n, clips, k, remove_clusters=False, min_req_clusters=5)
# called as: transformed, c_vals = compute_clusters(df_zero, n, clips, k)

k = 10
min_req_clusters=5
clips  = (lclip,hclip)

# k + 1 evenly spaced edges from lclip up to hclip, which yields k buckets.
# With clips = (0, 30) and k = 10: array([ 0.,  3.,  6.,  9., 12., 15., 18., 21., 24., 27., 30.])
splits = np.linspace(clips[0], clips[1], k + 1)
# Drop the first and last edges (lclip and hclip themselves): array([ 3.,  6., ..., 24., 27.])
splits  = splits[1:-1]
# Add -inf and +inf as the outer bounds so values outside the clips do not raise
# an out-of-bounds exception in Bucketizer.
splits = np.insert(splits, [0, len(splits)], [-float('inf'), float('inf')])

In [None]:
# Bucketizer: assign every non-responder row to a cluster id and count rows per cluster
from pyspark.ml.feature import Bucketizer
from pyspark.sql import functions as F

# var_45 is bucketized because it is a high-variance variable in this dataset.
bucketizer = Bucketizer(
    splits=splits,
    inputCol="var_45",
    outputCol="clusters_ids",
)
df = bucketizer.transform(df_zero)

cm = df.groupBy('clusters_ids').agg(F.count('*'))
cm.show()

# One [cluster id, row count] pair per cluster, as a 2-D numpy array.
cluster_rows = cm.collect()
cluster_counts = np.array(
    [[row['clusters_ids'], row['count(1)']] for row in cluster_rows]
)

In [None]:
# Optional pruning of sparsely populated clusters. In the reference code this is
#   remove_unpopulated_clusters(cluster_counts, n=df_zero.count(), thresh=0.02)
# and it only runs when remove_clusters is True (default: False).
remove_clusters=False

# NOTE(review): the reference signature quotes thresh=0.02 while 0.2 is used here -- confirm which is intended.
thresh = 0.2
cluster_perc = cluster_counts[:, 1] / float(n)
cluster_sorted_idx = np.argsort(cluster_perc)

# Walking clusters from smallest to largest: True once the running share of the dataset exceeds `thresh`
trash_mask = np.cumsum(cluster_perc[cluster_sorted_idx]) > thresh
# clusters that individually hold more than 1% of the dataset
big_clusters = cluster_perc[cluster_sorted_idx] > 0.01
kept_clusters = trash_mask | big_clusters

# clusters sorted by percentage, then restricted to the ones that remain
vals = cluster_counts[cluster_sorted_idx]
populated_clusters = vals[kept_clusters]

if remove_clusters:
    # The minimum-cluster requirement only applies when pruning is actually enabled;
    # previously it could raise even though the pruned result was never used.
    if len(populated_clusters) < min_req_clusters:
        raise RuntimeError("Number of clusters is less then five")
    c_vals = populated_clusters
else:
    c_vals = cluster_counts

c_vals[:, 0] = list(map(int,c_vals[:, 0])) # just convert clusters to integers
print(c_vals)

In [None]:
# Get sample fractions: per-cluster sampling rates for the non-responders.
gamma = 0.2
force = False
perc = z_perc
clusters = cluster_counts

# gamma splits the overall rate into a part that keeps the existing distribution
# and a part that is spread evenly over the clusters.
prev_dist_perc, uni_dist_perc = gamma * perc, perc * (1 - gamma)
print(prev_dist_perc,uni_dist_perc)

n_clusters = len(clusters)
n_uni_dist = float(sum(clusters[:, 1]) * uni_dist_perc)

# Each cluster gets the base rate plus an equal share of the uniform budget, capped at 1.0.
fractions = {}
for cluster_id, cluster_size in clusters:
    cluster_rate = prev_dist_perc + (n_uni_dist / float(cluster_size * n_clusters))
    fractions[cluster_id] = min(cluster_rate, 1.0)
fractions

In [None]:
# Stratified sampling of the non-responders by cluster id. The computed `fractions`
# are used instead of a pasted literal, so changes to gamma, split or the bucketing
# are picked up. Keys and values are cast to plain Python floats before being
# handed to Spark.
sample_fractions = {float(cluster_id): float(rate) for cluster_id, rate in fractions.items()}
sampled_data  = df.stat.sampleBy('clusters_ids', fractions=sample_fractions, seed=1)
sampled_data.groupBy('target').agg(F.count('*')).show() # reduced from 179902

In [None]:
# Combining the responders and non_responders
random_cols = ['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7']
purchasers_data = df_ones.select(random_cols)
# union() matches columns by position; both sides select the same column list.
sampled_data = sampled_data.select(random_cols).union(purchasers_data)
sampled_data.groupBy('target').agg(F.count('*')).show()

# Report the majority-class share from the actual row counts rather than hard-coded numbers.
n_purchasers = purchasers_data.count()
n_total = sampled_data.count()
print(1- (n_purchasers/n_total), 'is the required split as set')