In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from imblearn.under_sampling import RandomUnderSampler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Build the working CSV: keep every target == 0 row and only a fixed-size
# random subset of the target == 1 rows, then write the result to disk.
SEED = 42      # fixed so Restart & Run All always writes the same undersampled.csv
N_ONES = 2000  # number of target == 1 rows to keep

df = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv',index_col=0)

zeros_index = df.index[df.target == 0]
ones_index = df.index[df.target == 1]

rng = np.random.default_rng(SEED)
# Cap the request: choice(..., replace=False) raises when asked for more rows
# than are available.
sampled_ones = rng.choice(ones_index, min(N_ONES, len(ones_index)), replace=False)

final_list = list(zeros_index)
final_list.extend(list(sampled_ones))
df = df.loc[final_list,:]
# index=False: the index column (read with index_col=0) is not written out.
df.to_csv('undersampled.csv',index=False)
del zeros_index,ones_index

In [None]:
# Run below commands in google colab
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# install findspark 
!pip install -q findspark

In [None]:
import os

# Tell findspark/PySpark where the JDK and the extracted Spark distribution are.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "./spark-3.1.1-bin-hadoop3.2",
})

In [None]:
import findspark

# findspark.init() must run before pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start one.
spark = (
    SparkSession.builder
    .appName('Santander Customer Classification')
    .getOrCreate()
)

In [None]:
# Load the CSV written earlier (header row, inferred column types) and replace
# nulls in numeric columns with 0.
data_customer = (
    spark.read
    .csv('./undersampled.csv', header=True, inferSchema=True)
    .fillna(0)
)
print(data_customer.columns)

In [None]:
# Split the data by class and work out which fraction of the target == 0 rows
# must be kept so that they make up `split` of the combined data set.
tgt_column = 'target'
split = 0.95

df_zero = data_customer.filter(f'{tgt_column} = 0')
df_ones = data_customer.filter(f'{tgt_column} = 1')

n_ones = df_ones.count()
n = df_zero.count()

# zeros_to_keep = n_ones * split / (1 - split); dividing by n gives a fraction.
zeros_to_keep = float(n_ones * split) / float(1 - split)
z_perc = zeros_to_keep / float(n)
print(z_perc)

In [None]:
# Assemble every non-target column into a single feature vector and standardize it.
# (The original function this was adapted from also standardizes.)
# StandardScaler is imported here as well: it was used below without ever being
# imported, which raised NameError on a fresh kernel.
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Select the features by name instead of by position so the target column is
# excluded wherever it sits in the CSV.
num_cols = [c for c in data_customer.columns if c != tgt_column] # float_cols used in original code
assemble = VectorAssembler(inputCols=num_cols, outputCol='features')
# Only the target == 0 rows are assembled; these are the rows clustered later.
assembled_data=assemble.transform(df_zero)

#Standardizing Data (zero mean, unit standard deviation per feature)
scale=StandardScaler(inputCol='features',outputCol='standardized',withMean=True,withStd=True)
data_scale=scale.fit(assembled_data)
data_scale_output = data_scale.transform(assembled_data)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import functions as F

In [None]:
# Cluster the standardized target == 0 rows into 5 groups, attach the cluster
# id ('prediction') to every row, and count the rows per cluster.
kmeans = KMeans(
    featuresCol='standardized',
    k=5,
    initSteps=10,
    maxIter=100,
    seed=10,
)
standardized_only = data_scale_output.select('standardized')
model = kmeans.fit(standardized_only)
transformed = model.transform(data_scale_output)

# The count column is left un-aliased on purpose: the following cell reads it
# by its auto-generated name.
per_cluster = transformed.groupBy('prediction')
cm = per_cluster.agg(F.count('*'))
cm.show()

In [None]:
# Minimum cumulative share of the data below which the smallest clusters are dropped.
thresh = 0.02
# One [cluster_id, row_count] pair per cluster, e.g. [[1, 76], ...]
cluster_rows = cm.collect()
cluster_counts = np.array([[row['prediction'], row['count(1)']] for row in cluster_rows])
# Share of the target == 0 rows that falls into each cluster.
cluster_perc = cluster_counts[:, 1] / float(n)
print(cluster_perc)

In [None]:
# Drop sparsely populated clusters.
# Clusters are visited from smallest to largest share. A cluster is kept when
# the cumulative share up to and including it exceeds `thresh`, or when its own
# share exceeds 1%.
cluster_sorted_idx = np.argsort(cluster_perc)
sorted_perc = cluster_perc[cluster_sorted_idx]
trash_mask = sorted_perc.cumsum() > thresh # e.g. [False,  True,  True,  True,  True]
big_clusters = sorted_perc > 0.01
kept_clusters = np.logical_or(trash_mask, big_clusters)
vals = cluster_counts[cluster_sorted_idx]
clusters = vals[kept_clusters]
len(clusters)

In [None]:
#get_sample_fractions(z_perc, gamma, n, c_vals, force)
# Compute a per-cluster sampling fraction and down-sample the target == 0 rows.
# A gamma share of the overall keep-rate is applied to every cluster
# (preserving the existing distribution); the remaining (1 - gamma) share is
# spread as an equal number of rows per kept cluster.
gamma = 0.2
force = False  # carried over from the original function signature; unused in this cell
perc = z_perc

prev_dist_perc = gamma * perc
uni_dist_perc = perc * (1 - gamma)
print(prev_dist_perc,uni_dist_perc)
n_uni_dist = float(sum(clusters[:, 1]) * uni_dist_perc)
n_clusters = len(clusters)

# Keys and values are converted to built-in int/float: sampleBy rejects numpy
# integer keys. Fractions are capped at 1.0.
fractions = {
    int(x[0]): float(min([prev_dist_perc + (n_uni_dist / float((x[1] * n_clusters))), 1.0]))
    for x in clusters
}
print(fractions)

# Use the fractions computed above rather than values pasted from an earlier
# run, whose cluster ids and sizes need not match the current clustering.
# The seed makes the stratified sample repeatable.
sampled_data  = transformed.stat.sampleBy('prediction', fractions=fractions, seed=10)
sampled_data.groupBy('target').agg(F.count('*')).show() # target == 0 rows left after sampling

In [None]:
# Combining the responders and non_responders
# Only 'target' and the first eleven var_* columns are carried forward.
random_cols = ['target'] + ['var_{}'.format(i) for i in range(11)]
purchasers_data = df_ones.select(random_cols)

# Keep only target == 0 rows before the union. On the first run this changes
# nothing (the sample contains zeros only); on a re-run it stops the responders
# from being appended a second time.
non_purchasers_data = sampled_data.filter('target = 0').select(random_cols)
sampled_data = non_purchasers_data.union(purchasers_data)
sampled_data.groupBy('target').agg(F.count('*')).show()

# Report the zero-class share from the actual counts instead of numbers copied
# from a previous run.
n_purchasers = purchasers_data.count()
n_total = sampled_data.count()
if n_total:
    print(1- (n_purchasers/n_total), 'is the required split as set')