In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as f
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression
import six
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("xor") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '1') \
    .config("spark.driver.memory",'1g') \
    .getOrCreate()

sc = spark.sparkContext

In [3]:
data_schema = StructType([
    StructField('Date',StringType(), False),
    StructField('Customer_Code',DoubleType(), False),
    StructField('Gender',IntegerType(), False),
    StructField('Foreigner_Index',IntegerType(), False),
    StructField('Channel',StringType(), False),
    StructField('Province_Name',StringType(), False),
    StructField('Active',IntegerType(), False),
    StructField('Segmentation',IntegerType(), False),
    StructField('Savings_Account',IntegerType(), False),
    StructField('Guarantees',IntegerType(), False),
    StructField('Current_Accounts',IntegerType(), False),
    StructField('Derivative',IntegerType(), False),
    StructField('Payroll_Account',IntegerType(), False),
    StructField('Junior_Account',IntegerType(), False),
    StructField('More_Particular_Account',IntegerType(), False),
    StructField('Particular_Account',IntegerType(), False),
    StructField('Particular_Plus_Account',IntegerType(), False),
    StructField('Short_Term_Deposits',IntegerType(), False),
    StructField('Medium_Term_Deposits',IntegerType(), False),
    StructField('Long_Term_Deposits',IntegerType(), False),
    StructField('e-Account',IntegerType(), False),
    StructField('Funds',IntegerType(), False),
    StructField('Mortgage',IntegerType(), False),
    StructField('Pensions',IntegerType(), False),
    StructField('Loans',IntegerType(), False),
    StructField('Taxes',IntegerType(), False),
    StructField('Credit_Card',IntegerType(), False),
    StructField('Securities',IntegerType(), False),
    StructField('Home_Account',IntegerType(), False),
    StructField('Payroll',IntegerType(), False),
    StructField('Pensions_two',IntegerType(), False),
    StructField('Direct_Debit',IntegerType(), False),
    StructField('Age_Range',IntegerType(), False),
    StructField('Months_Range',IntegerType(), False),
    StructField('Income_Range',IntegerType(), False)
])

In [4]:
df = spark.read.csv(
    'santander_df_clean.csv', header=True, schema=data_schema
).cache()

In [5]:
df.columns

['Date',
 'Customer_Code',
 'Gender',
 'Foreigner_Index',
 'Channel',
 'Province_Name',
 'Active',
 'Segmentation',
 'Savings_Account',
 'Guarantees',
 'Current_Accounts',
 'Derivative',
 'Payroll_Account',
 'Junior_Account',
 'More_Particular_Account',
 'Particular_Account',
 'Particular_Plus_Account',
 'Short_Term_Deposits',
 'Medium_Term_Deposits',
 'Long_Term_Deposits',
 'e-Account',
 'Funds',
 'Mortgage',
 'Pensions',
 'Loans',
 'Taxes',
 'Credit_Card',
 'Securities',
 'Home_Account',
 'Payroll',
 'Pensions_two',
 'Direct_Debit',
 'Age_Range',
 'Months_Range',
 'Income_Range']

Going to investigate all 24 Products, and develop linear regression models to create implicit ratings. If a product has an insignificant level of customers, or doesn't have a strong correlation with any features, its implicit rating will be 0. I will look at the correlation between the binary dummy variables and the products.

#### Savings_Account ( 0 )

In [6]:
df.groupBy('Savings_Account').count().show()

+---------------+--------+
|Savings_Account|   count|
+---------------+--------+
|              1|    1363|
|              0|10291257|
+---------------+--------+



In [None]:
#Fewer than .1% of customers have a savings account
Rating[Savings_Account] = 0 

#### Guarantees ( 0 )

In [6]:
df.groupBy('Guarantees').count().show()

+----------+--------+
|Guarantees|   count|
+----------+--------+
|         1|     310|
|         0|10292310|
+----------+--------+



In [None]:
#Fewer than .1% of customers have a 'Guarantees'
Rating[Guarantees] = 0 

#### Current_Accounts ( LR )

In [7]:
df.groupBy('Current_Accounts').count().show()

+----------------+-------+
|Current_Accounts|  count|
+----------------+-------+
|               1|8009093|
|               0|2283527|
+----------------+-------+



In [6]:
current_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Current_Accounts')
for i in current_df.columns:
    if not( isinstance(current_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Current_Accounts for ", i, current_df.stat.corr('Current_Accounts',i))

Correlation to Current_Accounts for  Gender 0.05007959153329324
Correlation to Current_Accounts for  Foreigner_Index -0.010748941450459924
Correlation to Current_Accounts for  Active -0.10595967640205521
Correlation to Current_Accounts for  Current_Accounts 1.0


In [5]:
current_df = df.select('Customer_Code', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Current_Accounts')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(current_df)
current_df = encoder.transform(current_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
current_df = vectorAssembler.transform(current_df)
current_df = current_df.select(['Customer_Code','features', 'Current_Accounts'])
current_df.show(3)

+-------------+--------------------+----------------+
|Customer_Code|            features|Current_Accounts|
+-------------+--------------------+----------------+
|      15930.0|(15,[3,12,13],[1....|               1|
|      15930.0|(15,[3,12,13],[1....|               1|
|      15930.0|(15,[3,12,13],[1....|               1|
+-------------+--------------------+----------------+
only showing top 3 rows



In [6]:
splits = current_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [8]:
lr = LinearRegression(featuresCol = 'features', labelCol='Current_Accounts', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.415525
r2: 0.000000


In [None]:
rf = RandomForestClassifier(labelCol="Current_Accounts", featuresCol="features")
evaluator = BinaryClassificationEvaluator(
    labelCol="Current_Accounts", rawPredictionCol="prediction", metricName="areaUnderROC")
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [3, 6, 10]).build()
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3)
model = crossval.fit(train_df)

In [21]:
model = model.bestModel
model.getNumTrees

10

In [22]:
model = model.bestModel
predictions = model.transform(test_df)
ROC = evaluator.evaluate(predictions)
print("Area under ROC Curve = %g" % ROC)

Area under ROC Curve = 0.522048


In [44]:
predictions.groupBy('prediction').count().show()

+----------+-------+
|prediction|  count|
+----------+-------+
|       0.0|  40549|
|       1.0|3046917|
+----------+-------+



In [45]:
predictions.show(3)

+--------------------+----------------+--------------------+--------------------+----------+
|            features|Current_Accounts|       rawPrediction|         probability|prediction|
+--------------------+----------------+--------------------+--------------------+----------+
|(15,[0,4,10,14],[...|               0|[6.50677861653242...|[0.65067786165324...|       0.0|
|(15,[0,4,11,14],[...|               0|[6.50677861653242...|[0.65067786165324...|       0.0|
|(15,[0,4,11,14],[...|               0|[6.50677861653242...|[0.65067786165324...|       0.0|
+--------------------+----------------+--------------------+--------------------+----------+
only showing top 3 rows



In [7]:
LR = LogisticRegression(labelCol="Current_Accounts", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Currentpredictions = LRmodel.transform(test_df)

In [8]:
evaluator = BinaryClassificationEvaluator(labelCol='Current_Accounts')
auroc = evaluator.evaluate(Currentpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.7226


In [64]:
Currentpredictions.show(3)

+-------------+--------------------+----------------+--------------------+--------------------+----------+
|Customer_Code|            features|Current_Accounts|       rawPrediction|         probability|prediction|
+-------------+--------------------+----------------+--------------------+--------------------+----------+
|      15889.0|(15,[3,13],[1.0,1...|               1|[-0.6161026503375...|[0.35066836088241...|       1.0|
|      15889.0|(15,[3,13],[1.0,1...|               1|[-0.6161026503375...|[0.35066836088241...|       1.0|
|      15889.0|(15,[3,13],[1.0,1...|               1|[-0.6161026503375...|[0.35066836088241...|       1.0|
+-------------+--------------------+----------------+--------------------+--------------------+----------+
only showing top 3 rows



#### Derivative ( 0 )

In [7]:
df.groupBy('Derivative').count().show()

+----------+--------+
|Derivative|   count|
+----------+--------+
|         1|    5211|
|         0|10287409|
+----------+--------+



In [None]:
#Fewer than .1% of customers have a Derivative
Rating[Derivative] = 0 

#### Payroll_Account ( LR )

In [9]:
df.groupBy('Payroll_Account').count().show()

+---------------+-------+
|Payroll_Account|  count|
+---------------+-------+
|              1|1036365|
|              0|9256255|
+---------------+-------+



In [11]:
payroll_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Payroll_Account')
for i in payroll_df.columns:
    if not( isinstance(payroll_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Payroll_Account for ", i, payroll_df.stat.corr('Payroll_Account',i))

Correlation to Payroll_Account for  Gender -0.030316400642578074
Correlation to Payroll_Account for  Foreigner_Index -0.00517684620680248
Correlation to Payroll_Account for  Active 0.3030775727099324
Correlation to Payroll_Account for  Payroll_Account 1.0


In [9]:
payroll_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Payroll_Account')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(payroll_df)
payroll_df = encoder.transform(payroll_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
payroll_df = vectorAssembler.transform(payroll_df)
payroll_df = payroll_df.select(['Customer_Code','features', 'Payroll_Account'])
payroll_df.show(3)

+-------------+--------------------+---------------+
|Customer_Code|            features|Payroll_Account|
+-------------+--------------------+---------------+
|      15930.0|(16,[4,13,14],[1....|              0|
|      15930.0|(16,[4,13,14],[1....|              0|
|      15930.0|(16,[4,13,14],[1....|              0|
+-------------+--------------------+---------------+
only showing top 3 rows



In [10]:
splits = payroll_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [27]:
rf = RandomForestClassifier(labelCol="Payroll_Account", featuresCol="features")
evaluator = BinaryClassificationEvaluator(
    labelCol="Payroll_Account", rawPredictionCol="prediction", metricName="areaUnderROC")
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [3, 6, 10]).build()
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3)
model = crossval.fit(train_df)

In [28]:
model = model.bestModel
predictions = model.transform(test_df)
ROC = evaluator.evaluate(predictions)
print("Area under ROC Curve = %g" % ROC)

Area under ROC Curve = 0.5


In [11]:
LR = LogisticRegression(labelCol="Payroll_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Payroll_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Payroll_Account')
auroc = evaluator.evaluate(Payroll_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8166


The Logistic Regression Model appaears to be superior for this dataset, so I will use only it for all products here on out.

#### Junior_Account ( LR )

In [10]:
df.groupBy('Junior_Account').count().show()

+--------------+--------+
|Junior_Account|   count|
+--------------+--------+
|             1|  118837|
|             0|10173783|
+--------------+--------+



In [9]:
current_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Junior_Account')
for i in current_df.columns:
    if not( isinstance(current_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Junior_Accounts for ", i, current_df.stat.corr('Junior_Account',i))

Correlation to Junior_Accounts for  Gender 0.006947242747022523
Correlation to Junior_Accounts for  Foreigner_Index -0.020453048347726314
Correlation to Junior_Accounts for  Active 0.06089195163825482
Correlation to Junior_Accounts for  Junior_Account 1.0


In [12]:
Junior_Account_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Junior_Account')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Junior_Account_df)
Junior_Account_df = encoder.transform(Junior_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Junior_Account_df = vectorAssembler.transform(Junior_Account_df)
Junior_Account_df = Junior_Account_df.select(['Customer_Code','features', 'Junior_Account'])
splits = Junior_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [13]:
LR = LogisticRegression(labelCol="Junior_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
LRpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Junior_Account')
auroc = evaluator.evaluate(LRpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.9197


#### More_Particular_Account ( LR )

In [11]:
df.groupBy('More_Particular_Account').count().show()

+-----------------------+--------+
|More_Particular_Account|   count|
+-----------------------+--------+
|                      1|  108324|
|                      0|10184296|
+-----------------------+--------+



In [31]:
More_Particular_df = df.select('Gender',  'Foreigner_Index', 'Active', 'More_Particular_Account')
for i in More_Particular_df.columns:
    if not( isinstance(More_Particular_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to More_Particular_Account for ", i, More_Particular_df.stat.corr('More_Particular_Account',i))

Correlation to More_Particular_Account for  Gender -0.010663105239629682
Correlation to More_Particular_Account for  Foreigner_Index 0.0033808939415948995
Correlation to More_Particular_Account for  Active 0.04515863963992152
Correlation to More_Particular_Account for  More_Particular_Account 1.0


In [14]:
More_Particular_Account_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','More_Particular_Account')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(More_Particular_Account_df)
More_Particular_Account_df = encoder.transform(More_Particular_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
More_Particular_Account_df = vectorAssembler.transform(More_Particular_Account_df)
More_Particular_Account_df = More_Particular_Account_df.select(['Customer_Code','features', 'More_Particular_Account'])
splits = More_Particular_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [15]:
LR = LogisticRegression(labelCol="More_Particular_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
More_Particular_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='More_Particular_Account')
auroc = evaluator.evaluate(More_Particular_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.5000


#### Particular_Account ( LR )

In [12]:
df.groupBy('Particular_Account').count().show()

+------------------+-------+
|Particular_Account|  count|
+------------------+-------+
|                 1|1749357|
|                 0|8543263|
+------------------+-------+



In [30]:
Particular_Account_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Particular_Account')
for i in Particular_Account_df.columns:
    if not( isinstance(Particular_Account_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Particular_Account for ", i, Particular_Account_df.stat.corr('Particular_Account',i))

Correlation to Particular_Account for  Gender -0.07060039224986418
Correlation to Particular_Account for  Foreigner_Index -0.033491255665048764
Correlation to Particular_Account for  Active 0.12640958493466675
Correlation to Particular_Account for  Particular_Account 1.0


In [16]:
Particular_Account_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Particular_Account')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Particular_Account_df)
Particular_Account_df = encoder.transform(Particular_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Particular_Account_df = vectorAssembler.transform(Particular_Account_df)
Particular_Account_df = Particular_Account_df.select(['Customer_Code','features', 'Particular_Account'])
splits = Particular_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [17]:
LR = LogisticRegression(labelCol="Particular_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Particular_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Particular_Account')
auroc = evaluator.evaluate(Particular_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8756


#### Particular_Plus_Account (LR)

In [13]:
df.groupBy('Particular_Plus_Account').count().show()

+-----------------------+-------+
|Particular_Plus_Account|  count|
+-----------------------+-------+
|                      1| 587160|
|                      0|9705460|
+-----------------------+-------+



In [28]:
Particular_Plus_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Particular_Plus_Account')
for i in Particular_Plus_df.columns:
    if not( isinstance(Particular_Plus_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Particular_Plus_Account for ", i, Particular_Plus_df.stat.corr('Particular_Plus_Account',i))

Correlation to Particular_Plus_Account for  Gender -0.032542597586215735
Correlation to Particular_Plus_Account for  Foreigner_Index -0.010798075673421845
Correlation to Particular_Plus_Account for  Active 0.14137984642094625
Correlation to Particular_Plus_Account for  Particular_Plus_Account 1.0


In [18]:
Particular_Plus_Account_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Particular_Plus_Account')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Particular_Plus_Account_df)
Particular_Plus_Account_df = encoder.transform(Particular_Plus_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Particular_Plus_Account_df = vectorAssembler.transform(Particular_Plus_Account_df)
Particular_Plus_Account_df = Particular_Plus_Account_df.select(['Customer_Code','features', 'Particular_Plus_Account'])
splits = Particular_Plus_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [19]:
LR = LogisticRegression(labelCol="Particular_Plus_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Particular_Plus_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Particular_Plus_Account')
auroc = evaluator.evaluate(Particular_Plus_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8265


#### Short_Term_Deposits ( 0 )

In [14]:
df.groupBy('Short_Term_Deposits').count().show()

+-------------------+--------+
|Short_Term_Deposits|   count|
+-------------------+--------+
|                  1|    8894|
|                  0|10283726|
+-------------------+--------+



In [None]:
#Fewer than .1% of customers have a Short_Term_Deposits account
Rating[Short_Term_Deposits] = 0 

#### Medium_Term_Deposits ( LR )

In [15]:
df.groupBy('Medium_Term_Deposits').count().show()

+--------------------+--------+
|Medium_Term_Deposits|   count|
+--------------------+--------+
|                   1|   22454|
|                   0|10270166|
+--------------------+--------+



In [27]:
Medium_Term_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Medium_Term_Deposits')
for i in Medium_Term_df.columns:
    if not(isinstance(Medium_Term_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Medium_Term_Deposits for ", i, Medium_Term_df.stat.corr('Medium_Term_Deposits',i))

Correlation to Medium_Term_Deposits for  Gender -0.003512467505477735
Correlation to Medium_Term_Deposits for  Foreigner_Index -0.0026938296530928706
Correlation to Medium_Term_Deposits for  Active 0.04321088779058333
Correlation to Medium_Term_Deposits for  Medium_Term_Deposits 1.0


In [20]:
Medium_Term_Deposits_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Medium_Term_Deposits')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Medium_Term_Deposits_df)
Medium_Term_Deposits_df = encoder.transform(Medium_Term_Deposits_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Medium_Term_Deposits_df = vectorAssembler.transform(Medium_Term_Deposits_df)
Medium_Term_Deposits_df = Medium_Term_Deposits_df.select(['Customer_Code','features', 'Medium_Term_Deposits'])
splits = Medium_Term_Deposits_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [21]:
LR = LogisticRegression(labelCol="Medium_Term_Deposits", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Medium_Term_Depositspredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Medium_Term_Deposits')
auroc = evaluator.evaluate(Medium_Term_Depositspredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.5000


#### Long_Term_Deposits ( LR )

In [16]:
df.groupBy('Long_Term_Deposits').count().show()

+------------------+-------+
|Long_Term_Deposits|  count|
+------------------+-------+
|                 1| 548516|
|                 0|9744104|
+------------------+-------+



In [26]:
Long_Term_Deposits_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Long_Term_Deposits')
for i in Long_Term_Deposits_df.columns:
    if not( isinstance(Long_Term_Deposits_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Long_Term_Deposits for ", i, Long_Term_Deposits_df.stat.corr('Long_Term_Deposits',i))

Correlation to Long_Term_Deposits for  Gender -0.026517707467546375
Correlation to Long_Term_Deposits for  Foreigner_Index -0.03205956190794697
Correlation to Long_Term_Deposits for  Active 0.2185275552136404
Correlation to Long_Term_Deposits for  Long_Term_Deposits 1.0


In [22]:
Long_Term_Deposits_df = df.select('Customer_Code','Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Long_Term_Deposits')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Long_Term_Deposits_df)
Long_Term_Deposits_df = encoder.transform(Long_Term_Deposits_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Long_Term_Deposits_df = vectorAssembler.transform(Long_Term_Deposits_df)
Long_Term_Deposits_df = Long_Term_Deposits_df.select(['Customer_Code','features', 'Long_Term_Deposits'])
splits = Long_Term_Deposits_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [23]:
LR = LogisticRegression(labelCol="Long_Term_Deposits", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Long_Term_Depositspredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Long_Term_Deposits')
auroc = evaluator.evaluate(Long_Term_Depositspredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8888


#### e-Account (LR)

In [20]:
df.groupBy('e-Account').count().show()

+---------+-------+
|e-Account|  count|
+---------+-------+
|        1|1077798|
|        0|9214822|
+---------+-------+



In [25]:
e_Account_df = df.select('Gender',  'Foreigner_Index', 'Active', 'e-Account')
for i in e_Account_df.columns:
    if not( isinstance(e_Account_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to e-Account for ", i, e_Account_df.stat.corr('e-Account',i))

Correlation to e-Account for  Gender -0.04694970724159377
Correlation to e-Account for  Foreigner_Index -0.028789494913759924
Correlation to e-Account for  Active 0.27054356960091075
Correlation to e-Account for  e-Account 1.0


In [24]:
e_Account_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','e-Account')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(e_Account_df)
e_Account_df = encoder.transform(e_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
e_Account_df = vectorAssembler.transform(e_Account_df)
e_Account_df = e_Account_df.select(['Customer_Code','features', 'e-Account'])
splits = e_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [25]:
LR = LogisticRegression(labelCol="e-Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
e_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='e-Account')
auroc = evaluator.evaluate(e_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8129


#### Funds (LR)

In [21]:
df.groupBy('Funds').count().show()

+-----+--------+
|Funds|   count|
+-----+--------+
|    1|  248133|
|    0|10044487|
+-----+--------+



In [24]:
Funds_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Funds')
for i in Funds_df.columns:
    if not( isinstance(Funds_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Funds for ", i, Funds_df.stat.corr('Funds',i))

Correlation to Funds for  Gender -0.04692703280449765
Correlation to Funds for  Foreigner_Index -0.017022772519029603
Correlation to Funds for  Active 0.1439464429735304
Correlation to Funds for  Funds 1.0


In [26]:
Funds_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Funds')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Funds_df)
Funds_df = encoder.transform(Funds_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Funds_df = vectorAssembler.transform(Funds_df)
Funds_df = Funds_df.select(['Customer_Code','features', 'Funds'])
splits = Funds_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [27]:
LR = LogisticRegression(labelCol="Funds", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Fundspredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Funds')
auroc = evaluator.evaluate(Fundspredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.7747


#### Mortgage (LR)

In [22]:
df.groupBy('Mortgage').count().show()

+--------+--------+
|Mortgage|   count|
+--------+--------+
|       1|   79897|
|       0|10212723|
+--------+--------+



In [23]:
Mortgage_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Mortgage')
for i in Mortgage_df.columns:
    if not( isinstance(Mortgage_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Mortgage for ", i, Mortgage_df.stat.corr('Mortgage',i))

Correlation to Mortgage for  Gender -0.030095779138293813
Correlation to Mortgage for  Foreigner_Index -0.012050523584328222
Correlation to Mortgage for  Active 0.08169778744820423
Correlation to Mortgage for  Mortgage 1.0


In [28]:
Mortgage_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Mortgage')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Mortgage_df)
Mortgage_df = encoder.transform(Mortgage_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Mortgage_df = vectorAssembler.transform(Mortgage_df)
Mortgage_df = Mortgage_df.select(['Customer_Code','features', 'Mortgage'])
splits = Mortgage_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [29]:
LR = LogisticRegression(labelCol="Mortgage", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Mortgagepredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Mortgage')
auroc = evaluator.evaluate(Mortgagepredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.5000


#### Pensions (LR)

In [23]:
df.groupBy('Pensions').count().show()

+--------+--------+
|Pensions|   count|
+--------+--------+
|       1|  123737|
|       0|10168883|
+--------+--------+



In [22]:
Pensions_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Pensions')
for i in Pensions_df.columns:
    if not( isinstance(Pensions_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Pensions for ", i, Pensions_df.stat.corr('Pensions',i))

Correlation to Pensions for  Gender -0.01815304828631781
Correlation to Pensions for  Foreigner_Index -0.0038319166610656314
Correlation to Pensions for  Active 0.10112164019193774
Correlation to Pensions for  Pensions 1.0


In [30]:
Pensions_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Pensions')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Pensions_df)
Pensions_df = encoder.transform(Pensions_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Pensions_df = vectorAssembler.transform(Pensions_df)
Pensions_df = Pensions_df.select(['Customer_Code','features', 'Pensions'])
splits = Pensions_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [31]:
LR = LogisticRegression(labelCol="Pensions", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Pensionspredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Pensions')
auroc = evaluator.evaluate(Pensionspredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.7813


#### Loans (LR)

In [24]:
df.groupBy('Loans').count().show()

+-----+--------+
|Loans|   count|
+-----+--------+
|    1|   34305|
|    0|10258315|
+-----+--------+



In [21]:
Loans_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Loans')
for i in Loans_df.columns:
    if not( isinstance(Loans_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Loans for ", i, Loans_df.stat.corr('Loans',i))

Correlation to Loans for  Gender -0.016345241083973866
Correlation to Loans for  Foreigner_Index 0.03816512475194234
Correlation to Loans for  Active 0.03251360151206657
Correlation to Loans for  Loans 1.0


In [32]:
Loans_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Loans')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Loans_df)
Loans_df = encoder.transform(Loans_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Loans_df = vectorAssembler.transform(Loans_df)
Loans_df = Loans_df.select(['Customer_Code','features', 'Loans'])
splits = Loans_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [33]:
LR = LogisticRegression(labelCol="Loans", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Loanspredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Loans')
auroc = evaluator.evaluate(Loanspredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.5000


#### Taxes (LR)

In [26]:
df.groupBy('Taxes').count().show()

+-----+-------+
|Taxes|  count|
+-----+-------+
|    1| 693694|
|    0|9598926|
+-----+-------+



In [20]:
Taxes_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Taxes')
for i in Taxes_df.columns:
    if not( isinstance(Taxes_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Taxes for ", i, Taxes_df.stat.corr('Taxes',i))

Correlation to Taxes for  Gender -0.056825807751829
Correlation to Taxes for  Foreigner_Index 0.013623825782699007
Correlation to Taxes for  Active 0.21344938247865125
Correlation to Taxes for  Taxes 1.0


In [34]:
Taxes_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Taxes')
encoder = OneHotEncoder(inputCols =['Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH', 'Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Taxes_df)
Taxes_df = encoder.transform(Taxes_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Taxes_df = vectorAssembler.transform(Taxes_df)
Taxes_df = Taxes_df.select(['Customer_Code','features', 'Taxes'])
splits = Taxes_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [35]:
LR = LogisticRegression(labelCol="Taxes", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Taxespredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Taxes')
auroc = evaluator.evaluate(Taxespredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8017


#### Credit_Card (LR)

In [27]:
df.groupBy('Credit_Card').count().show()

+-----------+-------+
|Credit_Card|  count|
+-----------+-------+
|          1| 592459|
|          0|9700161|
+-----------+-------+



In [17]:
Credit_Card_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Credit_Card')
for i in Credit_Card_df.columns:
    if not( isinstance(Credit_Card_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Credit_Card for ", i, Credit_Card_df.stat.corr('Credit_Card',i))

Correlation to Credit_Card for  Gender -0.053822880152472496
Correlation to Credit_Card for  Foreigner_Index 0.0022015999417327814
Correlation to Credit_Card for  Active 0.22713687273622554
Correlation to Credit_Card for  Credit_Card 1.0


In [36]:
Credit_Card_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Credit_Card')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Credit_Card_df)
Credit_Card_df = encoder.transform(Credit_Card_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Credit_Card_df = vectorAssembler.transform(Credit_Card_df)
Credit_Card_df = Credit_Card_df.select(['Customer_Code','features', 'Credit_Card'])
splits = Credit_Card_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [37]:
LR = LogisticRegression(labelCol="Credit_Card", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Credit_Cardpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Credit_Card')
auroc = evaluator.evaluate(Credit_Cardpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8408


#### Securities (LR)

In [28]:
df.groupBy('Securities').count().show()

+----------+-------+
|Securities|  count|
+----------+-------+
|         1| 343641|
|         0|9948979|
+----------+-------+



In [16]:
Securities_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Securities')
for i in Securities_df.columns:
    if not( isinstance(Securities_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Securities for ", i, Securities_df.stat.corr('Securities',i))

Correlation to Securities for  Gender -0.06187978128174594
Correlation to Securities for  Foreigner_Index -0.017876744641905203
Correlation to Securities for  Active 0.16882723797847662
Correlation to Securities for  Securities 1.0


In [38]:
Securities_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Securities')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Securities_df)
Securities_df = encoder.transform(Securities_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Securities_df = vectorAssembler.transform(Securities_df)
Securities_df = Securities_df.select(['Customer_Code','features', 'Securities'])
splits = Securities_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [39]:
LR = LogisticRegression(labelCol="Securities", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Securitiespredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Securities')
auroc = evaluator.evaluate(Securitiespredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8030


#### Home_Account (LR)

In [29]:
df.groupBy('Home_Account').count().show()

+------------+--------+
|Home_Account|   count|
+------------+--------+
|           1|   52244|
|           0|10240376|
+------------+--------+



In [15]:
Home_Account_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Home_Account')
for i in Home_Account_df.columns:
    if not( isinstance(Home_Account_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Home_Account for ", i, Home_Account_df.stat.corr('Home_Account',i))

Correlation to Home_Account for  Gender -0.011317095372186508
Correlation to Home_Account for  Foreigner_Index -0.001980165499324975
Correlation to Home_Account for  Active 0.034274595019433286
Correlation to Home_Account for  Home_Account 1.0


In [40]:
Home_Account_df = df.select('Customer_Code','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Home_Account')
encoder = OneHotEncoder(inputCols =['Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Home_Account_df)
Home_Account_df = encoder.transform(Home_Account_df)
vectorAssembler = VectorAssembler(inputCols = ['Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Home_Account_df = vectorAssembler.transform(Home_Account_df)
Home_Account_df = Home_Account_df.select(['Customer_Code','features', 'Home_Account'])
splits = Home_Account_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [41]:
LR = LogisticRegression(labelCol="Home_Account", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Home_Accountpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Home_Account')
auroc = evaluator.evaluate(Home_Accountpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.5000


#### Payroll (LR)

In [33]:
df.groupBy('Payroll').count().show()

+-------+-------+
|Payroll|  count|
+-------+-------+
|      0|9589574|
|      1| 703046|
+-------+-------+



In [13]:
Payroll_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Payroll')
for i in Payroll_df.columns:
    if not( isinstance(Payroll_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Payroll for ", i, Payroll_df.stat.corr('Payroll',i))

Correlation to Payroll for  Gender -0.022059038486154304
Correlation to Payroll for  Foreigner_Index -0.008036220433788807
Correlation to Payroll for  Active 0.24984857002019106
Correlation to Payroll for  Payroll 1.0


In [42]:
Payroll_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Payroll')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Payroll_df)
Payroll_df = encoder.transform(Payroll_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Payroll_df = vectorAssembler.transform(Payroll_df)
Payroll_df = Payroll_df.select(['Customer_Code','features', 'Payroll'])
splits = Payroll_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [43]:
LR = LogisticRegression(labelCol="Payroll", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Payrollpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Payroll')
auroc = evaluator.evaluate(Payrollpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8227


#### Pensions_two (LR)

In [36]:
df.groupBy('Pensions_two').count().show()

+------------+-------+
|Pensions_two|  count|
+------------+-------+
|           0|9528238|
|           1| 764382|
+------------+-------+



In [12]:
Pensionst_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Pensions_two')
for i in Pensionst_df.columns:
    if not( isinstance(Pensionst_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Pensions_two for ", i, Pensionst_df.stat.corr('Pensions_two',i))

Correlation to Pensions_two for  Gender -0.023482010325987058
Correlation to Pensions_two for  Foreigner_Index -0.009628463864864042
Correlation to Pensions_two for  Active 0.2612848984798003
Correlation to Pensions_two for  Pensions_two 1.0


In [44]:
Pensions_two_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Pensions_two')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Pensions_two_df)
Pensions_two_df = encoder.transform(Pensions_two_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Pensions_two_df = vectorAssembler.transform(Pensions_two_df)
Pensions_two_df = Pensions_two_df.select(['Customer_Code','features', 'Pensions_two'])
splits = Pensions_two_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [45]:
LR = LogisticRegression(labelCol="Pensions_two", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Pensions_twopredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Pensions_two')
auroc = evaluator.evaluate(Pensions_twopredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8059


#### Direct_Debit (LR)

In [10]:
df.groupBy('Direct_Debit').count().show()

+------------+-------+
|Direct_Debit|  count|
+------------+-------+
|           1|1627210|
|           0|8665410|
+------------+-------+



In [11]:
direct_df = df.select('Gender',  'Foreigner_Index', 'Active', 'Direct_Debit')
for i in direct_df.columns:
    if not( isinstance(direct_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Direct_Debit for ", i, direct_df.stat.corr('Direct_Debit',i))

Correlation to Direct_Debit for  Gender -0.05355695624237305
Correlation to Direct_Debit for  Foreigner_Index -0.0019241473309739895
Correlation to Direct_Debit for  Active 0.39745594130309425
Correlation to Direct_Debit for  Direct_Debit 1.0


In [46]:
Direct_Debit_df = df.select('Customer_Code','Active', 'Age_Range', 'Months_Range', 'Income_Range', 'Segmentation','Direct_Debit')
encoder = OneHotEncoder(inputCols =['Active','Age_Range', 'Months_Range', 'Income_Range', 'Segmentation'],
                                 outputCols =['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'])
encoder = encoder.fit(Direct_Debit_df)
Direct_Debit_df = encoder.transform(Direct_Debit_df)
vectorAssembler = VectorAssembler(inputCols = ['ActiveH','Age_RangeH', 'Months_RangeH', 'Income_RangeH', 'SegmentationH'], outputCol = 'features')
Direct_Debit_df = vectorAssembler.transform(Direct_Debit_df)
Direct_Debit_df = Direct_Debit_df.select(['Customer_Code','features', 'Direct_Debit'])
splits = Direct_Debit_df.randomSplit([0.6, 0.4])
train_df = splits[0]
test_df = splits[1]

In [47]:
LR = LogisticRegression(labelCol="Direct_Debit", featuresCol="features", maxIter=100, regParam=0.01,
                        elasticNetParam=1.0)
LRmodel = LR.fit(train_df)
Direct_Debitpredictions = LRmodel.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol='Direct_Debit')
auroc = evaluator.evaluate(Direct_Debitpredictions, {evaluator.metricName: "areaUnderROC"})
print("Area under ROC Curve: {:.4f}".format(auroc))

Area under ROC Curve: 0.8338


#### All Predictions Dataframe Construction

In [48]:
Currentpredictions = Currentpredictions.select('Customer_Code','Current_Accounts', 'prediction')
Currentpredictions = Currentpredictions.withColumnRenamed("prediction","Rating")
Currentpredictions = Currentpredictions.withColumnRenamed("Current_Accounts","Item")
Currentpredictions = Currentpredictions.replace(0,1, subset='Item')
Currentpredictions.show(3)

+-------------+----+------+
|Customer_Code|Item|Rating|
+-------------+----+------+
|      15889.0|   1|   1.0|
|      15889.0|   1|   1.0|
|      15889.0|   1|   1.0|
+-------------+----+------+
only showing top 3 rows



In [49]:
Payroll_Accountpredictions = Payroll_Accountpredictions.select('Customer_Code','Payroll_Account', 'prediction')
Payroll_Accountpredictions = Payroll_Accountpredictions.withColumnRenamed("prediction","Rating")
Payroll_Accountpredictions = Payroll_Accountpredictions.withColumnRenamed("Payroll_Account","Item")
Payroll_Accountpredictions = Payroll_Accountpredictions.replace([0,1],[2,2], subset='Item')
Payroll_Accountpredictions.show(3)

+-------------+----+------+
|Customer_Code|Item|Rating|
+-------------+----+------+
|      15889.0|   2|   0.0|
|      15889.0|   2|   0.0|
|      15889.0|   2|   0.0|
+-------------+----+------+
only showing top 3 rows



In [50]:
LRpredictions = LRpredictions.select('Customer_Code','Junior_Account', 'prediction')
LRpredictions = LRpredictions.withColumnRenamed("prediction","Rating")
LRpredictions = LRpredictions.withColumnRenamed("Junior_Account","Item")
LRpredictions = LRpredictions.replace([0,1],[3,3], subset='Item')

More_Particular_Accountpredictions = More_Particular_Accountpredictions.select('Customer_Code','More_Particular_Account', 'prediction')
More_Particular_Accountpredictions = More_Particular_Accountpredictions.withColumnRenamed("prediction","Rating")
More_Particular_Accountpredictions = More_Particular_Accountpredictions.withColumnRenamed("Junior_Account","Item")
More_Particular_Accountpredictions = More_Particular_Accountpredictions.replace([0,1],[4,4], subset='Item')

Particular_Accountpredictions = Particular_Accountpredictions.select('Customer_Code','Particular_Account', 'prediction')
Particular_Accountpredictions = Particular_Accountpredictions.withColumnRenamed("prediction","Rating")
Particular_Accountpredictions = Particular_Accountpredictions.withColumnRenamed("Particular_Account","Item")
Particular_Accountpredictions = Particular_Accountpredictions.replace([0,1],[5,5], subset='Item')

Particular_Plus_Accountpredictions = Particular_Plus_Accountpredictions.select('Customer_Code','Particular_Plus_Account', 'prediction')
Particular_Plus_Accountpredictions = Particular_Plus_Accountpredictions.withColumnRenamed("prediction","Rating")
Particular_Plus_Accountpredictions = Particular_Plus_Accountpredictions.withColumnRenamed("Particular_Plus_Account","Item")
Particular_Plus_Accountpredictions = Particular_Plus_Accountpredictions.replace([0,1],[6,6], subset='Item')

Medium_Term_Depositspredictions = Medium_Term_Depositspredictions.select('Customer_Code','Medium_Term_Deposits', 'prediction')
Medium_Term_Depositspredictions = Medium_Term_Depositspredictions.withColumnRenamed("prediction","Rating")
Medium_Term_Depositspredictions = Medium_Term_Depositspredictions.withColumnRenamed("Medium_Term_Deposits","Item")
Medium_Term_Depositspredictions = Medium_Term_Depositspredictions.replace([0,1],[7,7], subset='Item')

In [51]:
Long_Term_Depositspredictions = Long_Term_Depositspredictions.select('Customer_Code','Long_Term_Deposits', 'prediction')
Long_Term_Depositspredictions = Long_Term_Depositspredictions.withColumnRenamed("prediction","Rating")
Long_Term_Depositspredictions = Long_Term_Depositspredictions.withColumnRenamed("Long_Term_Deposits","Item")
Long_Term_Depositspredictions = Long_Term_Depositspredictions.replace([0,1],[8,8], subset='Item')

e_Accountpredictions = e_Accountpredictions.select('Customer_Code','e-Account', 'prediction')
e_Accountpredictions = e_Accountpredictions.withColumnRenamed("prediction","Rating")
e_Accountpredictions = e_Accountpredictions.withColumnRenamed("e-Account","Item")
e_Accountpredictions = e_Accountpredictions.replace([0,1],[9,9], subset='Item')

Fundspredictions = Fundspredictions.select('Customer_Code','Funds', 'prediction')
Fundspredictions = Fundspredictions.withColumnRenamed("Funds","Rating")
Fundspredictions = Fundspredictions.withColumnRenamed("Funds","Item")
Fundspredictions = Fundspredictions.replace([0,1],[10,10], subset='Item')

Mortgagepredictions = Mortgagepredictions.select('Customer_Code','Mortgage', 'prediction')
Mortgagepredictions = Mortgagepredictions.withColumnRenamed("prediction","Rating")
Mortgagepredictions = Mortgagepredictions.withColumnRenamed("Mortgage","Item")
Mortgagepredictions = Mortgagepredictions.replace([0,1],[11,11], subset='Item')

Pensionspredictions = Pensionspredictions.select('Customer_Code','Pensions', 'prediction')
Pensionspredictions = Pensionspredictions.withColumnRenamed("prediction","Rating")
Pensionspredictions = Pensionspredictions.withColumnRenamed("Pensions","Item")
Pensionspredictions = Pensionspredictions.replace([0,1],[12,12], subset='Item')


In [52]:
Loanspredictions = Loanspredictions.select('Customer_Code','Loans', 'prediction')
Loanspredictions = Loanspredictions.withColumnRenamed("prediction","Rating")
Loanspredictions = Loanspredictions.withColumnRenamed("Loans","Item")
Loanspredictions = Loanspredictions.replace([0,1],[13,13], subset='Item')

Taxespredictions = Taxespredictions.select('Customer_Code','Taxes', 'prediction')
Taxespredictions = Taxespredictions.withColumnRenamed("prediction","Rating")
Taxespredictions = Taxespredictions.withColumnRenamed("Taxes","Item")
Taxespredictions = Taxespredictions.replace([0,1],[14,14], subset='Item')

In [53]:
Credit_Cardpredictions = Credit_Cardpredictions.select('Customer_Code','Credit_Card', 'prediction')
Credit_Cardpredictions = Credit_Cardpredictions.withColumnRenamed("prediction","Rating")
Credit_Cardpredictions = Credit_Cardpredictions.withColumnRenamed("Credit_Card","Item")
Credit_Cardpredictions = Credit_Cardpredictions.replace([0,1],[15,15], subset='Item')

Securitiespredictions = Securitiespredictions.select('Customer_Code','Securities', 'prediction')
Securitiespredictions = Securitiespredictions.withColumnRenamed("prediction","Rating")
Securitiespredictions = Securitiespredictions.withColumnRenamed("Securities","Item")
Securitiespredictions = Securitiespredictions.replace([0,1],[16,16], subset='Item')


In [54]:
Home_Accountpredictions = Home_Accountpredictions.select('Customer_Code','Home_Account', 'prediction')
Home_Accountpredictions = Home_Accountpredictions.withColumnRenamed("prediction","Rating")
Home_Accountpredictions = Home_Accountpredictions.withColumnRenamed("Home_Account","Item")
Home_Accountpredictions = Home_Accountpredictions.replace([0,1],[17,17], subset='Item')

Payrollpredictions = Payrollpredictions.select('Customer_Code','Payroll', 'prediction')
Payrollpredictions = Payrollpredictions.withColumnRenamed("prediction","Rating")
Payrollpredictions = Payrollpredictions.withColumnRenamed("Payroll","Item")
Payrollpredictions = Payrollpredictions.replace([0,1],[18,18], subset='Item')

Pensions_twopredictions = Pensions_twopredictions.select('Customer_Code','Pensions_two', 'prediction')
Pensions_twopredictions = Pensions_twopredictions.withColumnRenamed("prediction","Rating")
Pensions_twopredictions = Pensions_twopredictions.withColumnRenamed("Pensions_two","Item")
Pensions_twopredictions = Pensions_twopredictions.replace([0,1],[19,19], subset='Item')

Direct_Debitpredictions = Direct_Debitpredictions.select('Customer_Code','Direct_Debit', 'prediction')
Direct_Debitpredictions = Direct_Debitpredictions.withColumnRenamed("prediction","Rating")
Direct_Debitpredictions = Direct_Debitpredictions.withColumnRenamed("Direct_Debit","Item")
Direct_Debitpredictions = Direct_Debitpredictions.replace([0,1],[20,20], subset='Item')

In [57]:
allpredictionsdf = Currentpredictions.union(Payroll_Accountpredictions)
allpredictionsdf.show(1)

+-------------+----+------+
|Customer_Code|Item|Rating|
+-------------+----+------+
|      15889.0|   1|   1.0|
+-------------+----+------+
only showing top 1 row



In [58]:
allpredictionsdf.count()

8234369

In [59]:
allpredictionsdf = allpredictionsdf.union(LRpredictions)
allpredictionsdf = allpredictionsdf.union(More_Particular_Accountpredictions)

In [60]:
allpredictionsdf.count()

16469621

In [61]:
#Build Dataframe to bring to next notebook for Collaborative Filtering
allpredictionsdf = allpredictionsdf.union(Particular_Accountpredictions)
allpredictionsdf = allpredictionsdf.union(Particular_Plus_Accountpredictions)
allpredictionsdf = allpredictionsdf.union(Medium_Term_Depositspredictions)
allpredictionsdf = allpredictionsdf.union(Long_Term_Depositspredictions)
allpredictionsdf = allpredictionsdf.union(e_Accountpredictions)
allpredictionsdf = allpredictionsdf.union(Fundspredictions)
allpredictionsdf = allpredictionsdf.union(Mortgagepredictions)
allpredictionsdf = allpredictionsdf.union(Pensionspredictions)
allpredictionsdf = allpredictionsdf.union(Loanspredictions)
allpredictionsdf = allpredictionsdf.union(Taxespredictions)
allpredictionsdf = allpredictionsdf.union(Credit_Cardpredictions)
allpredictionsdf = allpredictionsdf.union(Securitiespredictions)
allpredictionsdf = allpredictionsdf.union(Home_Accountpredictions)
allpredictionsdf = allpredictionsdf.union(Payrollpredictions)
allpredictionsdf = allpredictionsdf.union(Pensions_twopredictions)
allpredictionsdf = allpredictionsdf.union(Direct_Debitpredictions)
allpredictionsdf.count()

82341861

In [63]:
allpredictionsdf.write.csv('santander_df_logisticpred.csv')

In [64]:
allpredictionsdf.printSchema()

root
 |-- Customer_Code: double (nullable = true)
 |-- Item: integer (nullable = true)
 |-- Rating: double (nullable = false)

