In [1]:
## Set Python - Spark environment.
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.6-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [2]:
## Create SparkContext, SparkSession
from os.path import expanduser, join, abspath

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark import SparkContext
sc = SparkContext()

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs:///apps/hive/warehouse/'

spark = SparkSession \
    .builder \
    .appName("Spark Machine Learning Example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()

In [3]:
spark


In [51]:
bankdata = spark.read.format("csv")\
        .option("header", "false")\
        .option("inferSchema", "true")\
        .load("file:///home/2052B43/bank_data.csv")

In [52]:
bankdata.show(5)

+---+-----------+-------+---------+---+----+---+---+--------+---+----+----+----+----+----+-------+----+
|_c0|        _c1|    _c2|      _c3|_c4| _c5|_c6|_c7|     _c8|_c9|_c10|_c11|_c12|_c13|_c14|   _c15|_c16|
+---+-----------+-------+---------+---+----+---+---+--------+---+----+----+----+----+----+-------+----+
| 30| unemployed|married|  primary| no|1787| no| no|cellular| 19| oct|  79|   1|  -1|   0|unknown|  no|
| 33|   services|married|secondary| no|4789|yes|yes|cellular| 11| may| 220|   1| 339|   4|failure|  no|
| 35| management| single| tertiary| no|1350|yes| no|cellular| 16| apr| 185|   1| 330|   1|failure|  no|
| 30| management|married| tertiary| no|1476|yes|yes| unknown|  3| jun| 199|   4|  -1|   0|unknown|  no|
| 59|blue-collar|married|secondary| no|   0|yes| no| unknown|  5| may| 226|   1|  -1|   0|unknown|  no|
+---+-----------+-------+---------+---+----+---+---+--------+---+----+----+----+----+----+-------+----+
only showing top 5 rows



In [53]:
bankdata = bankdata.selectExpr("_c0 as age", "_c1 as job", "_c2 as marital_status", "_c3 as education","_c4 as default","_c5 as balance","_c6 as housing","_c7 as loan","_c8 as contact","_c9 as day","_c10 as month","_c11 as duration","_c12 as campaign","_c13 as pdays","_c14 as previous","_c15 as poutcome","_c16 as label")

In [54]:
bankdata.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital_status', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('label', 'string')]

In [55]:
## Description of the data
bankdata.describe().show()

+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+-----+
|summary|               age|    job|marital_status|education|default|           balance|housing|loan| contact|               day|month|          duration|          campaign|             pdays|          previous|poutcome|label|
+-------+------------------+-------+--------------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+-----+
|  count|              4521|   4521|          4521|     4521|   4521|              4521|   4521|4521|    4521|              4521| 4521|              4521|              4521|              4521|              4521|    4521| 4521|
|   mean| 41.17009511170095|   null|          null|     null|   null|1422.6578190665782|   n

In [56]:
bankdata.describe('balance').show()

+-------+------------------+
|summary|           balance|
+-------+------------------+
|  count|              4521|
|   mean|1422.6578190665782|
| stddev|3009.6381424673395|
|    min|             -3313|
|    max|             71188|
+-------+------------------+



In [18]:
bankdata.createOrReplaceTempView("bankdataTable")

In [57]:
from pyspark.sql.functions import when, col
bankdata= bankdata.withColumn("balance", when(bankdata["balance"]<0,0).otherwise(bankdata["balance"]))

In [58]:
bankdata.describe('balance').show()

+-------+----------------+
|summary|         balance|
+-------+----------------+
|  count|            4521|
|   mean|1449.33399690334|
| stddev|2993.72586633407|
|    min|               0|
|    max|           71188|
+-------+----------------+



In [60]:
bankdata.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital_status', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('label', 'string')]

In [61]:
cat = ('job','marital_status','education','default','balance','housing','loan','contact','month','poutcome','label')

In [63]:
from pyspark.ml.feature import StringIndexer
for col in cat:
    indexer = StringIndexer(inputCol=col, outputCol=col+"Index" )
    indexerModel1 = indexer.fit(bankdata) 
    bankdata = indexerModel1.transform(bankdata)
    bankdata.show(5)

+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+
|age|          job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|
+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+
| 30|   unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|
| 33|     services|       married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure|   no|     4.0|
| 35|   management|        single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure|   no|     0.0|
| 30|   management|       married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199

+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+
|age|          job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|marital_statusIndex|educationIndex|defaultIndex|
+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+
| 30|   unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|                0.0|           2.0|         0.0|
| 33|     services|       married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure|   no|     4.0|                0.0|           0.0|         0.0|
| 35|   ma

+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+
|age|          job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|marital_statusIndex|educationIndex|defaultIndex|balanceIndex|housingIndex|
+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+
| 30|   unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|                0.0|           2.0|         0.0|       520.0|         1.0|
| 33|     services|       married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|   

+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+
|age|          job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|marital_statusIndex|educationIndex|defaultIndex|balanceIndex|housingIndex|loanIndex|contactIndex|
+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+
| 30|   unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|                0.0|           2.0|         0.0|       520.0|         1.0|      0.0|         0.0|
| 33|     se

+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+----------+-------------+
|age|          job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|marital_statusIndex|educationIndex|defaultIndex|balanceIndex|housingIndex|loanIndex|contactIndex|monthIndex|poutcomeIndex|
+---+-------------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+----------+-------------+
| 30|   unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|                0.0|           2.0|

In [65]:
  bankdata.show(1)

+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+----------+-------------+----------+
|age|       job|marital_status|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|label|jobIndex|marital_statusIndex|educationIndex|defaultIndex|balanceIndex|housingIndex|loanIndex|contactIndex|monthIndex|poutcomeIndex|labelIndex|
+---+----------+--------------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+-----+--------+-------------------+--------------+------------+------------+------------+---------+------------+----------+-------------+----------+
| 30|unemployed|       married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown|   no|     8.0|              