### Setting up the paths needed to import PySpark and its required libraries

In [1]:
import os
import sys
# Record where the Spark 2 client lives and where its bundled Python
# libraries are, then put the py4j and pyspark archives on the import path.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# Each archive is inserted at position 0, so the last one listed
# (pyspark.zip) ends up ahead of py4j on sys.path.
for archive_name in ("/py4j-0.10.4-src.zip", "/pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + archive_name)

## Creating the spark Conf Properties and Spark Session




In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the SparkSession used throughout this notebook.
# The builder chain is parenthesised so it is valid Python on its own,
# without backslash continuations or REPL "..." prompt characters.
spark = (
    SparkSession.builder
    .master("local")                     # run in local mode
    .appName("Universal Bank Data Set")  # name shown in the Spark web UI
    .config(conf=SparkConf())            # start from a default SparkConf
    .getOrCreate()                       # reuse an existing session if there is one
)


`appName` sets a name for the application, which will be shown in the Spark web UI.
    

`config` sets a config option. Options set using this method are automatically propagated to both `SparkConf` and the `SparkSession`'s own configuration.

`master` sets the master to connect to: a Spark, Mesos or YARN cluster URL, or the special string "local" to run in local mode.

`getOrCreate()` gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

### Important classes of pyspark.sql  and Data Frames

### Creating the data frame  using Spark 

With a SparkSession, applications can create DataFrames from an existing RDD, from a Hive table, or from Spark data sources.

### Creating DataFrame from the RDD

In [4]:
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [5]:
# Build a SparkConf that carries the application name 'UniversalBank'.
conf = SparkConf().setAppName('UniversalBank')

In [6]:
# Set the master on `conf` to "local".
# NOTE(review): `conf` is not passed to anything in the cells shown here;
# `sc` below is taken from the already-built `spark` session instead.
# Confirm whether `conf` is still needed.
conf = conf.setMaster("local")
# Reuse the SparkContext that belongs to the existing SparkSession.
sc   = spark.sparkContext
# SQLContext wrapped around that context; a later cell runs a SQL query through it.
sqlContext = SQLContext(sc)

In [7]:
# Read the CSV file as an RDD of raw text lines (one string per line).
# NOTE(review): absolute, user-specific local path; consider a configurable data directory.
univ_bankdataRDD = sc.textFile("file:///home/mahidharv/bank.csv")

In [8]:
# Confirm that textFile returned an RDD.
type(univ_bankdataRDD)

pyspark.rdd.RDD

In [9]:
# Peek at the first three raw lines of the file.
univ_bankdataRDD.take(3)

[u'30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no',
 u'33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no',
 u'35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no']

In [10]:
# Split every raw line on commas and turn the resulting fields into a
# DataFrame with one named column per field.
bank_columns = [
    "age", "job", "marital", "education", "default", "balance",
    "housing", "loan", "contact", "day", "month", "duration",
    "campaign", "pdays", "previous", "poutcome", "y",
]
df = univ_bankdataRDD.map(lambda line: line.split(",")).toDF(bank_columns)

In [11]:
# Expose the DataFrame to Spark SQL under the name "bank".
# createOrReplaceTempView is the Spark 2.x API for this (registerTempTable is
# deprecated); it replaces any existing view with the same name, so this
# cell can be re-run safely.
df.createOrReplaceTempView("bank")

In [12]:
# Query the "bank" temp table through the SparkSession, selecting every column and row.
df2 = spark.sql("select * from bank")

In [13]:
# Sanity check: the SQL result holds the same rows as the original DataFrame.
# Note that collect() brings every row of both DataFrames to the driver.
sorted(df.collect()) == sorted(df2.collect())

True

In [14]:
# Print a tabular preview of df2.
df2.show()

+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|          job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30|   unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|     services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35|   management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
| 30|   management|married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199|       4|   -1|       0| unknown| no|
| 59|  blue-collar|married|secondary|     no|      0|    yes|  no| unknown| 

In [15]:
# List (column, type) pairs; the DataFrame was built from split text fields.
df2.dtypes

[('age', 'string'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'string'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'string'),
 ('month', 'string'),
 ('duration', 'string'),
 ('campaign', 'string'),
 ('pdays', 'string'),
 ('previous', 'string'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [16]:
# Confirm that spark.sql returned a DataFrame.
type(df2)

pyspark.sql.dataframe.DataFrame

In [17]:
# Mark df2 for caching so that later actions on it can reuse the stored data.
df2.cache()

DataFrame[age: string, job: string, marital: string, education: string, default: string, balance: string, housing: string, loan: string, contact: string, day: string, month: string, duration: string, campaign: string, pdays: string, previous: string, poutcome: string, y: string]

In [18]:
# Count the rows in df2.
df2.count()

4521

### Creating the data frame from the existing data sources 

In [19]:
# Load the CSV through the DataFrame reader: the file has no header row,
# and Spark is asked to infer each column's type from the data.
univ_bankdata = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("file:///home/mahidharv/bank.csv")
)

In [20]:
# First five rows, returned as Row objects.
univ_bankdata.head(5)

[Row(_c0=30, _c1=u'unemployed', _c2=u'married', _c3=u'primary', _c4=u'no', _c5=1787, _c6=u'no', _c7=u'no', _c8=u'cellular', _c9=19, _c10=u'oct', _c11=79, _c12=1, _c13=-1, _c14=0, _c15=u'unknown', _c16=u'no'),
 Row(_c0=33, _c1=u'services', _c2=u'married', _c3=u'secondary', _c4=u'no', _c5=4789, _c6=u'yes', _c7=u'yes', _c8=u'cellular', _c9=11, _c10=u'may', _c11=220, _c12=1, _c13=339, _c14=4, _c15=u'failure', _c16=u'no'),
 Row(_c0=35, _c1=u'management', _c2=u'single', _c3=u'tertiary', _c4=u'no', _c5=1350, _c6=u'yes', _c7=u'no', _c8=u'cellular', _c9=16, _c10=u'apr', _c11=185, _c12=1, _c13=330, _c14=1, _c15=u'failure', _c16=u'no'),
 Row(_c0=30, _c1=u'management', _c2=u'married', _c3=u'tertiary', _c4=u'no', _c5=1476, _c6=u'yes', _c7=u'yes', _c8=u'unknown', _c9=3, _c10=u'jun', _c11=199, _c12=4, _c13=-1, _c14=0, _c15=u'unknown', _c16=u'no'),
 Row(_c0=59, _c1=u'blue-collar', _c2=u'married', _c3=u'secondary', _c4=u'no', _c5=0, _c6=u'yes', _c7=u'no', _c8=u'unknown', _c9=5, _c10=u'may', _c11=226, _

In [21]:
# Print a tabular preview; columns still have the default _c0.._c16 names.
univ_bankdata.show()

+---+-------------+-------+---------+---+----+---+---+--------+---+----+----+----+----+----+-------+----+
|_c0|          _c1|    _c2|      _c3|_c4| _c5|_c6|_c7|     _c8|_c9|_c10|_c11|_c12|_c13|_c14|   _c15|_c16|
+---+-------------+-------+---------+---+----+---+---+--------+---+----+----+----+----+----+-------+----+
| 30|   unemployed|married|  primary| no|1787| no| no|cellular| 19| oct|  79|   1|  -1|   0|unknown|  no|
| 33|     services|married|secondary| no|4789|yes|yes|cellular| 11| may| 220|   1| 339|   4|failure|  no|
| 35|   management| single| tertiary| no|1350|yes| no|cellular| 16| apr| 185|   1| 330|   1|failure|  no|
| 30|   management|married| tertiary| no|1476|yes|yes| unknown|  3| jun| 199|   4|  -1|   0|unknown|  no|
| 59|  blue-collar|married|secondary| no|   0|yes| no| unknown|  5| may| 226|   1|  -1|   0|unknown|  no|
| 35|   management| single| tertiary| no| 747| no| no|cellular| 23| feb| 141|   2| 176|   3|failure|  no|
| 36|self-employed|married| tertiary| no| 307|

In [22]:
# Confirm that the reader returned a DataFrame.
type(univ_bankdata)

pyspark.sql.dataframe.DataFrame

In [23]:
# List the current (auto-generated) column names.
univ_bankdata.columns

['_c0',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14',
 '_c15',
 '_c16']

In [24]:
# Remember the auto-generated column names so they can be renamed below.
old_names = univ_bankdata.schema.names

In [25]:
# Display the captured column names.
old_names

['_c0',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14',
 '_c15',
 '_c16']

In [26]:
# Meaningful column names, listed in the same order as the CSV fields.
new_names = [
    "age", "job", "marital", "education", "default", "balance",
    "housing", "loan", "contact", "day", "month", "duration",
    "campaign", "pdays", "previous", "poutcome", "y",
]

In [27]:
# Rename all columns positionally in a single call: DataFrame.toDF(*names)
# returns a new DataFrame whose columns carry the given names, in order.
# This needs no `reduce`, so the cell runs on both Python 2 and Python 3.
univ_bankdata = univ_bankdata.toDF(*new_names)

In [28]:
# Show the renamed columns together with the types inferred by the CSV reader.
univ_bankdata.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [29]:
# Print a tabular preview with the new column names.
univ_bankdata.show()

+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|          job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30|   unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|     services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35|   management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
| 30|   management|married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199|       4|   -1|       0| unknown| no|
| 59|  blue-collar|married|secondary|     no|      0|    yes|  no| unknown| 

In [30]:
#sc.stop()

### Basic Functions on the data frame objects

How do we change the data type of an attribute?
Is it possible to specify the data types while creating the DataFrame or reading the data from its location?

In [31]:
# Re-type the columns with Spark SQL, reading from the "bank" temp table.
# NOTE: this rebinds `univ_bankdata` to a new DataFrame built from "bank".
#   - age, day                      -> int
#   - duration, campaign, previous  -> double
#   - balance, pdays                -> double, with negative values replaced by 0
#   - y                             -> int column `Approved`: 0 when y = 'no', otherwise 1
# All other columns are selected unchanged.
univ_bankdata = sqlContext.sql("select cast(age as int) as age,\
                               job,marital,education,default, \
                               (case when (balance < 0) then double(0) else double(balance) end) as balance,\
                               housing,loan,contact,cast(day as int) as day,\
                               month,cast(duration as double) as duration,\
                               cast(campaign as double) as campaign,\
                               (case when (pdays < 0) then double(0) else double(pdays) end) as pdays,\
                               cast(previous as double) as previous,poutcome,\
                               (case when (y = 'no') then int(0) else int(1) end) as Approved from bank")


In [32]:
# Verify the column types produced by the SQL casts.
univ_bankdata.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'double'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'double'),
 ('campaign', 'double'),
 ('pdays', 'double'),
 ('previous', 'double'),
 ('poutcome', 'string'),
 ('Approved', 'int')]

### sort the rows based on the age 

In [33]:
# Order the rows from the oldest customer to the youngest.
sorted_data = univ_bankdata.orderBy(univ_bankdata.age.desc())

In [34]:
# Print a tabular preview of the rows sorted by descending age.
sorted_data.show()

+---+----------+--------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+--------+
|age|       job| marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|Approved|
+---+----------+--------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+--------+
| 87|   retired| married|  primary|     no|  230.0|     no|  no| cellular| 30|  oct|   144.0|     1.0|  0.0|     0.0| unknown|       1|
| 86|   retired| married|secondary|     no| 1503.0|     no|  no|telephone| 18|  mar|   165.0|     3.0|101.0|     1.0|   other|       0|
| 84|   retired|divorced|  primary|     no|  639.0|     no|  no|telephone| 18|  may|   353.0|     3.0|  0.0|     0.0| unknown|       1|
| 83|   retired|divorced|  primary|     no|    0.0|     no|  no|telephone| 31|  may|   664.0|     1.0| 77.0|     3.0| success|       0|
| 83|   retired|divorced|  primary|     no| 1097

### Find the max, min, count, sum, mean, and standard deviation of the balance

In [35]:
from pyspark.sql import functions as F

In [36]:
# Summary statistics of the balance column, computed in a single aggregation:
# minimum, maximum, mean, and population standard deviation.
balance_aggregates = [
    F.min('balance').alias("min-balance"),
    F.max('balance').alias("max-balance"),
    F.mean('balance').alias("avg-balance"),
    F.stddev_pop('balance').alias("stdev-balance"),
]
balance_statistics = univ_bankdata.select('balance').agg(*balance_aggregates)

In [37]:
# Display the one-row table of balance statistics.
balance_statistics.show()

+-----------+-----------+----------------+------------------+
|min-balance|max-balance|     avg-balance|     stdev-balance|
+-----------+-----------+----------------+------------------+
|        0.0|    71188.0|1449.33399690334|2993.3947569080356|
+-----------+-----------+----------------+------------------+



In [38]:
# Built-in summary (count, mean, ...) for every column of the DataFrame.
univ_bankdata.describe().show()

+-------+------------------+-------+--------+---------+-------+----------------+-------+----+--------+------------------+-----+------------------+------------------+-----------------+------------------+--------+-------------------+
|summary|               age|    job| marital|education|default|         balance|housing|loan| contact|               day|month|          duration|          campaign|            pdays|          previous|poutcome|           Approved|
+-------+------------------+-------+--------+---------+-------+----------------+-------+----+--------+------------------+-----+------------------+------------------+-----------------+------------------+--------+-------------------+
|  count|              4521|   4521|    4521|     4521|   4521|            4521|   4521|4521|    4521|              4521| 4521|              4521|              4521|             4521|              4521|    4521|               4521|
|   mean| 41.17009511170095|   null|    null|     null|   null|1449.3339

### Count the number of rows in each class of a categorical attribute

In [39]:
# Number of rows for each distinct value of the `contact` column.
classcount_contact = univ_bankdata.groupby('contact').count()

In [40]:
# Display the per-class counts.
classcount_contact.show()

+---------+-----+
|  contact|count|
+---------+-----+
|  unknown| 1324|
| cellular| 2896|
|telephone|  301|
+---------+-----+



### Repartitioning the data set by a column

In [41]:
# Repartition the DataFrame using the `age` column as the partitioning expression.
data = univ_bankdata.repartition("age")

In [42]:
# Preview ten rows of the repartitioned DataFrame.
data.show(10)

+---+-----------+--------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+--------+
|age|        job| marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved|
+---+-----------+--------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+--------+
| 31|blue-collar| married|secondary|     no|  360.0|    yes| yes|cellular| 29|  jan|    89.0|     1.0|241.0|     1.0| failure|       0|
| 31|   services| married|secondary|     no|  132.0|     no|  no|cellular|  7|  jul|   148.0|     1.0|152.0|     1.0|   other|       0|
| 31| technician| married|secondary|     no|  171.0|     no|  no|cellular| 27|  aug|    81.0|     3.0|  0.0|     0.0| unknown|       0|
| 31|   services| married|secondary|     no|  338.0|    yes|  no|cellular| 28|  jan|   155.0|     1.0|  0.0|     0.0| unknown|       0|
| 31| management|  single| tertiary|     no| 624

### Basic Plots using Plotly

In [43]:
import plotly
from plotly.graph_objs import Histogram , Layout
# Switch plotly to offline notebook rendering, then plot a histogram of the
# balance column. toPandas() collects the whole DataFrame to the driver.
plotly.offline.init_notebook_mode(connected=True)
balance_values = univ_bankdata.toPandas()['balance']
balance_figure = {
    "data": [Histogram(x=balance_values)],
    "layout": Layout(title="Histogram of balance"),
}
plotly.offline.iplot(balance_figure)

### Binning the balance with a user-defined function (UDF) and adding the result as a new column to the data frame

In [44]:
import pyspark.sql.functions as func
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [45]:
def balance_bin(balance):
    """
    Map an account balance to the label of the bin it falls into.

    Bin upper bounds are inclusive:
    up to 500 (incl. zero/negative) -> Bin01
    501     - 1000      -> Bin02
    1001    - 2000      -> Bin03
    2001    - 4000      -> Bin04
    4001    - 6000      -> Bin05
    6001    - 8000      -> Bin06
    8001    - 10000     -> Bin07
    10001   - 20000     -> Bin08
    20001   - 30000     -> Bin09
    30001   - 40000     -> Bin10
    40001   - 50000     -> Bin11
    50001   - 60000     -> Bin12
    60001   - 70000     -> Bin13
    above 70000         -> Bin14

    Parameters:
        balance: a number, a numeric string, or None.

    Returns:
        The bin label as a string ("Bin01" .. "Bin14"), or None when
        `balance` is None so that null inputs stay null instead of raising.

    Raises:
        ValueError: if `balance` is a string that cannot be parsed as a number.
    """
    # Imported locally so the function is self-contained when used as a UDF.
    from bisect import bisect_left

    # Null in, null out: float(None) would otherwise raise TypeError.
    if balance is None:
        return None

    balance_amount = float(balance)
    # Anything that is not strictly positive (including NaN, which fails
    # every comparison) is treated as a zero balance.
    if not balance_amount > 0:
        balance_amount = float(0)

    # Inclusive upper bound of Bin01..Bin13; larger values fall into Bin14.
    upper_bounds = (500, 1000, 2000, 4000, 6000, 8000, 10000,
                    20000, 30000, 40000, 50000, 60000, 70000)
    # bisect_left returns the index of the first bound >= balance_amount,
    # which is exactly the zero-based bin number.
    return "Bin%02d" % (bisect_left(upper_bounds, balance_amount) + 1)
    

In [46]:
# Wrap balance_bin as a Spark UDF whose result is a string column.
udf_balance_bin = udf(balance_bin, StringType())

In [47]:
# Add a `balance_bin` column by applying the UDF to the `balance` column.
new_data = univ_bankdata.withColumn('balance_bin',udf_balance_bin('balance'))

In [48]:
# Preview ten rows, including the new balance_bin column.
new_data.show(10)

+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+--------+-----------+
|age|          job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|Approved|balance_bin|
+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+--------+-----------+
| 30|   unemployed|married|  primary|     no| 1787.0|     no|  no|cellular| 19|  oct|    79.0|     1.0|  0.0|     0.0| unknown|       0|      Bin03|
| 33|     services|married|secondary|     no| 4789.0|    yes| yes|cellular| 11|  may|   220.0|     1.0|339.0|     4.0| failure|       0|      Bin05|
| 35|   management| single| tertiary|     no| 1350.0|    yes|  no|cellular| 16|  apr|   185.0|     1.0|330.0|     1.0| failure|       0|      Bin03|
| 30|   management|married| tertiary|     no| 1476.0|    yes| yes| unknown|  3|  jun|   199.0|     4.0|  0

### How many data points have an age between 25 and 35 (inclusive)?

In [50]:
# Keep balance and age, plus a boolean column that is true when
# 25 <= age <= 35 (Spark names it "((age >= 25) AND (age <= 35))").
agebtw2535 = univ_bankdata.select(univ_bankdata.balance,univ_bankdata.age,univ_bankdata.age.between(25,35))

In [54]:
# Count the rows for each value of the boolean "age between 25 and 35" flag.
# The flag is the last column of agebtw2535 and is looked up by position,
# because its auto-generated name (e.g. "((age >= 25) AND (age <= 35))")
# is produced by Spark and may differ between Spark versions.
agebtw2535.groupby(agebtw2535.columns[-1]).count().show()

+-----------------------------+-----+
|((age >= 25) AND (age <= 35))|count|
+-----------------------------+-----+
|                         true| 1585|
|                        false| 2936|
+-----------------------------+-----+

