In [1]:
from pyspark import SparkContext

In [2]:
sc = SparkContext('local','insurance')  

In [9]:
from pyspark.sql import SQLContext
sql_c=SQLContext(sc)

In [4]:
import pandas as pd

#### RDD- Resilient Distributed Dataset
An RDD is a distributed collection of elements.    
RDDs are immutable.   
Spark automatically distributes the data contained in RDDs across the cluster and parallelizes the operations performed on them.


#### Creating RDDs
1. Parallelize a collection
2. Referencing a dataset

### Parallelize a collection

In [5]:
numbers = range(100)
data_rdd = sc.parallelize(numbers)

In [7]:
# Display the first 10 elements in the RDD
data_rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### Referencing a dataset

In [12]:
df = sql_c.read.csv('insurance.csv').rdd

### Transformations
A Transformation is a function that takes an RDD as input and produces another RDD as output.   
They are not executed immediately- they are executed when an Action is called on that RDD.


#### Map:
The map() method is applied to every element of the RDD

In [15]:
# Copy columns 1 and 2 into the new RDD
data_map=df.map(lambda row: (row[1],row[2]))
data_map.take(5)

[(u'sex', u'bmi'),
 (u'female', u'27.9'),
 (u'male', u'33.77'),
 (u'male', u'33'),
 (u'male', u'22.705')]

#### Filter:
The filter() method is used to select elements from the dataset that fit a specified criteria.   

In [16]:
# Copy only the Female customers into the new RDD
data_filtered=df.filter(lambda row:row[1]=='female')
data_filtered.take(10)

[Row(_c0=u'19', _c1=u'female', _c2=u'27.9', _c3=u'0', _c4=u'yes', _c5=u'southwest', _c6=u'16884.924'),
 Row(_c0=u'31', _c1=u'female', _c2=u'25.74', _c3=u'0', _c4=u'no', _c5=u'southeast', _c6=u'3756.6216'),
 Row(_c0=u'46', _c1=u'female', _c2=u'33.44', _c3=u'1', _c4=u'no', _c5=u'southeast', _c6=u'8240.5896'),
 Row(_c0=u'37', _c1=u'female', _c2=u'27.74', _c3=u'3', _c4=u'no', _c5=u'northwest', _c6=u'7281.5056'),
 Row(_c0=u'60', _c1=u'female', _c2=u'25.84', _c3=u'0', _c4=u'no', _c5=u'northwest', _c6=u'28923.13692'),
 Row(_c0=u'62', _c1=u'female', _c2=u'26.29', _c3=u'0', _c4=u'yes', _c5=u'southeast', _c6=u'27808.7251'),
 Row(_c0=u'56', _c1=u'female', _c2=u'39.82', _c3=u'0', _c4=u'no', _c5=u'southeast', _c6=u'11090.7178'),
 Row(_c0=u'52', _c1=u'female', _c2=u'30.78', _c3=u'1', _c4=u'no', _c5=u'northeast', _c6=u'10797.3362'),
 Row(_c0=u'60', _c1=u'female', _c2=u'36.005', _c3=u'0', _c4=u'no', _c5=u'northeast', _c6=u'13228.84695'),
 Row(_c0=u'30', _c1=u'female', _c2=u'32.4', _c3=u'1', _c4=u'no',

#### Distinct:
It returns a new dataset that contains the distinct elements of the source dataset. It removes duplicate data.

In [19]:
data_distinct=df.distinct()

### Actions
Methods that produce non-RDD values like a result set, value, file, etc

#### Take:
Returns n number of rows from the dataset

In [40]:
data_map.take(10)

[(u'sex', u'bmi'),
 (u'female', u'27.9'),
 (u'male', u'33.77'),
 (u'male', u'33'),
 (u'male', u'22.705'),
 (u'male', u'28.88'),
 (u'female', u'25.74'),
 (u'female', u'33.44'),
 (u'female', u'27.74'),
 (u'male', u'29.83')]

#### Collect:
Returns all the elements in the dataset

In [None]:
data_map.collect()

#### Reduce:
Reduces the elements of an RDD using a specified method. It runs on each partition and returns to the DRIVER node, where it runs again.

In [None]:
print(data_rdd.map(lambda row:row[0]).reduce(lambda x,y:x+y))

#### Count:
Returns the number of elements in an RDD


In [None]:
print data_rdd.count()