# Introduction to Spark

We'll work with a data set containing the names of all of the guests who have appeared on The Daily Show.

### Loading the data set into an RDD

In [1]:
import findspark

In [2]:
# Initialise findspark before pyspark is imported in the next cell
# (findspark's documented role is to make a local Spark install importable).
findspark.init()

In [3]:
import pyspark

In [4]:
# Reuse the already-running SparkContext when this cell is executed again;
# only one SparkContext may be active per kernel, so calling the constructor
# a second time would raise an error.
sc = pyspark.SparkContext.getOrCreate()

In [5]:
# Read the tab-separated file into an RDD with one string element per line.
# The path is relative, so the file must sit in the kernel's working directory.
raw_data = sc.textFile("daily_show.tsv")
# Evaluating the RDD shows only its description, not any rows.
raw_data

daily_show.tsv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [7]:
# Peek at the first five lines; the first one is the header row.
raw_data.take(5)

['YEAR\tGoogleKnowlege_Occupation\tShow\tGroup\tRaw_Guest_List',
 '1999\tactor\t1/11/99\tActing\tMichael J. Fox',
 '1999\tComedian\t1/12/99\tComedy\tSandra Bernhard',
 '1999\ttelevision actress\t1/13/99\tActing\tTracey Ullman',
 '1999\tfilm actress\t1/14/99\tActing\tGillian Anderson']

### Tallying up the number of guests per year

In [8]:
# Split every line on tabs so each record becomes a list of column values.
daily_show = raw_data.map(lambda line: line.split('\t'))
# Evaluating the RDD shows only its description, not the split rows.
daily_show

PythonRDD[4] at RDD at PythonRDD.scala:53

In [9]:
# Count records per value of the first column (the year).
# The header row has not been removed yet, so it is counted under the key 'YEAR'.
tally = (
    daily_show
    .map(lambda fields: (fields[0], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [14]:
# Evaluating the RDD shows only its description; an action such as take() or
# collect() is needed to see the actual counts.
tally

PythonRDD[9] at RDD at PythonRDD.scala:53


In [15]:
# collect() returns every (year, count) pair in a single action, with no
# separate count() pass. The result has one entry per distinct key, so it is
# small enough to bring back to the driver.
tally.collect()

[('YEAR', 1),
 ('2002', 159),
 ('2003', 166),
 ('2004', 164),
 ('2007', 141),
 ('2010', 165),
 ('2011', 163),
 ('2012', 164),
 ('2013', 166),
 ('2014', 163),
 ('2015', 100),
 ('1999', 166),
 ('2000', 169),
 ('2001', 157),
 ('2005', 162),
 ('2006', 161),
 ('2008', 164),
 ('2009', 163)]

In [11]:
def filter_year(line):
    """Return True for data rows and False for the header row.

    `line` is an indexable record of column values; the header row is
    recognised by its first element being the literal 'YEAR'.
    """
    return line[0] != 'YEAR'

In [17]:
# Drop the header row. filter_year already takes a single record and returns
# a boolean, so it is passed to filter() directly.
filtered_daily_show = daily_show.filter(filter_year)

In [18]:
# collect() returns all remaining rows in a single action, with no separate
# count() pass. This brings the whole data set back to the driver; use
# take(n) instead when a quick look at a few rows is enough.
filtered_daily_show.collect()

[['1999', 'actor', '1/11/99', 'Acting', 'Michael J. Fox'],
 ['1999', 'Comedian', '1/12/99', 'Comedy', 'Sandra Bernhard'],
 ['1999', 'television actress', '1/13/99', 'Acting', 'Tracey Ullman'],
 ['1999', 'film actress', '1/14/99', 'Acting', 'Gillian Anderson'],
 ['1999', 'actor', '1/18/99', 'Acting', 'David Alan Grier'],
 ['1999', 'actor', '1/19/99', 'Acting', 'William Baldwin'],
 ['1999', 'Singer-lyricist', '1/20/99', 'Musician', 'Michael Stipe'],
 ['1999', 'model', '1/21/99', 'Media', 'Carmen Electra'],
 ['1999', 'actor', '1/25/99', 'Acting', 'Matthew Lillard'],
 ['1999', 'stand-up comedian', '1/26/99', 'Comedy', 'David Cross'],
 ['1999', 'actress', '1/27/99', 'Acting', 'Yasmine Bleeth'],
 ['1999', 'actor', '1/28/99', 'Acting', 'D. L. Hughley'],
 ['1999', 'television actress', '10/18/99', 'Acting', 'Rebecca Gayheart'],
 ['1999', 'Comedian', '10/19/99', 'Comedy', 'Steven Wright'],
 ['1999', 'actress', '10/20/99', 'Acting', 'Amy Brenneman'],
 ['1999', 'actress', '10/21/99', 'Acting', 'M

### Filtering out guests with unknown profession

In [19]:
# Count appearances per occupation (second column):
#   1. skip rows whose occupation is an empty string,
#   2. lower-case the occupation so differently-cased spellings share a key,
#   3. sum the 1s per key.
# take(10) returns ten of the resulting pairs; they are not sorted by count.
(
    filtered_daily_show
    .filter(lambda row: row[1] != '')
    .map(lambda row: (row[1].lower(), 1))
    .reduceByKey(lambda a, b: a + b)
    .take(10)
)

[('actor', 596),
 ('film actress', 21),
 ('model', 9),
 ('stand-up comedian', 44),
 ('actress', 271),
 ('television personality', 13),
 ('comic', 2),
 ('musician', 19),
 ('film actor', 19),
 ('journalist', 253)]