In [36]:
df = spark.read.csv('file:///opt/data/txt/adult/adult.data', header=True, inferSchema=True)

In [37]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fn1wgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- martial-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [38]:
df.count()

32561

In [39]:
grpByIncome = df.groupBy('income').count()

In [40]:
grpByIncome.show()

+------+-----+
|income|count|
+------+-----+
|  >50K| 7841|
| <=50K|24720|
+------+-----+



In [41]:
df.describe('age').show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|             32561|
|   mean| 38.58164675532078|
| stddev|13.640432553581356|
|    min|                17|
|    max|                90|
+-------+------------------+



In [42]:
df.describe('capital-gain').show()

+-------+------------------+
|summary|      capital-gain|
+-------+------------------+
|  count|             32561|
|   mean|1077.6488437087312|
| stddev| 7385.292084840354|
|    min|               0.0|
|    max|           99999.0|
+-------+------------------+



In [43]:
df.describe('hours-per-week').show()

+-------+------------------+
|summary|    hours-per-week|
+-------+------------------+
|  count|             32561|
|   mean|40.437455852092995|
| stddev|12.347428681731838|
|    min|               1.0|
|    max|              99.0|
+-------+------------------+



In [44]:
grpByGender = df.groupBy('sex')

In [45]:
type(grpByGender)

pyspark.sql.group.GroupedData

In [46]:
grpByGender.mean('age').show()

+-------+-----------------+
|    sex|         avg(age)|
+-------+-----------------+
|   Male|39.43354749885268|
| Female|36.85823043357163|
+-------+-----------------+



In [47]:
grpByGender.mean('hours-per-week').show()

+-------+-------------------+
|    sex|avg(hours-per-week)|
+-------+-------------------+
|   Male|  42.42808627810923|
| Female| 36.410361154953115|
+-------+-------------------+



In [48]:
grpByGenderIncome = df.groupBy(['income', 'sex'])

In [49]:
grpByGenderIncome.count().show()

+------+-------+-----+
|income|    sex|count|
+------+-------+-----+
|  >50K|   Male| 6662|
|  >50K| Female| 1179|
| <=50K| Female| 9592|
| <=50K|   Male|15128|
+------+-------+-----+



In [50]:
grpByOccupationIncome = df.groupBy(['occupation', 'income'])

In [51]:
grpByOccupationIncome.count().sort(['income', 'count'], ascending=0).show()

+------------------+------+-----+
|        occupation|income|count|
+------------------+------+-----+
|   Exec-managerial|  >50K| 1968|
|    Prof-specialty|  >50K| 1859|
|             Sales|  >50K|  983|
|      Craft-repair|  >50K|  929|
|      Adm-clerical|  >50K|  507|
|  Transport-moving|  >50K|  320|
|      Tech-support|  >50K|  283|
| Machine-op-inspct|  >50K|  250|
|   Protective-serv|  >50K|  211|
|                 ?|  >50K|  191|
|     Other-service|  >50K|  137|
|   Farming-fishing|  >50K|  115|
| Handlers-cleaners|  >50K|   86|
|      Armed-Forces|  >50K|    1|
|   Priv-house-serv|  >50K|    1|
|      Adm-clerical| <=50K| 3263|
|      Craft-repair| <=50K| 3170|
|     Other-service| <=50K| 3158|
|             Sales| <=50K| 2667|
|    Prof-specialty| <=50K| 2281|
+------------------+------+-----+
only showing top 20 rows



In [52]:
df.createOrReplaceTempView('census')

In [54]:
dfAgeIncome = spark.sql('select age, income from census limit 10')

In [55]:
dfAgeIncome.show()

+---+------+
|age|income|
+---+------+
| 39| <=50K|
| 50| <=50K|
| 38| <=50K|
| 53| <=50K|
| 28| <=50K|
| 37| <=50K|
| 49| <=50K|
| 52|  >50K|
| 31|  >50K|
| 42|  >50K|
+---+------+



In [56]:
type(dfAgeIncome)

pyspark.sql.dataframe.DataFrame

In [61]:
avgHoursPerWeekByEducation = spark.sql('select education, round(avg(`hours-per-week`), 2) as avgHoursPerWeek from census group by education order by avgHoursPerWeek')

In [62]:
avgHoursPerWeekByEducation.show()

+-------------+---------------+
|    education|avgHoursPerWeek|
+-------------+---------------+
|         11th|          33.93|
|         12th|          35.78|
|    Preschool|          36.65|
|         10th|          37.05|
|          9th|          38.04|
|      1st-4th|          38.26|
| Some-college|          38.85|
|      5th-6th|           38.9|
|      7th-8th|          39.37|
|   Assoc-acdm|           40.5|
|      HS-grad|          40.58|
|    Assoc-voc|          41.61|
|    Bachelors|          42.61|
|      Masters|          43.84|
|    Doctorate|          46.97|
|  Prof-school|          47.43|
+-------------+---------------+



In [92]:
urlPG9="jdbc:postgresql://us/pysparkbookdb?user=postgres&password=Passw0rd"

In [93]:
dfStudents = spark.read.format('jdbc').options(url=urlPG9, database='pysparkbookdb', dbtable='studenttable').load()

In [94]:
dfStudents.show()

+--------------------+--------------------+------+
|           studentid|                name|gender|
+--------------------+--------------------+------+
|si1              ...|Robin            ...| M    |
|si2              ...|Maria            ...| F    |
|si3              ...|Julie            ...| F    |
|si4              ...|Bob              ...| M    |
|si6              ...|William          ...| M    |
+--------------------+--------------------+------+



In [95]:
from pyspark.sql.functions import trim

In [96]:
dfStudents = dfStudents.select(trim(dfStudents.studentid), trim(dfStudents.name), dfStudents.gender)

In [97]:
dfStudents.show()

+---------------+----------+------+
|trim(studentid)|trim(name)|gender|
+---------------+----------+------+
|            si1|     Robin| M    |
|            si2|     Maria| F    |
|            si3|     Julie| F    |
|            si4|       Bob| M    |
|            si6|   William| M    |
+---------------+----------+------+



In [98]:
dfStudents = dfStudents.withColumnRenamed('trim(studentid)', 'studentID').withColumnRenamed('trim(name)', 'Name').withColumnRenamed('gender', 'Gender')

In [99]:
dfStudents.printSchema()

root
 |-- studentID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)



In [100]:
dfStudents.show()

+---------+-------+------+
|studentID|   Name|Gender|
+---------+-------+------+
|      si1|  Robin| M    |
|      si2|  Maria| F    |
|      si3|  Julie| F    |
|      si4|    Bob| M    |
|      si6|William| M    |
+---------+-------+------+



In [101]:
dfSubjects = sqlContext.read.format('json').load('file:///opt/data/txt/pyspark_recipes/subjects.json')

In [102]:
dfSubjects.show()

+---------+-------+
|studentID|subject|
+---------+-------+
|      si1| Python|
|      si3|   Java|
|      si1|   Java|
|      si2| Python|
|      si3|   Ruby|
|      si4|    C++|
|      si5|      C|
|      si4| Python|
|      si2|   Java|
+---------+-------+



In [103]:
dfJoined = dfSubjects.join(dfStudents, dfSubjects.studentID == dfStudents.studentID, how='inner')

In [104]:
dfJoined.show()

+---------+-------+---------+-----+------+
|studentID|subject|studentID| Name|Gender|
+---------+-------+---------+-----+------+
|      si1|   Java|      si1|Robin| M    |
|      si1| Python|      si1|Robin| M    |
|      si2|   Java|      si2|Maria| F    |
|      si2| Python|      si2|Maria| F    |
|      si3|   Ruby|      si3|Julie| F    |
|      si3|   Java|      si3|Julie| F    |
|      si4| Python|      si4|  Bob| M    |
|      si4|    C++|      si4|  Bob| M    |
+---------+-------+---------+-----+------+



In [105]:
dfJoined = dfJoined.select(dfSubjects.studentID, 'subject', 'Name', 'Gender')

In [106]:
dfJoined.columns

['studentID', 'subject', 'Name', 'Gender']

In [107]:
dfJoined.write.format('json').save('file:///opt/data/tmp/dfJoined')

In [108]:
dfLeftOuter = dfSubjects.join(dfStudents, dfSubjects.studentID==dfStudents.studentID, how='left_outer')

In [109]:
dfLeftOuter.show()

+---------+-------+---------+-----+------+
|studentID|subject|studentID| Name|Gender|
+---------+-------+---------+-----+------+
|      si5|      C|     null| null|  null|
|      si2| Python|      si2|Maria| F    |
|      si2|   Java|      si2|Maria| F    |
|      si4|    C++|      si4|  Bob| M    |
|      si4| Python|      si4|  Bob| M    |
|      si3|   Java|      si3|Julie| F    |
|      si3|   Ruby|      si3|Julie| F    |
|      si1| Python|      si1|Robin| M    |
|      si1|   Java|      si1|Robin| M    |
+---------+-------+---------+-----+------+



In [110]:
dfLeftOuter = dfLeftOuter.select(dfSubjects.studentID, 'subject', 'Name', 'Gender')

In [111]:
props = {'user': 'postgres', 'password': 'Passw0rd'}

In [112]:
dfLeftOuter.write.jdbc(url=urlPG9, table='left_outer', mode='overwrite', properties=props)