In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as func

-----

In [2]:
def mapper(line):
    fields = line.split(',')
    return Row(id=int(fields[0]), name=str(fields[1]), \
               age=int(fields[2]), numFriends=int(fields[3]))

In [3]:
# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/17 18:20:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
lines = spark.sparkContext.textFile("fakefriends.csv")
people = lines.map(mapper)

In [5]:
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView("people")

                                                                                

In [6]:
# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

In [7]:
# The results of SQL queries are RDDs and support all the normal RDD operations.
for teen in teenagers.collect():
  print(teen)

Row(id=21, name='Miles', age=19, numFriends=268)
Row(id=52, name='Beverly', age=19, numFriends=269)
Row(id=54, name='Brunt', age=19, numFriends=5)
Row(id=106, name='Beverly', age=18, numFriends=499)
Row(id=115, name='Dukat', age=18, numFriends=397)
Row(id=133, name='Quark', age=19, numFriends=265)
Row(id=136, name='Will', age=19, numFriends=335)
Row(id=225, name='Elim', age=19, numFriends=106)
Row(id=304, name='Will', age=19, numFriends=404)
Row(id=341, name='Data', age=18, numFriends=326)
Row(id=366, name='Keiko', age=19, numFriends=119)
Row(id=373, name='Quark', age=19, numFriends=272)
Row(id=377, name='Beverly', age=18, numFriends=418)
Row(id=404, name='Kasidy', age=18, numFriends=24)
Row(id=409, name='Nog', age=19, numFriends=267)
Row(id=439, name='Data', age=18, numFriends=417)
Row(id=444, name='Keiko', age=18, numFriends=472)
Row(id=492, name='Dukat', age=19, numFriends=36)
Row(id=494, name='Kasidy', age=18, numFriends=194)


In [8]:
# We can also use functions instead of SQL queries:
schemaPeople.groupBy("age").count().orderBy("age").show()

+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
| 23|   10|
| 24|    5|
| 25|   11|
| 26|   17|
| 27|    8|
| 28|   10|
| 29|   12|
| 30|   11|
| 31|    8|
| 32|   11|
| 33|   12|
| 34|    6|
| 35|    8|
| 36|   10|
| 37|    9|
+---+-----+
only showing top 20 rows



In [9]:
spark.stop()

--------

In [10]:
# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [11]:
# DF created with schema
people = spark.read.option("header", "true").option("inferSchema", "true").csv("fakefriends.csv")

In [12]:
people.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- Will: string (nullable = true)
 |-- 33: integer (nullable = true)
 |-- 385: integer (nullable = true)



In [13]:
people.sample(fraction=0.9)

DataFrame[0: int, Will: string, 33: int, 385: int]

--------------

In [14]:
lines = spark.sparkContext.textFile("fakefriends.csv")
people = lines.map(mapper)

In [15]:
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView("people")

                                                                                

In [16]:
schemaPeople

DataFrame[id: bigint, name: string, age: bigint, numFriends: bigint]

In [17]:
friends = schemaPeople.select(schemaPeople.age, schemaPeople.numFriends)

In [18]:
print(friends)

DataFrame[age: bigint, numFriends: bigint]


In [19]:
friends.groupBy("age").avg("numFriends").show()

+---+------------------+
|age|   avg(numFriends)|
+---+------------------+
| 26|242.05882352941177|
| 29|215.91666666666666|
| 65|             298.2|
| 54| 278.0769230769231|
| 19|213.27272727272728|
| 22|206.42857142857142|
| 34|             245.5|
| 50|             254.6|
| 57| 258.8333333333333|
| 43|230.57142857142858|
| 32| 207.9090909090909|
| 31|            267.25|
| 39|169.28571428571428|
| 25|197.45454545454547|
| 68|             269.6|
| 58|116.54545454545455|
| 27|           228.125|
| 63|             384.0|
| 56| 306.6666666666667|
| 51|302.14285714285717|
+---+------------------+
only showing top 20 rows



In [20]:
friends.groupBy("age").avg("numFriends").sort("age").show()

+---+------------------+
|age|   avg(numFriends)|
+---+------------------+
| 18|           343.375|
| 19|213.27272727272728|
| 20|             165.0|
| 21|           350.875|
| 22|206.42857142857142|
| 23|             246.3|
| 24|             233.8|
| 25|197.45454545454547|
| 26|242.05882352941177|
| 27|           228.125|
| 28|             209.1|
| 29|215.91666666666666|
| 30| 235.8181818181818|
| 31|            267.25|
| 32| 207.9090909090909|
| 33| 325.3333333333333|
| 34|             245.5|
| 35|           211.625|
| 36|             246.6|
| 37|249.33333333333334|
+---+------------------+
only showing top 20 rows



In [21]:
friends.groupBy("age").agg(func.round(func.avg("numFriends"),2).alias("avg of friends")).sort("age").show()

+---+--------------+
|age|avg of friends|
+---+--------------+
| 18|        343.38|
| 19|        213.27|
| 20|         165.0|
| 21|        350.88|
| 22|        206.43|
| 23|         246.3|
| 24|         233.8|
| 25|        197.45|
| 26|        242.06|
| 27|        228.13|
| 28|         209.1|
| 29|        215.92|
| 30|        235.82|
| 31|        267.25|
| 32|        207.91|
| 33|        325.33|
| 34|         245.5|
| 35|        211.63|
| 36|         246.6|
| 37|        249.33|
+---+--------------+
only showing top 20 rows



In [22]:
from pyspark.sql.types import IntegerType

In [23]:
def square(x):
    return x*x

In [24]:
spark.udf.register("square", square, IntegerType())

<function __main__.square(x)>

In [26]:
# df = spark.sql("select square(age) from friends.show()")

-------

In [27]:
inputDf = spark.read.text('book.txt')

In [31]:
words = inputDf.select(func.explode(func.split(inputDf.value, '\\W+')).alias("word"))

In [32]:
words.filter(words.word != "")

DataFrame[word: string]

In [37]:
lower = words.select(func.lower(words.word).alias("word"))

In [43]:
sorted_word = lower.groupBy("word").count().sort("count")

In [44]:
sorted_word.show()

+-------------+-----+
|         word|count|
+-------------+-----+
|          125|    1|
| manipulation|    1|
|       graphs|    1|
|indoctrinated|    1|
|       column|    1|
|    traveling|    1|
|     slightly|    1|
| inflammatory|    1|
|   variations|    1|
|       spared|    1|
|          800|    1|
|    indicator|    1|
|        hires|    1|
|           07|    1|
|   surrounded|    1|
|     retailer|    1|
|          fax|    1|
|   afterwards|    1|
|        boost|    1|
|    directors|    1|
+-------------+-----+
only showing top 20 rows



-----------

In [45]:
from pyspark.sql import SparkSession, functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [47]:
schema = StructType([
    StructField("id", StringType(), True),
    StructField("date", IntegerType(), True),
    StructField("measure", StringType(), True),
    StructField("temp", FloatType(), True)
])

In [50]:
df = spark.read.schema(schema).csv("1800.csv")

In [51]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure: string (nullable = true)
 |-- temp: float (nullable = true)



In [52]:
minTemp = df.filter(df.measure == "TMIN")

In [55]:
stationTemp = minTemp.select("id", "temp")

In [60]:
stationTemp.groupBy("id").min("temp").show()

+-----------+---------+
|         id|min(temp)|
+-----------+---------+
|ITE00100554|   -148.0|
|EZE00100082|   -135.0|
+-----------+---------+



In [61]:
spark.stop()