In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import col

# generate RDD

In [2]:
# Read /etc/passwd through a file:// URI and split every record on ":".
# A passwd record has 7 colon-separated fields:
#   username:password:userId:groupId:userIdInfo:homeDirectory:commandShell
PASSWD_FIELD_COUNT = 7

localFile = sc.textFile("file:///etc/passwd")
rdd = (
    localFile
    # Some systems ship /etc/passwd with blank lines or "#" comment headers;
    # skip them instead of letting them reach the field indexing downstream.
    .filter(lambda line: len(line.strip()) > 0 and not line.lstrip().startswith("#"))
    .map(lambda line : line.split(":"))
    # Keep only records with every expected field, so indexing fields 0..6
    # later cannot raise IndexError on a truncated line.
    .filter(lambda fields : len(fields) >= PASSWD_FIELD_COUNT)
)
rdd.take(5)

[[u'root', u'x', u'0', u'0', u'root', u'/root', u'/bin/bash'],
 [u'bin', u'x', u'1', u'1', u'bin', u'/bin', u'/sbin/nologin'],
 [u'daemon', u'x', u'2', u'2', u'daemon', u'/sbin', u'/sbin/nologin'],
 [u'adm', u'x', u'3', u'4', u'adm', u'/var/adm', u'/sbin/nologin'],
 [u'lp', u'x', u'4', u'7', u'lp', u'/var/spool/lpd', u'/sbin/nologin']]

# generate DataFrame

In [3]:
def toUserRow(fields):
    """Build a Row with named columns from one split /etc/passwd record.

    `fields` is the list produced by splitting a passwd line on ":".
    Fields 2 and 3 (user id and group id) are converted to int; every other
    field is kept exactly as read from the file.
    """
    return Row(
        username = fields[0],
        password = fields[1],
        userId = int(fields[2]),
        groupId = int(fields[3]),
        userIdInfo = fields[4],
        homeDirectory = fields[5],
        commandShell = fields[6]
    )

rows = rdd.map(toUserRow)
rows.take(5)

[Row(commandShell=u'/bin/bash', groupId=0, homeDirectory=u'/root', password=u'x', userId=0, userIdInfo=u'root', username=u'root'),
 Row(commandShell=u'/sbin/nologin', groupId=1, homeDirectory=u'/bin', password=u'x', userId=1, userIdInfo=u'bin', username=u'bin'),
 Row(commandShell=u'/sbin/nologin', groupId=2, homeDirectory=u'/sbin', password=u'x', userId=2, userIdInfo=u'daemon', username=u'daemon'),
 Row(commandShell=u'/sbin/nologin', groupId=4, homeDirectory=u'/var/adm', password=u'x', userId=3, userIdInfo=u'adm', username=u'adm'),
 Row(commandShell=u'/sbin/nologin', groupId=7, homeDirectory=u'/var/spool/lpd', password=u'x', userId=4, userIdInfo=u'lp', username=u'lp')]

In [4]:
# Wrap the existing SparkContext in a SQLContext, which is used below to
# create the DataFrame.
sqlContext = SQLContext(sc)
# Build a DataFrame from the RDD of Rows. No explicit schema is passed, so the
# column names and types are taken from the Row fields.
df = sqlContext.createDataFrame(rows)
# Print the resulting column names, types and nullability.
df.printSchema()

root
 |-- commandShell: string (nullable = true)
 |-- groupId: long (nullable = true)
 |-- homeDirectory: string (nullable = true)
 |-- password: string (nullable = true)
 |-- userId: long (nullable = true)
 |-- userIdInfo: string (nullable = true)
 |-- username: string (nullable = true)



In [5]:
# Display the first 5 rows as a formatted text table.
df.show(5)

+-------------+-------+--------------+--------+------+----------+--------+
| commandShell|groupId| homeDirectory|password|userId|userIdInfo|username|
+-------------+-------+--------------+--------+------+----------+--------+
|    /bin/bash|      0|         /root|       x|     0|      root|    root|
|/sbin/nologin|      1|          /bin|       x|     1|       bin|     bin|
|/sbin/nologin|      2|         /sbin|       x|     2|    daemon|  daemon|
|/sbin/nologin|      4|      /var/adm|       x|     3|       adm|     adm|
|/sbin/nologin|      7|/var/spool/lpd|       x|     4|        lp|      lp|
+-------------+-------+--------------+--------+------+----------+--------+
only showing top 5 rows



# select

In [6]:
# Project columns by name (plain strings), listed in /etc/passwd field order
# rather than the DataFrame's own column order.
df.select("username", "password", "userId", "groupId", "userIdInfo", "homeDirectory", "commandShell").show(5)

+--------+--------+------+-------+----------+--------------+-------------+
|username|password|userId|groupId|userIdInfo| homeDirectory| commandShell|
+--------+--------+------+-------+----------+--------------+-------------+
|    root|       x|     0|      0|      root|         /root|    /bin/bash|
|     bin|       x|     1|      1|       bin|          /bin|/sbin/nologin|
|  daemon|       x|     2|      2|    daemon|         /sbin|/sbin/nologin|
|     adm|       x|     3|      4|       adm|      /var/adm|/sbin/nologin|
|      lp|       x|     4|      7|        lp|/var/spool/lpd|/sbin/nologin|
+--------+--------+------+-------+----------+--------------+-------------+
only showing top 5 rows



In [7]:
# Project columns using attribute-style Column references (df.<name>).
df.select(df.username, df.password, df.userId, df.groupId, df.userIdInfo, df.homeDirectory, df.commandShell).show(5)

+--------+--------+------+-------+----------+--------------+-------------+
|username|password|userId|groupId|userIdInfo| homeDirectory| commandShell|
+--------+--------+------+-------+----------+--------------+-------------+
|    root|       x|     0|      0|      root|         /root|    /bin/bash|
|     bin|       x|     1|      1|       bin|          /bin|/sbin/nologin|
|  daemon|       x|     2|      2|    daemon|         /sbin|/sbin/nologin|
|     adm|       x|     3|      4|       adm|      /var/adm|/sbin/nologin|
|      lp|       x|     4|      7|        lp|/var/spool/lpd|/sbin/nologin|
+--------+--------+------+-------+----------+--------------+-------------+
only showing top 5 rows



In [8]:
# Project columns using bracket-style Column references (df["<name>"]).
df.select(df["username"], df["password"], df["userId"], df["groupId"], df["userIdInfo"], df["homeDirectory"], df["commandShell"]).show(5)

+--------+--------+------+-------+----------+--------------+-------------+
|username|password|userId|groupId|userIdInfo| homeDirectory| commandShell|
+--------+--------+------+-------+----------+--------------+-------------+
|    root|       x|     0|      0|      root|         /root|    /bin/bash|
|     bin|       x|     1|      1|       bin|          /bin|/sbin/nologin|
|  daemon|       x|     2|      2|    daemon|         /sbin|/sbin/nologin|
|     adm|       x|     3|      4|       adm|      /var/adm|/sbin/nologin|
|      lp|       x|     4|      7|        lp|/var/spool/lpd|/sbin/nologin|
+--------+--------+------+-------+----------+--------------+-------------+
only showing top 5 rows



In [9]:
# Derive a column from an expression: userId + 100, named "newUserId" with
# alias(). The source column is referenced attribute-style.
df.select((df.userId + 100).alias("newUserId")).show(5)

+---------+
|newUserId|
+---------+
|      100|
|      101|
|      102|
|      103|
|      104|
+---------+
only showing top 5 rows



In [10]:
# Derive a column from an expression: userId + 100, named "newUserId" with
# alias(). The source column is referenced bracket-style.
df.select((df["userId"] + 100).alias("newUserId")).show(5)

+---------+
|newUserId|
+---------+
|      100|
|      101|
|      102|
|      103|
|      104|
+---------+
only showing top 5 rows



# filter

In [11]:
# Chain two SQL-expression filters: a row is kept only if it passes both
# (userId below 5, then an even groupId).
df.filter("userId < 5").filter("groupId % 2 = 0").show()

+-------------+-------+-------------+--------+------+----------+--------+
| commandShell|groupId|homeDirectory|password|userId|userIdInfo|username|
+-------------+-------+-------------+--------+------+----------+--------+
|    /bin/bash|      0|        /root|       x|     0|      root|    root|
|/sbin/nologin|      2|        /sbin|       x|     2|    daemon|  daemon|
|/sbin/nologin|      4|     /var/adm|       x|     3|       adm|     adm|
+-------------+-------+-------------+--------+------+----------+--------+



In [12]:
# Express both conditions in a single SQL-expression string joined by "and".
df.filter("userId < 5 and groupId % 2 = 0").show()

+-------------+-------+-------------+--------+------+----------+--------+
| commandShell|groupId|homeDirectory|password|userId|userIdInfo|username|
+-------------+-------+-------------+--------+------+----------+--------+
|    /bin/bash|      0|        /root|       x|     0|      root|    root|
|/sbin/nologin|      2|        /sbin|       x|     2|    daemon|  daemon|
|/sbin/nologin|      4|     /var/adm|       x|     3|       adm|     adm|
+-------------+-------+-------------+--------+------+----------+--------+



In [13]:
# Column-expression form of the filter. Each comparison is parenthesized
# because Python's "&" binds more tightly than "<" and "==".
df.filter((df.userId < 5) & (df.groupId % 2 == 0)).show()

+-------------+-------+-------------+--------+------+----------+--------+
| commandShell|groupId|homeDirectory|password|userId|userIdInfo|username|
+-------------+-------+-------------+--------+------+----------+--------+
|    /bin/bash|      0|        /root|       x|     0|      root|    root|
|/sbin/nologin|      2|        /sbin|       x|     2|    daemon|  daemon|
|/sbin/nologin|      4|     /var/adm|       x|     3|       adm|     adm|
+-------------+-------+-------------+--------+------+----------+--------+



In [14]:
# Column-expression form of the filter with bracket-style column references.
# Each comparison is parenthesized because "&" binds more tightly than "<"
# and "==" in Python.
df.filter((df["userId"] < 5) & (df["groupId"] % 2 == 0)).show()

+-------------+-------+-------------+--------+------+----------+--------+
| commandShell|groupId|homeDirectory|password|userId|userIdInfo|username|
+-------------+-------+-------------+--------+------+----------+--------+
|    /bin/bash|      0|        /root|       x|     0|      root|    root|
|/sbin/nologin|      2|        /sbin|       x|     2|    daemon|  daemon|
|/sbin/nologin|      4|     /var/adm|       x|     3|       adm|     adm|
+-------------+-------+-------------+--------+------+----------+--------+



# orderBy

In [15]:
# Sort by userId, highest first, and show the top 5 rows. `ascending` is
# documented as a boolean, so pass False rather than the integer 0.
df.orderBy("userId", ascending = False).show(5)

+-------------+-------+------------------+--------+------+------------------+-----------------+
| commandShell|groupId|     homeDirectory|password|userId|        userIdInfo|         username|
+-------------+-------+------------------+--------+------+------------------+-----------------+
|/sbin/nologin|  65534|      /var/lib/nfs|       x| 65534|Anonymous NFS User|        nfsnobody|
|    /bin/bash|   1000|/home/hsiehpinghan|       x|  1000|      hsiehpinghan|     hsiehpinghan|
|/sbin/nologin|    997|                 /|       x|   999| systemd Bus Proxy|systemd-bus-proxy|
|/sbin/nologin|    996|                 /|       x|   998|  User for polkitd|          polkitd|
|/sbin/nologin|    995|   /var/lib/chrony|       x|   997|                  |           chrony|
+-------------+-------+------------------+--------+------+------------------+-----------------+
only showing top 5 rows



In [16]:
# Sort by userId, highest first, using the Column's desc() sort expression.
df.orderBy(df.userId.desc()).show(5)

+-------------+-------+------------------+--------+------+------------------+-----------------+
| commandShell|groupId|     homeDirectory|password|userId|        userIdInfo|         username|
+-------------+-------+------------------+--------+------+------------------+-----------------+
|/sbin/nologin|  65534|      /var/lib/nfs|       x| 65534|Anonymous NFS User|        nfsnobody|
|    /bin/bash|   1000|/home/hsiehpinghan|       x|  1000|      hsiehpinghan|     hsiehpinghan|
|/sbin/nologin|    997|                 /|       x|   999| systemd Bus Proxy|systemd-bus-proxy|
|/sbin/nologin|    996|                 /|       x|   998|  User for polkitd|          polkitd|
|/sbin/nologin|    995|   /var/lib/chrony|       x|   997|                  |           chrony|
+-------------+-------+------------------+--------+------+------------------+-----------------+
only showing top 5 rows



In [17]:
# Sort on two keys: password descending, then commandShell ascending. The
# `ascending` list pairs positionally with the column list and is documented
# as a list of booleans, so use False/True rather than 0/1.
df.orderBy(["password", "commandShell"], ascending = [False, True]).show(5)

+------------+-------+------------------+--------+------+-----------------+------------+
|commandShell|groupId|     homeDirectory|password|userId|       userIdInfo|    username|
+------------+-------+------------------+--------+------+-----------------+------------+
|   /bin/bash|   1000|/home/hsiehpinghan|       x|  1000|     hsiehpinghan|hsiehpinghan|
|   /bin/bash|      0|             /root|       x|     0|             root|        root|
|   /bin/bash|     26|    /var/lib/pgsql|       x|    26|PostgreSQL Server|    postgres|
|   /bin/sync|      0|             /sbin|       x|     5|             sync|        sync|
|  /sbin/halt|      0|             /sbin|       x|     7|             halt|        halt|
+------------+-------+------------------+--------+------+-----------------+------------+
only showing top 5 rows



In [18]:
# Sort on two keys using per-column sort expressions: password descending,
# then commandShell ascending.
df.orderBy(df.password.desc(), df.commandShell.asc()).show(5)

+------------+-------+------------------+--------+------+-----------------+------------+
|commandShell|groupId|     homeDirectory|password|userId|       userIdInfo|    username|
+------------+-------+------------------+--------+------+-----------------+------------+
|   /bin/bash|   1000|/home/hsiehpinghan|       x|  1000|     hsiehpinghan|hsiehpinghan|
|   /bin/bash|      0|             /root|       x|     0|             root|        root|
|   /bin/bash|     26|    /var/lib/pgsql|       x|    26|PostgreSQL Server|    postgres|
|   /bin/sync|      0|             /sbin|       x|     5|             sync|        sync|
|  /sbin/halt|      0|             /sbin|       x|     7|             halt|        halt|
+------------+-------+------------------+--------+------+-----------------+------------+
only showing top 5 rows



# distinct

In [19]:
# List the distinct login shells: project the single column by name, then
# drop duplicate rows.
df.select("commandShell").distinct().show()

+--------------+
|  commandShell|
+--------------+
| /sbin/nologin|
|/sbin/shutdown|
|    /sbin/halt|
|     /bin/sync|
|     /bin/bash|
+--------------+



In [20]:
# List the distinct login shells: project the single column through an
# attribute-style Column reference, then drop duplicate rows.
df.select(df.commandShell).distinct().show()

+--------------+
|  commandShell|
+--------------+
| /sbin/nologin|
|/sbin/shutdown|
|    /sbin/halt|
|     /bin/sync|
|     /bin/bash|
+--------------+



# groupBy

In [21]:
# Count rows per distinct password value, grouping by column name. No select()
# is needed first: groupBy().count() already outputs only the grouping column
# and the count.
df.groupBy("password").count().show()

+--------+-----+
|password|count|
+--------+-----+
|       x|   44|
+--------+-----+



In [22]:
# Count rows per distinct password value, grouping by an attribute-style
# Column reference. No select() is needed first: groupBy().count() already
# outputs only the grouping column and the count.
df.groupBy(df.password).count().show()

+--------+-----+
|password|count|
+--------+-----+
|       x|   44|
+--------+-----+



# crosstab

In [23]:
# Contingency table of commandShell against password: one row per distinct
# commandShell value and one count column per distinct password value.
# show(5) prints at most 5 rows of it.
df.stat.crosstab("commandShell", "password").show(5)

+---------------------+---+
|commandShell_password|  x|
+---------------------+---+
|        /sbin/nologin| 38|
|            /bin/bash|  3|
|            /bin/sync|  1|
|           /sbin/halt|  1|
|       /sbin/shutdown|  1|
+---------------------+---+



# join

In [24]:
# Self-join on userId. Each side gets its own alias so the join condition
# names two distinct inputs: with `otherDf = df`, the expression
# `df.userId == otherDf.userId` compares a column with itself, and the
# duplicated column names in the result cannot be told apart afterwards.
# With aliases, result columns can be addressed as "base.<name>" and
# "other.<name>".
otherDf = df.alias("other")
joinedDf = df.alias("base").join(
    otherDf,
    col("base.userId") == col("other.userId"),
    "left_outer"
)
# The schema lists every column twice: first the "base" side, then "other".
joinedDf.printSchema()

root
 |-- commandShell: string (nullable = true)
 |-- groupId: long (nullable = true)
 |-- homeDirectory: string (nullable = true)
 |-- password: string (nullable = true)
 |-- userId: long (nullable = true)
 |-- userIdInfo: string (nullable = true)
 |-- username: string (nullable = true)
 |-- commandShell: string (nullable = true)
 |-- groupId: long (nullable = true)
 |-- homeDirectory: string (nullable = true)
 |-- password: string (nullable = true)
 |-- userId: long (nullable = true)
 |-- userIdInfo: string (nullable = true)
 |-- username: string (nullable = true)

