In [1]:
# sc is an existing SparkContext.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

In [3]:
# Loding data 

likesRDD = sc.textFile("/Users/pravinkumar/Documents/Spark/testData/Social-Network\ Query\ Exercises/Likes")
highSchoolerRDD = sc.textFile("/Users/pravinkumar/Documents/Spark/testData/Social-Network\ Query\ Exercises/Highschooler")
friendRDD = sc.textFile("/Users/pravinkumar/Documents/Spark/testData/Social-Network\ Query\ Exercises/Friend")

# Converting RDD to DF
likesDF = likesRDD.map(lambda rec: rec.split("\t")).map(lambda rec: Row(ID1 = rec[0], ID2 = rec[1])).toDF()
likesDF.limit(2).show()
highSchoolerDF = highSchoolerRDD.map(lambda rec: rec.split("\t")).map(lambda rec: Row(ID = rec[0], name = rec[1], grade = rec[2])).toDF()
highSchoolerDF.limit(2).show()
friendDF = friendRDD.map(lambda rec: rec.split("\t")).map(lambda rec: Row(ID1 = rec[0], ID2 = rec[1])).toDF()
friendDF.limit(2).show()

# Registering DF as Temp Table so we can SQL query on them
likesDF.registerTempTable("likes")
highSchoolerDF.registerTempTable("highschooler")
friendDF.registerTempTable("friend")


+----+----+
| ID1| ID2|
+----+----+
|1689|1709|
|1709|1689|
+----+----+

+----+-----+-------+
|  ID|grade|   name|
+----+-----+-------+
|1510|    9| Jordan|
|1689|    9|Gabriel|
+----+-----+-------+

+----+----+
| ID1| ID2|
+----+----+
|1510|1381|
|1510|1689|
+----+----+



In [4]:
# Query 01
# Find the names of all students who are friends with someone named Gabriel. 

highSchoolerTrim = highSchoolerRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[0], rec[1]))
highSchoolerTrimSwap = highSchoolerRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[1], rec[0]))
friendTrim = friendRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[0], rec[1]))

GabrielList = highSchoolerTrimSwap.lookup('Gabriel')
friendGabrielTrim = friendTrim.filter(lambda rec: rec[0] in GabrielList or rec[1] in GabrielList)
friendsList = friendGabrielTrim.map(lambda rec: rec[1] if rec[0] in GabrielList else rec[0]).toLocalIterator()
friendsList = list(friendsList)
Query01 = highSchoolerTrim.filter(lambda rec: rec[0] in friendsList).map(lambda rec: rec[1])

for i in Query01.collect(): print(i)


sqlContext.sql("select h.name from highschooler h where h.ID in (select f.ID2 from friend f, highschooler h where \
h.ID = f.ID1 and h.name = 'Gabriel') and h.ID in (select f.ID1 from friend f, highschooler h where \
h.ID = f.ID2 and h.name = 'Gabriel')").show()

Jordan
Cassandra
Andrew
Alexis
Jessica
+---------+
|     name|
+---------+
|   Jordan|
|  Jessica|
|   Andrew|
|   Alexis|
|Cassandra|
+---------+



In [18]:
# Query 02
# For every student who likes someone 2 or more grades younger than themselves, 
# return that student's name and grade, and the name and grade of the student they like.

highSchoolerTrim = highSchoolerRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[0], (rec[1], rec[2])))
likesTrim = likesRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[0], rec[1]))

# Broadcasting the likesRDD 
likesBC = sc.broadcast(likesTrim.collectAsMap())
# Broadcasting the highSchoolerTrim RDD 
highSchoolerTrimBC = sc.broadcast(highSchoolerTrim.collectAsMap())

def getSomeone(rec):
    othersName, othersGrade = highSchoolerTrimBC.value.get(likesBC.value.get(rec[0], -1), (None, None))
    if othersGrade != None:
        if int(rec[1][1]) == int(othersGrade) + 2 :
            return (rec[1][0], rec[1][1], othersName, othersGrade)
    

Query02 = highSchoolerTrim.map(lambda rec: getSomeone(rec)).filter(lambda rec: rec != None)

for i in Query02.collect(): print(i)

                                                          

sqlContext.sql("select h1.name, h1.grade, h2.name, h2.grade from highschooler h1, highschooler h2 where h2.ID in \
(select l.ID2 from likes l where h1.ID = l.ID1) and int(h1.grade) >= int(h2.grade) + 2").show()

('John', '12', 'Haley', '10')
+----+-----+-----+-----+
|name|grade| name|grade|
+----+-----+-----+-----+
|John|   12|Haley|   10|
+----+-----+-----+-----+



In [None]:
# Exercise @ https://lagunita.stanford.edu/courses/DB/SQL/SelfPaced/courseware/ch-sql/seq-exercise-sql_social_query_core/

# Here's the schema:

# Highschooler ( ID, name, grade ) 
# English: There is a high school student with unique ID and a given first name in a certain grade. 

# Friend ( ID1, ID2 ) 
# English: The student with ID1 is friends with the student with ID2. Friendship is mutual, 
# so if (123, 456) is in the Friend table, so is (456, 123). 

# Likes ( ID1, ID2 ) 
# English: The student with ID1 likes the student with ID2. Liking someone is not necessarily mutual, 
# so if (123, 456) is in the Likes table, there is no guarantee that (456, 123) is also present. 

In [19]:
# Query 03
# For every pair of students who both like each other, return the name and 
# grade of both students. Include each pair only once, with the two names in alphabetical order. 

highSchoolerTrim = highSchoolerRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[0], (rec[1], rec[2])))
likesTrim = likesRDD.map(lambda rec: rec.split("\t")).map(lambda rec: (rec[1], rec[0]))

# Broadcasting the likesRDD 
likesBC = sc.broadcast(list(likesTrim.toLocalIterator()))
# Broadcasting the highSchoolerTrim RDD 
highSchoolerTrimBC = sc.broadcast(highSchoolerTrim.collectAsMap())

def checkOther(rec):
    for i in likesBC.value:
        if(rec == i):
            return (highSchoolerTrimBC.value.get(rec[0], None), highSchoolerTrimBC.value.get(rec[1], None))
                
# Should remove duplicate data
Query03RDD = likesTrim.map(lambda rec: (rec[1], rec[0])).\
map(lambda rec: checkOther(rec)).filter(lambda rec: rec != None)
for i in Query03RDD.collect(): print(i)

#sqlContext.sql("select h1.name, h1.grade, h2.name, h2.grade from highschooler h1, highschooler h2 where h2.ID in \
#(select l.ID2 from likes l where h1.ID = l.ID1) and h1.ID in \
#(select l.ID1 from likes l where h2.ID = l.ID2)").show()

(('Gabriel', '9'), ('Cassandra', '9'))
(('Cassandra', '9'), ('Gabriel', '9'))
(('Jessica', '11'), ('Kyle', '12'))
(('Kyle', '12'), ('Jessica', '11'))


In [None]:
# Query 04
# Find all students who do not appear in the Likes table (as a student who likes or is liked) 
# and return their names and grades. Sort by grade, then by name within each grade. 



In [None]:
# Query 05
# For every situation where student A likes student B, but we have no information about whom B 
# likes (that is, B does not appear as an ID1 in the Likes table), return A and B's names and grades. 



In [None]:
# Query 06
# Find names and grades of students who only have friends in the same grade. 
# Return the result sorted by grade, then by name within each grade. 

In [None]:
#Query 07
# For each student A who likes a student B where the two are not friends, 
# find if they have a friend C in common (who can introduce them!). 
# For all such trios, return the name and grade of A, B, and C. 

In [None]:
# Query 08
# Find the difference between the number of students in the school and the number of different first names.

In [None]:
# Query 09
# Find the name and grade of all students who are liked by more than one other student. 