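Data is from http://socialnetworks.mpi-sws.mpg.de/data/facebook-links.txt.gz

**Note:** the loading cell below reads `./soc-pokec-relationships.txt`, so make sure the edge-list file placed next to this notebook matches that path.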

In [None]:
# !wget -q http://socialnetworks.mpi-sws.mpg.de/data/facebook-links.txt.gz
# !gunzip  facebook-links.txt.gz
# !pip install pymongo


Setting up environment 

In [1]:
# !pip install -q findspark

# findspark.init() runs before any `pyspark` import in this notebook.
# NOTE(review): presumably needed because pyspark is not on sys.path in this
# environment — confirm before removing.
import findspark
findspark.init()

# Alternative way of obtaining a context, left disabled; the next cell builds
# the SparkContext with an explicit configuration instead.
# from pyspark import SparkContext
# sc = SparkContext.getOrCreate()

We configure Spark's memory settings (heap size) so that it can process the roughly 500 MB data set.

In [2]:
from pyspark.conf import SparkConf
from pyspark import SparkContext

# Single-machine setup: 'local[*]' as the master, plus explicit memory limits.
# NOTE(review): the memory sizes are hard-coded for the original author's
# machine — lower them if the host has less RAM.
conf = (
    SparkConf()
    .setAppName("App")
    .setMaster('local[*]')
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
)

# getOrCreate() instead of SparkContext(conf=conf): re-running this cell on a
# live kernel reuses the active context rather than raising
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

#hide
Import all the libraries that are needed.

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.recommendation import Rating
import pandas
import pyspark.sql.dataframe

Initializing PySpark and reading the raw data

In [5]:
# Open the relationships file as an RDD of text lines, using the SparkContext
# `sc` created in the configuration cell. The path is relative to the
# notebook's working directory.
rawData = sc.textFile('./soc-pokec-relationships.txt')

In [6]:

def cleanFile(rawFile):
  ''' Turn one pre-split input record into an integer edge.

      ARGS
      rawFile:  sequence whose first element is a tab-separated
                "user<TAB>friend" string; any further elements are ignored

      RETURNS
      two-element list [user, friend] with both ids converted to int

      RAISES
      ValueError if the first element does not hold exactly two
      tab-separated fields, or if a field is not a valid integer
  '''
  userText, friendText = rawFile[0].split('\t')
  edge = [int(userText), int(friendText)]
  return edge

Code below:
*  calls `cleanFile` on `rawData` to get an RDD in a useful form
*  also displays a sample of the data




In [7]:
# Each text line is split on ',' and the resulting list is handed to
# cleanFile(), which splits the first piece on a tab and returns an integer
# [user, friend] pair.
userFriendRDD  = rawData.map(lambda x: x.split(',')).map(cleanFile)
# Removes only the Python name `rawData`.
# NOTE(review): because of this `del`, re-running this cell alone raises
# NameError until the loading cell has been run again.
del rawData
# Display the first 10 parsed pairs as a sanity check.
userFriendRDD.take(10)

[[1, 13],
 [1, 11],
 [1, 6],
 [1, 3],
 [1, 4],
 [1, 5],
 [1, 15],
 [1, 14],
 [1, 7],
 [1, 8]]

Make a new RDD by swapping the columns, because the original RDD has neighbor nodes (friends) that never appear in the main node field (user).

In [8]:
# Same edges with the two columns swapped: [friend, user] instead of [user, friend].
reversedUserFriendRDD = userFriendRDD.map(lambda x: [x[1],x[0]])

# Generate Jaccard similarity score

In [9]:
def unionList(list1,list2):
  ''' Return the distinct elements found in either input.

    ARGS
    list1: first collection of hashable items
    list2: second collection of hashable items

    RETURN
     list holding every distinct element of list1 and list2 (duplicates are
     removed; the order of the result is not defined)
  '''
  merged = set(list1).union(set(list2))
  return list(merged)

In [10]:
def intersctionList(list1,list2):
  ''' Return the distinct elements found in both inputs.

  ARGS
  list1: first collection of hashable items
  list2: second collection of hashable items

  RETURN
    list holding every distinct element present in both list1 and list2
    (the order of the result is not defined)
  '''
  shared = set(list1).intersection(set(list2))
  return list(shared)

In [11]:
def getScore(list1,list2):
  ''' Compute the Jaccard similarity of two collections.

  The score is |A & B| / |A | B| over the distinct elements of the inputs,
  rounded to 5 decimal places.

  ARGS
  list1: first collection of hashable items
  list2: second collection of hashable items

  RETURN
  the Jaccard similarity as a float in [0, 1]; 0.0 when both inputs are
  empty (the ratio is undefined there and would otherwise raise
  ZeroDivisionError)
  '''
  # Build each set once and use set operators directly.
  set1, set2 = set(list1), set(list2)
  unionSize = len(set1 | set2)

  # Guard the 0/0 case: two empty neighbour lists share nothing.
  if unionSize == 0:
    return 0.0

  return round(len(set1 & set2)/unionSize,5)


# Dictionary
Convert userFriendRDD into matrix form (one row per user with the list of that user's friends) and then into a dictionary.

In [12]:
def addUnknowNodes(node,dictionary):
  ''' Insert a (key, value) pair into the dictionary unless the key exists.

    An existing entry is never overwritten.

    ARGS
    node: two-element pair (key, value) to insert
    dictionary: the dict that is updated in place

    RETURN
    nothing
  '''
  nodeId, neighbours = node
  # setdefault stores the value only when nodeId is not already a key.
  dictionary.setdefault(nodeId, neighbours)

In [13]:
# Group the edges by their first column so every key is paired with a Python
# list of all values seen for it (duplicates are kept as they appear).
# Forward direction: (user, [friends...]) as a tuple.
userFriendMatrixRDD_withList = userFriendRDD.groupByKey().map(lambda x : (x[0], list(x[1])))
# Reverse direction: [friend, [users...]] as a two-element list.
reversedUserFriendRDDMatrix = reversedUserFriendRDD.groupByKey().map(lambda x : [x[0], list(x[1])])

In [14]:
# Bring the forward adjacency data to the driver as a dict: user -> list of friends.
# addRating() later looks users up in this dict.
userFriendDict = userFriendMatrixRDD_withList.collectAsMap()

In [15]:
# Collect the reverse adjacency rows ([friend, [users...]]) to the driver; the
# next cell uses them to fill gaps in userFriendDict.
reverse = reversedUserFriendRDDMatrix.collect()

# Edges whose friend (x[1]) is not a key of userFriendDict, emitted with the
# columns swapped so that friend becomes the first column.
# NOTE(review): the filter lambda reads the global userFriendDict, which the
# next cell mutates — keep this cell ahead of that one, and confirm at which
# point the closure's view of the dict is fixed.
temp = userFriendRDD.filter(lambda x: x[1] not in userFriendDict.keys()).map(lambda x: [x[1],x[0]])
# Append the swapped edges to the edge list.
# NOTE(review): userFriendRDD is rebound to a value derived from itself, so
# running this cell twice unions the extra edges in twice.
userFriendRDD = userFriendRDD.union(temp)

In [16]:
# map(addUnknowNodes,reverse,dic)
# For every reverse row [node, [users...]], add the node to userFriendDict if
# it is not already a key; existing entries are left untouched.
for val in reverse:
  addUnknowNodes(val,userFriendDict)

In [17]:
# Number of keys in userFriendDict after the missing nodes were added.
len(userFriendDict)

1632803

In [18]:
# Drop the names of intermediates that are not referenced again below.
# NOTE(review): after this cell, the earlier cells that use these names must be
# re-run before they can be used again.
del reversedUserFriendRDDMatrix, reverse,reversedUserFriendRDD,userFriendMatrixRDD_withList

In [19]:
def addRating(x):
  ''' Append the Jaccard similarity score to a [user, friend] pair.

      The neighbour lists of both nodes are read from the global
      userFriendDict and compared with getScore().

      ARGS
      x: pair [user, friend] / [node, neighbor]

      RETURN
      list [user, friend, Jaccard_similarity_score] for the given node
      and its neighbour

      RAISES
      KeyError if the user (x[0]) has no entry in userFriendDict
  '''
  list1 = userFriendDict[x[0]]
  # A friend without an entry falls back to the placeholder list [-1].
  # dict.get replaces the former bare `except:`, which also hid unrelated
  # errors behind the same fallback.
  list2 = userFriendDict.get(x[1], [-1])
  score = getScore(list1,list2)
  return [x[0],x[1], score]

# Prediction model


In [None]:
import time
# Wall-clock start; read by the timing print a few cells below.
start_time1 = time.time()

In [20]:
# 80/20 train/test split of the [user, friend] edges. The explicit seed keeps
# the partition (and the MSE reported later) the same across runs; without it
# every run draws a different split.
trainingRDD,testRDD =  userFriendRDD.randomSplit([0.8, 0.2], seed=42)

In [None]:
# trainingRDD.count()

In [None]:
# Prints the seconds elapsed since start_time1 was set.
# NOTE(review): the message says "training", but no model has been trained at
# this point — the interval only covers the cells run since the timer
# started. Confirm what was meant to be measured.
print("--- %s time for training seconds ---" % (time.time() - start_time1))


In [21]:
# userFriendRDD.take(100)
# Empty list; nothing in the visible cells appends to or reads it.
# NOTE(review): presumably meant to collect timing results — confirm or remove.
dataGathered = []

In [23]:
# Reference for the model:
# https://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
#
# NOTE: there is more material at the link; don't forget to look at it.

# This import rebinds the name ALS to the RDD-based pyspark.mllib
# implementation, replacing the pyspark.ml ALS imported in the imports cell.
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

import time

start_time1 = time.time()
rank = 10
numIterations = 5

# Each training edge becomes [user, friend, jaccard_score] via addRating; the
# score is used as the rating that ALS learns to reproduce.
model3 = ALS.train(trainingRDD.map(addRating), rank, numIterations)

# Measure once and print that same value, so timeVal and the printed number
# agree (previously the clock was read a second time for the print).
timeVal = time.time() - start_time1
print("--- time for training %s  seconds ---" % timeVal)


--- time for training 1132.469306230545  seconds ---


In [27]:
# Predict a score for every [user, friend] pair of the test split and re-key
# each result as ((user, friend), predicted_score) so it can be joined with
# the actual scores.
prediction3 = model3.predictAll(testRDD).map(lambda r: ((r[0], r[1]), r[2]))

In [28]:
# Actual Jaccard scores for the test pairs, keyed by (user, friend), joined
# with the predictions: each record is ((user, friend), (actual, predicted)).
ratesAndPreds = testRDD.map(addRating).map(lambda r: ((r[0], r[1]), r[2])).join(prediction3)
# Mean of the squared difference between actual (r[1][0]) and predicted (r[1][1]).
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

print("the mean square error is {} ".format(MSE))

Use the library's built-in method to make predictions

In [29]:
# Ask the trained model for 2 recommendations per user; here the recommended
# "products" are other users (candidate friends).
rec = model3.recommendProductsForUsers(2)

--- 4.367865085601807 time for predicting products in seconds ---
