In [17]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession,SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer,HashingTF
from pyspark.ml import Pipeline
import re
from nltk.stem import PorterStemmer
from pyspark.ml.linalg import Vectors
from pyspark.mllib.clustering import LDA
from pyspark.ml.feature import IDF

In [2]:
# Start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("Experiment3")
    .getOrCreate()
)
# Handles derived from the session: its SparkContext and a SQLContext built on it
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [3]:
# Prepare the DataFrames for the input datasets

# Authors DataFrame (disabled)
#df_authors = spark.read.csv("Datasets/authors.csv", sep = ",", header = True, quote = '"')

# papers.csv DataFrame: the file is read without a header row, so the column
# names are supplied here in file order; every field is declared as a
# non-nullable string.
paperCsvColumns = [
    "paper_id", "type", "journal", "book_title", "series",
    "publisher", "pages", "volume", "number", "year",
    "month", "postedat", "address", "title", "abstract",
]
papersCsvSchema = StructType(
    [StructField(columnName, StringType(), False) for columnName in paperCsvColumns]
)
df_paperCsv = spark.read.csv(
    "Datasets/papers.csv",
    sep = ",",
    header = False,
    schema = papersCsvSchema,
    quote = '"',
)

# users_libraries.txt DataFrame: two ';'-separated string columns, no header row
userLibrarySchema = StructType(
    [StructField(columnName, StringType(), False) for columnName in ("user_hash_id", "user_library")]
)
rawUserLibrary = spark.read.csv(
    "Datasets/users_libraries.txt",
    sep = ";",
    header = False,
    schema = userLibrarySchema,
)
# Turn the comma-separated library string into an array column
df_userLibrary = rawUserLibrary.selectExpr("user_hash_id","split(user_library,',') AS user_library")

# NOTE: the loaders below (keywords, terms, stop words, papers_terms) are
# disabled by wrapping them in a bare string literal instead of comments.
# None of this code runs; because the string is the last expression of the
# cell, its text is echoed back as the cell output.
"""
#Keywords dataframe

keywordsSchema = StructType([
    StructField("paper_id",StringType(),False),
    StructField("keyword",StringType(),False)
])

df_keywords = spark.read.csv("Datasets/keywords.csv", sep = ",", header = True, schema = keywordsSchema, quote = '"')

#Terms Dataframe
terms_df = spark.read.csv("Datasets/terms.txt", header = True)

#Stopword Broadcast
stopWords = sc.textFile("Datasets/stopwords_en.txt")
stopWordsBroadcast = sc.broadcast(stopWords.collect())

#Papers Terms Dataframe
def parse_papers_count(line):

    if not line:
        return dict()
    papers_count_raw = line.split(' ')
    papers_count = dict()
    for pcRaw in papers_count_raw:
        paper, count = pcRaw.split(':')
        papers_count[paper] = int(count)
    return papers_count

papers_vocab = spark.read.format("csv").option("header", "true").load("Datasets/papers_terms.txt").rdd
papers_vocab = papers_vocab.map(lambda x: (x[0], parse_papers_count(x[1]))).toDF().selectExpr('_1 AS paper_id','_2 AS term_count')    

"""

'\n#Keywords dataframe\n\nkeywordsSchema = StructType([\n    StructField("paper_id",StringType(),False),\n    StructField("keyword",StringType(),False)\n])\n\ndf_keywords = spark.read.csv("Datasets/keywords.csv", sep = ",", header = True, schema = keywordsSchema, quote = \'"\')\n\n#Terms Dataframe\nterms_df = spark.read.csv("Datasets/terms.txt", header = True)\n\n#Stopword Broadcast\nstopWords = sc.textFile("Datasets/stopwords_en.txt")\nstopWordsBroadcast = sc.broadcast(stopWords.collect())\n\n#Papers Terms Dataframe\ndef parse_papers_count(line):\n\n    if not line:\n        return dict()\n    papers_count_raw = line.split(\' \')\n    papers_count = dict()\n    for pcRaw in papers_count_raw:\n        paper, count = pcRaw.split(\':\')\n        papers_count[paper] = int(count)\n    return papers_count\n\npapers_vocab = spark.read.format("csv").option("header", "true").load("Datasets/papers_terms.txt").rdd\npapers_vocab = papers_vocab.map(lambda x: (x[0], parse_papers_count(x[1]))).toDF(

In [4]:
#Cleaning and Tokenizing the data
def phraseTokenization(x):
    """Tokenize the title and abstract of one paper record.

    Parameters
    ----------
    x : sequence
        Paper record indexed by position: x[0] is the paper id, x[13] the
        title and x[14] the abstract. A missing (None) title or abstract is
        treated as an empty string.

    Returns
    -------
    tuple
        (paper id, list of tokens). "-" and "_" are deleted (joining the
        parts around them), the text is split on runs of non-alphanumeric
        characters, and tokens shorter than 3 characters are dropped.
        Letter case is left unchanged.
    """
    # Guard against null columns so the concatenation cannot raise TypeError
    title = x[13] or ""
    abstract = x[14] or ""

    # Concatenate title and abstract, then delete "-" and "_" before splitting
    rawPhrase = (title + " " + abstract).replace("-", "").replace("_", "").strip()

    # Split on every run of non-alphanumeric characters
    tokens = re.split('[^a-zA-Z0-9]+', rawPhrase)

    # Keep only words with at least 3 characters (this also drops empty strings)
    return (x[0], [token for token in tokens if len(token) >= 3])


# Replace nulls with "" in the string columns, then tokenize each paper record
rdd_tokenized = df_paperCsv.na.fill(value="").rdd.map(phraseTokenization)
df_tokenize = rdd_tokenized.toDF()

# Remove stop words with Spark ML's StopWordsRemover and name the columns
swRemover = StopWordsRemover(inputCol="_2", outputCol="cleaned_terms")
df_cleanedData = (
    swRemover
    .transform(df_tokenize)
    .selectExpr("_1 AS paper_id","cleaned_terms")
)

# Porter stemmer instance used by stemmingTerms
ps =  PorterStemmer()

def stemmingTerms(x):
    """Return the stemmed form of every word in x, preserving order.

    x is an iterable of word strings; the result is a new list holding one
    stem per input word, produced by the notebook-level Porter stemmer `ps`.
    """
    return [ps.stem(term) for term in x]

# Stem each paper's term list; mapValues leaves the paper_id key untouched
rdd_stemmed = df_cleanedData.rdd.mapValues(stemmingTerms)
df_cleanedData = rdd_stemmed.toDF().selectExpr("_1 AS paper_id","_2 AS cleaned_terms")

In [5]:
# Document frequency: number of distinct papers in which each term occurs
df_paperCount = (
    df_cleanedData
    .selectExpr("paper_id","explode(cleaned_terms) AS terms")
    .distinct()
    .groupBy("terms")
    .count()
    .withColumnRenamed("count", "paper_count")
)

# 10 percent of the distinct papers present in the file (floor division keeps an int)
noOfDistinctPapers_df = int(df_cleanedData.select(countDistinct("paper_id")).collect()[0][0])
tenPercentOfTotalPapers = noOfDistinctPapers_df // 10

# Drop terms that appear in more than 10% of the papers and keep only terms
# that appear in at least 20 papers
df_filterdTerms = df_paperCount.filter(
    (df_paperCount["paper_count"] <= tenPercentOfTotalPapers)
    & (df_paperCount["paper_count"] >= 20)
)

# Fetch the top 1000 terms by paper count. The secondary sort on `terms`
# breaks ties at the cut-off, so the same 1000 terms are selected every time
# this lazy plan is evaluated.
top1000Terms = (
    df_filterdTerms
    .orderBy(col("paper_count").desc(), col("terms").asc())
    .limit(1000)
)

In [18]:
# Associate a unique 0-based integer index with each term. Ordering the window
# by (paper_count, terms) gives row_number() a total order, so terms with equal
# paper_count cannot swap indices when the lazy plan is evaluated again.
# cache() lets the collect and the join below share one materialised mapping.
termIndexWindow = Window.orderBy(col("paper_count"), col("terms"))
df_termsWithUniqueIndex = (
    top1000Terms
    .withColumn("unique_index", row_number().over(termIndexWindow))
    .selectExpr("terms","unique_index-1 AS unique_index")
    .cache()
)

# Collect all terms in a list; the explicit sort guarantees that list
# position i holds the term whose unique_index is i
terms_collection =  [row.terms for row in df_termsWithUniqueIndex.orderBy("unique_index").collect()]

# Generating the term-frequency vector for each paper: one row per (paper, term occurrence)
df_cleanedDataExplode = df_cleanedData.selectExpr("paper_id","explode(cleaned_terms) AS terms")

# Getting the unique_index of each term (the inner join drops terms that are not indexed)
df_cleanedDataJoinIndex = df_cleanedDataExplode.join(df_termsWithUniqueIndex,df_cleanedDataExplode.terms == df_termsWithUniqueIndex.terms , how = "inner").select(df_cleanedDataExplode.paper_id,df_cleanedDataExplode.terms,df_termsWithUniqueIndex.unique_index)

# Getting the term frequency of each indexed term in each paper
df_cleanedDataJoinIndex = df_cleanedDataJoinIndex.groupBy("paper_id","unique_index").count().withColumnRenamed("count", "term_frquency")

# Creating a SparseVector representation for each paper: gather the
# (unique_index, frequency) pairs per paper_id, then build a 1000-dimensional vector
rdd_CleanedDataReducedByPaperId = df_cleanedDataJoinIndex.rdd.map(lambda x: (x[0], [(x[1], x[2])])).reduceByKey(lambda a, b: a + b)
rdd_CleanedDataReducedByPaperId = rdd_CleanedDataReducedByPaperId.map(lambda x: (x[0],Vectors.sparse(1000,x[1])))

df_CleanedDataSparseVector = rdd_CleanedDataReducedByPaperId.toDF().selectExpr("_1 AS paper_id","_2 AS term_frequency_vector")

In [15]:
# Peek at one paper's term-frequency vector
df_CleanedDataSparseVector.take(1)

[Row(paper_id='498902', term_frequency_vector=SparseVector(1000, {33: 3.0, 47: 1.0, 79: 1.0, 97: 1.0, 138: 1.0, 170: 6.0, 354: 1.0, 368: 1.0, 394: 1.0, 482: 2.0, 491: 1.0, 541: 1.0, 550: 1.0, 566: 1.0, 581: 1.0, 596: 1.0, 622: 1.0, 632: 1.0, 663: 1.0, 670: 1.0, 720: 2.0, 723: 1.0, 762: 1.0, 764: 1.0, 773: 1.0, 797: 1.0, 820: 1.0, 826: 1.0, 837: 2.0, 843: 1.0, 879: 1.0, 881: 1.0, 890: 1.0, 892: 1.0, 894: 1.0, 928: 1.0, 937: 1.0, 946: 2.0, 949: 3.0, 952: 1.0, 962: 1.0, 965: 1.0, 984: 1.0, 985: 4.0, 990: 1.0, 992: 1.0}))]

In [19]:
# TF-IDF representation for each paper

# Fit the inverse document frequencies on the term-frequency vectors,
# rescale those vectors, and keep only the id and the rescaled vector
idf = IDF(inputCol="term_frequency_vector", outputCol="tf_idf_vector")
tf_idf_model = idf.fit(df_CleanedDataSparseVector)
df_rescaledCleanedData = (
    tf_idf_model
    .transform(df_CleanedDataSparseVector)
    .select("paper_id", "tf_idf_vector")
)

In [35]:
# Show two TF-IDF rows without truncating the vector column
df_rescaledCleanedData.limit(2).show(truncate =False)

+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
from pyspark.mllib.linalg import Vectors as MLlibVectors

# Latent Dirichlet Allocation

num_topics = 40
# Fixed seed so that re-running the cell starts from the same random initialisation
lda_seed = 42

# Transform data into the format LDA.train supports: [document id (int), mllib vector].
# The ml vectors are converted with fromML and paper_id is cast to int.
rdd_lda_format = df_CleanedDataSparseVector.rdd.mapValues(MLlibVectors.fromML).map(lambda x: [int(x[0]),x[1]])

# Train the LDA model
lda_model = LDA.train(rdd_lda_format, k=num_topics, seed=lda_seed)

In [22]:
# Find the top 5 terms of each extracted topic and distribute the
# per-topic descriptions as an RDD
topicDescriptions = lda_model.describeTopics(maxTermsPerTopic = 5)
topicIndices = sc.parallelize(topicDescriptions)

def Get_TopicTerms(topic, terms=None, max_terms=5):
    """Translate one topic description into its list of term strings.

    Parameters
    ----------
    topic : sequence
        Topic description whose element 0 is the sequence of term indices.
    terms : sequence of str, optional
        Index-to-term lookup (position i holds the term with index i).
        Defaults to the notebook-level `terms_collection`.
    max_terms : int, optional
        Upper bound on the number of terms returned (default 5).

    Returns
    -------
    list of str
        The terms for the first `max_terms` indices, in the same order. A
        topic with fewer indices yields a shorter list instead of raising
        IndexError.
    """
    if terms is None:
        terms = terms_collection
    termsId_array = topic[0]

    # Look up the term for each unique index, never reading past the end
    return [terms[term_id] for term_id in termsId_array[:max_terms]]

# Resolve every topic's term indices to term strings and gather the lists on the driver
rdd_topicTerms = topicIndices.map(Get_TopicTerms)
topics_Final = rdd_topicTerms.collect()


In [24]:
# Display the list of top terms for every topic
topics_Final

[['gene', 'network', 'protein', 'cell', 'interact'],
 ['gene', 'protein', 'cell', 'network', 'activ'],
 ['gene', 'protein', 'network', 'cell', 'activ'],
 ['gene', 'protein', 'network', 'sequenc', 'cell'],
 ['network', 'gene', 'protein', 'interact', 'activ'],
 ['gene', 'network', 'cell', 'protein', 'activ'],
 ['network', 'gene', 'protein', 'activ', 'cell'],
 ['network', 'gene', 'protein', 'activ', 'cell'],
 ['gene', 'protein', 'network', 'sequenc', 'interact'],
 ['network', 'gene', 'protein', 'sequenc', 'interact'],
 ['gene', 'protein', 'cell', 'network', 'activ'],
 ['gene', 'protein', 'cell', 'network', 'sequenc'],
 ['gene', 'network', 'protein', 'cell', 'activ'],
 ['gene', 'network', 'protein', 'cell', 'activ'],
 ['gene', 'protein', 'cell', 'network', 'sequenc'],
 ['network', 'gene', 'protein', 'interact', 'activ'],
 ['gene', 'protein', 'network', 'cell', 'sequenc'],
 ['gene', 'protein', 'network', 'cell', 'activ'],
 ['gene', 'network', 'protein', 'activ', 'interact'],
 ['gene', 'cell

In [25]:
# User profiling: one row per (user, paper) library entry, joined to that paper's TF-IDF vector

df_userLibrary_explode = df_userLibrary.selectExpr("user_hash_id","explode(user_library) AS paper_id")
samePaper = df_userLibrary_explode.paper_id == df_rescaledCleanedData.paper_id
df_userJoined_TfIdf = (
    df_userLibrary_explode
    .join(df_rescaledCleanedData, samePaper, how="inner")
    .select(
        df_userLibrary_explode.user_hash_id,
        df_userLibrary_explode.paper_id,
        df_rescaledCleanedData.tf_idf_vector,
    )
)

In [82]:
import collections

def addSparseVectors(v1, v2):
    """Return the element-wise sum of two sparse vectors.

    Both arguments must expose parallel `indices` and `values` sequences.
    The result is built with Vectors.sparse and takes its size from v1.
    """
    totals = collections.defaultdict(float)  # index -> running sum; a new index starts at 0.0

    # Accumulate the stored entries of v1 first, then those of v2
    for vec in (v1, v2):
        for idx, val in zip(vec.indices, vec.values):
            totals[idx] += val
    return Vectors.sparse(v1.size, dict(totals))

# Sum the TF-IDF vectors of all papers in each user's library into one vector per user
rdd_userTfIdf = df_userJoined_TfIdf.selectExpr("user_hash_id","tf_idf_vector").rdd
df_userprofile_tfidf = (
    rdd_userTfIdf
    .reduceByKey(addSparseVectors)
    .toDF()
    .selectExpr("_1 AS user_hash_id","_2 AS sum_tf_idf_vector")
)

In [84]:
# Peek at one user's summed TF-IDF profile vector
df_userprofile_tfidf.take(1)

[Row(user_hash_id='6931f7f79678cf72aae416ff7cb43bb1', sum_tf_idf_vector=SparseVector(1000, {0: 13.9931, 1: 23.287, 3: 9.3123, 4: 37.249, 5: 4.6555, 6: 4.6536, 7: 32.5708, 10: 9.2996, 11: 4.6492, 12: 23.2428, 13: 4.6467, 14: 41.8145, 18: 9.2734, 20: 4.633, 22: 4.6305, 23: 41.6692, 24: 4.6299, 26: 97.1893, 27: 9.2561, 28: 37.0147, 29: 4.6256, 31: 4.6238, 33: 4.6146, 34: 13.8384, 35: 4.6122, 36: 13.8293, 37: 23.0429, 38: 13.8257, 39: 9.2147, 41: 4.6044, 42: 9.2087, 47: 18.3839, 48: 9.192, 51: 13.7737, 52: 4.5889, 53: 4.5889, 55: 13.756, 56: 4.583, 57: 9.1601, 60: 13.7331, 62: 13.7243, 65: 4.5707, 67: 9.1298, 68: 4.5643, 70: 13.6809, 72: 173.1171, 74: 95.6219, 75: 36.4183, 76: 9.1034, 77: 4.5511, 78: 4.55, 79: 13.6483, 80: 9.0989, 81: 9.0977, 84: 4.5466, 86: 40.8787, 87: 18.166, 88: 4.5415, 89: 4.5409, 92: 27.2389, 93: 72.5921, 94: 31.7512, 95: 4.5353, 99: 18.1279, 100: 9.0628, 102: 9.0484, 104: 4.5236, 105: 9.045, 107: 9.0362, 110: 4.5153, 111: 13.5444, 113: 4.5148, 115: 9.023, 116: 9.023