# simple article recommendation with content-based filtering in Spark ML

In [1]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer 
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.ml import Pipeline
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import squarify
import colorlover as cl

### get the data

In [2]:
! curl -o reco-content-data.csv https://gist.githubusercontent.com/rawar/ae4cce269e29c2826163fbca60b544f4/raw/20594a832b44de4b4a5160fb2732399ba09a70e9/reco-content-data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 31.7M  100 31.7M    0     0   937k      0  0:00:34  0:00:34 --:--:--  981k  0:00:26  0:00:12  902k


### have a short look into the data

In [3]:
! head -n 2 reco-content-data.csv 

id,title,content
2777,Google verkauft Motorola,Android


### define the schema

In [4]:
schema = StructType([
    StructField("id", StringType()),
    StructField('title', StringType()),
    StructField('content', StringType()),
])

### read the data into Spark dataframe

In [5]:
sql_context = SQLContext(sc)
raw_data_df = sql_context.read.csv("reco-content-data.csv", header=True, schema=schema)

### count the number of data

In [6]:
raw_data_df.count() 

15751

### define some helper functions

In [35]:
ListOfIndexToWords = F.UserDefinedFunction(lambda wl: list([vocab_array[w] for w in wl]))
FormatNumbers = F.UserDefinedFunction(lambda nl: ["{:1.4f}".format(x) for x in nl])

In [8]:
def concat_array_udf(type):
    def concat_(*args):
        return list(chain(*args))
    return F.UserDefinedFunction(concat_, ArrayType(type))

concat_string_arrays = concat_array_udf(StringType())

In [9]:
def displayDF(df, numberOfRows):
    tmp_df = df.limit(numberOfRows)
    display(tmp_df.toPandas())

In [10]:
displayDF(raw_data_df, 5)

Unnamed: 0,id,title,content
0,2777,Google verkauft Motorola,Android
1,2770,Touchscreen im Winter: Handy-Handschuhe selber...,Ein Touchdisplay lässt sich normalerweise nich...
2,2855,"""Bei WhatsApp als Kontakt blockiert: So merkt ...",Auf den ersten Blick könnt Ihr nicht sehen ob ...
3,2942,Paper: Facebook stellt neue App im Flipboard-S...,IOS
4,3323,WhatsApp: mehr Privatsphäre für Nutzer,IOS


### remove some special characters

In [11]:
raw_data_df = raw_data_df.withColumn('clean_title', F.regexp_replace('title', '[:"-.]+', ''))

#### Tokenize the document title

In [12]:
title_tokenizer = RegexTokenizer(inputCol="clean_title", outputCol="title_words", gaps=True, pattern="\\s+", minTokenLength=2)

In [13]:
tokenized_title_df = title_tokenizer.transform(raw_data_df)

#### Tokenize document content

In [14]:
content_tokenizer = RegexTokenizer(inputCol="content", outputCol="content_words", gaps=True, pattern="\\s+", minTokenLength=2)

In [15]:
tokenized_content_df = content_tokenizer.transform(tokenized_title_df) 
displayDF(tokenized_content_df, 5)

Unnamed: 0,id,title,content,clean_title,title_words,content_words
0,2777,Google verkauft Motorola,Android,Google verkauft Motorola,"[google, verkauft, motorola]",[android]
1,2770,Touchscreen im Winter: Handy-Handschuhe selber...,Ein Touchdisplay lässt sich normalerweise nich...,Touchscreen im Winter HandyHandschuhe selber m...,"[touchscreen, im, winter, handyhandschuhe, sel...","[ein, touchdisplay, lässt, sich, normalerweise..."
2,2855,"""Bei WhatsApp als Kontakt blockiert: So merkt ...",Auf den ersten Blick könnt Ihr nicht sehen ob ...,Bei WhatsApp als Kontakt blockiert So merkt Ihrs,"[bei, whatsapp, als, kontakt, blockiert, so, m...","[auf, den, ersten, blick, könnt, ihr, nicht, s..."
3,2942,Paper: Facebook stellt neue App im Flipboard-S...,IOS,Paper Facebook stellt neue App im FlipboardSti...,"[paper, facebook, stellt, neue, app, im, flipb...",[ios]
4,3323,WhatsApp: mehr Privatsphäre für Nutzer,IOS,WhatsApp mehr Privatsphäre für Nutzer,"[whatsapp, mehr, privatsphäre, für, nutzer]",[ios]


#### Filter stopwords
I use two different lists of stopwords. The first one cames from https://github.com/stopwords-iso/stopwords-de/blob/master/stopwords-de.txt and teh 2nd one from Spark itsef

In [16]:
extraStopWords = []
with open('stopwords-de-utf8.txt', 'r', encoding='utf-8') as f:
    extraStopWords = f.readlines()
    
extraStopWords = [x.strip() for x in extraStopWords]

In [17]:
sparkStopWords = StopWordsRemover.loadDefaultStopWords("german")
stopWords = list(set().union(extraStopWords, sparkStopWords))

In [18]:
filter_title_words = StopWordsRemover(
    inputCol="title_words", 
    outputCol="filtered_title_words", 
    stopWords=stopWords
)

In [19]:
filter_content_words = StopWordsRemover(
    inputCol="content_words",
    outputCol="filtered_content_words",
    stopWords=stopWords
)

In [20]:
filterted_words_df = filter_title_words.transform(tokenized_content_df)
filterted_words_df = filter_content_words.transform(filterted_words_df)
displayDF(filterted_words_df.select("filtered_title_words", "filtered_content_words"), 10)

Unnamed: 0,filtered_title_words,filtered_content_words
0,"[google, verkauft, motorola]",[android]
1,"[touchscreen, winter, handyhandschuhe, selber]","[touchdisplay, lässt, normalerweise, woll, led..."
2,"[whatsapp, kontakt, blockiert, merkt, ihrs]","[blick, sehen, whatsappkontakt, blockiert, wur..."
3,"[paper, facebook, stellt, app, flipboardstil]",[ios]
4,"[whatsapp, privatsphäre, nutzer]",[ios]
5,"[tipps, schutz, gefahren, sozialen, netzwerken]","[soziale, netzwerke, facebook, instagramhttpsc..."
6,"[bluetoothkopfhörer, sportler]","[bluetoothkopfhörer, sport, eingesetzt, gelten..."
7,"[mobistel, cynus, t5]","[mobistel, cynus, t5, mitte, jahres, 2013, han..."
8,"[speicherplatz, ipad, iphone, sparen]",[ios]
9,"[sarwert, handy, strahlung, aus?]","[2001, europa, norm, messbedingungen, sarwert,..."


In [21]:
filterted_words_df = filterted_words_df.withColumn(
    'filtered_all_words',
    concat_string_arrays(F.col("filtered_title_words"), F.col("filtered_content_words"))
)

In [22]:
displayDF(filterted_words_df.select("filtered_all_words"), 5)

Unnamed: 0,filtered_all_words
0,"[google, verkauft, motorola, android]"
1,"[touchscreen, winter, handyhandschuhe, selber,..."
2,"[whatsapp, kontakt, blockiert, merkt, ihrs, bl..."
3,"[paper, facebook, stellt, app, flipboardstil, ..."
4,"[whatsapp, privatsphäre, nutzer, ios]"


#### Convert the documents with the extracted words to document vectors
The CountVectorizer https://spark.apache.org/docs/latest/ml-features.html#countvectorizer is a helper to convert a collection of text documents to vectors of token counts. During the fitting process, it can be calculate the TF/IDF.

In [23]:
cv = CountVectorizer(
    inputCol="filtered_all_words", 
    outputCol="features", 
    vocabSize=100,    # number of selected words across the document corpus
    minTF=2,          # minium number of times a word must appear in a document
    minDF=6           # minimun number of documents a word must appear in document corpus
)

cv_model = cv.fit(filterted_words_df)

In [24]:
displayDF(cv_model.transform(filterted_words_df).select("id","filtered_all_words","features"),6)

Unnamed: 0,id,filtered_all_words,features
0,2777,"[google, verkauft, motorola, android]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2770,"[touchscreen, winter, handyhandschuhe, selber,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2855,"[whatsapp, kontakt, blockiert, merkt, ihrs, bl...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2942,"[paper, facebook, stellt, app, flipboardstil, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,3323,"[whatsapp, privatsphäre, nutzer, ios]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,1627,"[tipps, schutz, gefahren, sozialen, netzwerken...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


#### Create count feature vectors

In [25]:
count_vectors = cv_model.transform(filterted_words_df).select("id","features")

In [26]:
corpus_size = count_vectors.count()
print('Corpus contains %s documents and the model vocabulary is %s' % (corpus_size, len(cv_model.vocabulary))) 

Corpus contains 15751 documents and the model vocabulary is 100


#### Train the LDA model on 80% of the documents

In [27]:
training_df, testing_df = count_vectors.randomSplit([0.8, 0.2], 1)
print('Number of training documents %s and testing documents %s ' % (training_df.count(), testing_df.count()))

Number of training documents 12640 and testing documents 3111 


In [28]:
lda = LDA(
    maxIter=15, 
    k=5, 
    seed=42, 
    optimizer="online", 
    optimizeDocConcentration=True, 
    learningDecay=0.51, 
    learningOffset=64., 
    subsamplingRate=0.05
)

In [29]:
lda_model = lda.fit(training_df)

In [30]:
lpt, lp = lda_model.logPerplexity(testing_df), lda_model.logPerplexity(training_df)
print("Perplexity on testing data is %s and training data is %s" % (str(lp),str(lpt)))

Perplexity on testing data is 3.633680813055668 and training data is 3.6943704550350875


#### Print topics with weights

In [32]:
lda_model = lda.fit(count_vectors)
topics = lda_model.describeTopics(maxTermsPerTopic=5)
vocab_array = cv_model.vocabulary

In [33]:
num_topics = topics.count()
print("Number of topics %s" % (num_topics))

Number of topics 5


In [36]:
displayDF(topics.select(ListOfIndexToWords(topics.termIndices).alias('topic terms')), 5)

Unnamed: 0,topic terms
0,"[apple, watch, ios, millionen, iphone]"
1,"[apple, gb, tv, tvos, 2018]"
2,"[iphone, galaxy, samsung, apple, homepod]"
3,"[huawei, p20, music, htc, übungen]"
4,"[android, the, google, lg, monster]"


In [37]:
displayDF(topics.select(FormatNumbers(topics.termWeights).alias('topic weights')), 5)

Unnamed: 0,topic weights
0,"[0.3470, 0.1182, 0.0535, 0.0477, 0.0428]"
1,"[0.1327, 0.0782, 0.0685, 0.0653, 0.0587]"
2,"[0.1406, 0.1060, 0.0666, 0.0569, 0.0481]"
3,"[0.0965, 0.0659, 0.0589, 0.0467, 0.0423]"
4,"[0.0711, 0.0691, 0.0688, 0.0612, 0.0558]"


In [38]:
toptopics = topics.select((topics.topic + 1).alias('topic'),
                          ListOfIndexToWords(topics.termIndices).alias('words'),
                          FormatNumbers(topics.termWeights).alias('weights'))
displayDF(toptopics, 5)

Unnamed: 0,topic,words,weights
0,1,"[apple, watch, ios, millionen, iphone]","[0.3470, 0.1182, 0.0535, 0.0477, 0.0428]"
1,2,"[apple, gb, tv, tvos, 2018]","[0.1327, 0.0782, 0.0685, 0.0653, 0.0587]"
2,3,"[iphone, galaxy, samsung, apple, homepod]","[0.1406, 0.1060, 0.0666, 0.0569, 0.0481]"
3,4,"[huawei, p20, music, htc, übungen]","[0.0965, 0.0659, 0.0589, 0.0467, 0.0423]"
4,5,"[android, the, google, lg, monster]","[0.0711, 0.0691, 0.0688, 0.0612, 0.0558]"


In [40]:
top_words = topics.select(ListOfIndexToWords(topics.termIndices).alias('words')).take(num_topics)
print(top_words)

[Row(words='[apple, watch, ios, millionen, iphone]'), Row(words='[apple, gb, tv, tvos, 2018]'), Row(words='[iphone, galaxy, samsung, apple, homepod]'), Row(words='[huawei, p20, music, htc, übungen]'), Row(words='[android, the, google, lg, monster]')]


#### Documents for all topics

In [41]:
number_of_top_documents = 3

In [44]:
df = lda_model.transform(count_vectors)

In [46]:
for i in range(0, num_topics):
    ntopic = i  
    print('Topic ' + str(ntopic) + '\n')  

    df_sliced = df.select("ID", "topicDistribution") \
        .rdd.map(lambda r: Row(ID=int(r[0]), weight=float(r[1][ntopic]))).toDF()

    doc_ids = df_sliced.sort(df_sliced.weight.desc()).take(number_of_top_documents)
    print('Top Document(s):',doc_ids)
    for d_id in doc_ids:
        print(d_id[0])
        raw_data_df\
            .select("id","title")\
            .filter(raw_data_df.id == d_id[0]) \
            .show(truncate=False)

    print('Top terms:')
    print(top_words[ntopic][0], '\n')
    print('===================================================')

Topic 0

Top Document(s): [Row(ID=495179, weight=0.992948173603075), Row(ID=153098, weight=0.9907271912930936), Row(ID=157847, weight=0.9879008681065138)]
495179
+------+--------------------------------------------------------------------+
|id    |title                                                               |
+------+--------------------------------------------------------------------+
|495179|iOS 11 vorgestellt: Siri spricht natürlicher und wird zum Übersetzer|
+------+--------------------------------------------------------------------+

153098
+------+-------------------------------------------------------+
|id    |title                                                  |
+------+-------------------------------------------------------+
|153098|Kratzer im Glas: Die fragwürdige Pleite von GT Advanced|
+------+-------------------------------------------------------+

157847
+------+----------------------------------------------+
|id    |title                                      

+------+----------+
|id    |title     |
+------+----------+
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
+------+----------+
only showing top 20 rows

583744
+------+----------+
|id    |title     |
+------+----------+
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
|583744|Huawei P20|
+------+----------+
only sh

#### Details for a single topic

In [48]:
topic_id = 1      
number_of_top_document = 3  

df_sliced = df.select("id","topicDistribution").rdd.map(lambda r: Row(ID = int(r[0]), sliced = float(r[1][topic_id]))).toDF()
top_doc_id = df_sliced.sort(df_sliced.sliced.desc()).take(number_of_top_document)

In [49]:
print('Topic ' + str(topic_id) +'\n')
print("Top %s documents for topic %s" % (number_of_top_document, topic_id))
for d_id in top_doc_id:
    print("DocumentId : ",d_id[0])
    raw_data_df\
        .select("id","title")\
        .filter(raw_data_df.id == d_id[0]) \
        .show(truncate=False)

print('Top 5 terms:')
print(topics.select(ListOfIndexToWords(topics.termIndices).alias('words')).take(num_topics)[topic_id][0], '\n')


Topic 1

Top 3 documents for topic 1
DocumentId :  611645
+------+---------------------------------------------------------------+
|id    |title                                                          |
+------+---------------------------------------------------------------+
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funktionen|
|611645|Apple zeigt tvOS 12 für Apple TV: Das sind die neuen Funkt

#### Plot the top documents for each topic distribution

In [51]:
count_top_docs = (
    lda_model\
        .transform(count_vectors)\
        .select("topicDistribution")\
        .rdd.map(lambda r: Row( nTopTopic = int(np.argmax(r)))).toDF()\
        .groupBy("nTopTopic").count().sort("nTopTopic")\
)

In [52]:
values = count_top_docs.select("count").rdd.map(lambda row : row[0]).collect()
print(values)

[5631, 1240, 3184, 2335, 3361]


In [55]:
topic_words = toptopics.rdd.map(lambda row : row[1]).collect()
topic_words

['[apple, watch, ios, millionen, iphone]',
 '[apple, gb, tv, tvos, 2018]',
 '[iphone, galaxy, samsung, apple, homepod]',
 '[huawei, p20, music, htc, übungen]',
 '[android, the, google, lg, monster]']

In [56]:
init_notebook_mode(connected=True)

In [57]:
x = 0.
y = 0.
width = 5.
height = 5.

In [58]:
normed = squarify.normalize_sizes(values, width, height)
rects = squarify.squarify(normed, x, y, width, height)

In [59]:
shapes = []
annotations = []
counter = 0

In [60]:
color_brewer = cl.scales['5']['div']['PiYG']; 
color_brewer

['rgb(208,28,139)',
 'rgb(241,182,218)',
 'rgb(247,247,247)',
 'rgb(184,225,134)',
 'rgb(77,172,38)']

In [61]:
for r in rects:
    shapes.append( 
        dict(
            type = 'rect', 
            x0 = r['x'], 
            y0 = r['y'], 
            x1 = r['x']+r['dx'], 
            y1 = r['y']+r['dy'],
            line = dict( width = 2 ),
            fillcolor = color_brewer[counter]
        ) 
    )
    annotations.append(
        dict(
            x = r['x']+(r['dx']/2),
            y = r['y']+(r['dy']/2),
            text = "topic "+str(counter+1),
            showarrow = False
        )
    )
    counter = counter + 1
    if counter >= len(color_brewer):
        counter = 0

In [62]:
# For hover text
trace0 = go.Scatter(
    x = [ r['x']+(r['dx']/2) for r in rects ], 
    y = [ r['y']+(r['dy']/2) for r in rects ],
    text = [ str(t) for t in topic_words ], 
    mode = 'text',
)

In [63]:
layout = dict(
    height=500, 
    width=900,
    xaxis=dict(showgrid=False,zeroline=False),
    yaxis=dict(showgrid=False,zeroline=False),
    shapes=shapes,
    annotations=annotations,
    hovermode='closest'
)

In [64]:
figure = dict(data=[trace0], layout=layout)
iplot(figure, filename='squarify-treemap')