In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.enableHiveSupport().master("local").getOrCreate()

In [3]:
data = sparkSession.read.parquet("/data/sample264")
meta = sparkSession.read.parquet("/data/meta")

In [5]:
data.show(10);

+------+-------+--------+----------+
|userId|trackId|artistId| timestamp|
+------+-------+--------+----------+
| 13065| 944906|  978428|1501588527|
|101897| 799685|  989262|1501555608|
|215049| 871513|  988199|1501604269|
|309769| 857670|  987809|1501540265|
|397833| 903510|  994595|1501597615|
|501769| 818149|  994975|1501577955|
|601353| 958990|  973098|1501602467|
|710921| 916226|  972031|1501611582|
|  6743| 801006|  994339|1501584964|
|152407| 913509|  994334|1501571055|
+------+-------+--------+----------+
only showing top 10 rows



In [6]:
meta.show(10)

+------+--------------------+--------------------+-------+
|  type|                Name|              Artist|     Id|
+------+--------------------+--------------------+-------+
| track|               Smile| Artist: Josh Groban|1223851|
| track|Chuni Ashkharhe Q...|Artist: Razmik Amyan|1215486|
| track|           Dark City|Artist: Machinae ...|1296462|
| track|       Not Sensitive|        Artist: Moby|1249694|
|artist|Artist: Carlos Pu...|Artist: Carlos Pu...|1352221|
| track|Thiz Gangsta Chit...|Artist: Tha Dogg ...|1217194|
| track|            Ruffneck|    Artist: Skrillex|1245681|
| track|              Incerc|       Artist: Spike|1193283|
|artist|Artist: Wallenber...|Artist: Wallenber...|1333444|
| track|               remix|    Artist: Flo Rida|1246378|
+------+--------------------+--------------------+-------+
only showing top 10 rows



In [24]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum

#df = userTrack(userId, trackId, count); key1 = "userId"; key2 = "trackId"; field = "count"; n = 1000
def norm(df, key1, key2, field, n): 
    
    #Form window partitioned by userId ordered by count. 
    #key1 = "userId"; field = "count"
    window = Window.partitionBy(key1).orderBy(col(field).desc())
    
    #Numerate most popular tracks for each userId. Select top 1000 for each user
    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop(col("row_number")) 
    #Result:
    #+------+-------+-----+
    #|userId|trackId|count|
    #+------+-------+-----+
    #|185109| 837238|    5|
    #|185109| 870292|    4|
    #|185109| 846624|    3|
    #|185109| 961308|    3|
    #|185109| 858070|    3|
    #|185109| 858904|    3|
    #|185109| 871513|    2|
    #|185109| 839387|    2|
    #|185109| 899991|    1|
    #|185109| 936124|    1|
    #+------+-------+-----+


    #Sum all count (count of all tracks) for each UserId.
    #key1 = "userId"; field = "count"
    tmpDF = topsDF.groupBy(col(key1)).agg(col(key1), sum(col(field)).alias("sum_" + field))
    #Result:
    #+------+------+---------+
    #|userId|userId|sum_count|
    #+------+------+---------+
    #|185109|185109|       27|
    #+------+------+---------+

    #Normalize tracks for each user
    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()
    #Result:
    #+------+-------+-----+---------+--------------------+
    #|userId|trackId|count|sum_count|          norm_count|
    #+------+-------+-----+---------+--------------------+
    #|185109| 837238|    5|       27| 0.18518518518518517|
    #|185109| 870292|    4|       27| 0.14814814814814814|
    #|185109| 846624|    3|       27|  0.1111111111111111|
    #|185109| 961308|    3|       27|  0.1111111111111111|
    #|185109| 858070|    3|       27|  0.1111111111111111|
    #|185109| 858904|    3|       27|  0.1111111111111111|
    #|185109| 871513|    2|       27| 0.07407407407407407|
    #|185109| 839387|    2|       27| 0.07407407407407407|
    #|185109| 899991|    1|       27|0.037037037037037035|
    #|185109| 936124|    1|       27|0.037037037037037035|
    #+------+-------+-----+---------+--------------------+
    
    return normalizedDF

SyntaxError: invalid syntax (<ipython-input-24-72c04ca38c89>, line 14)

In [32]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

#Count rows grouping by userId and trackId
userTrack = data.groupBy(col("userId"), col("trackId")).count();
#Result: 
#+------+-------+-----+
#|userId|trackId|count|
#+------+-------+-----+
#|185109| 870292|    4|

userTrackNorm = norm(userTrack, "userId", "trackId", "count", 1000) \
        .withColumn("id", col("userId")) \
        .withColumn("id2", col("trackId")) \
        .withColumn("norm_count", col("norm_count") * 0.5) \
        .select(col("id"), col("id2"), col("norm_count"));
#Result: id - userId, id2 - trackId
#+------+------+--------------------+
#|    id|   id2|          norm_count|
#+------+------+--------------------+
#|185109|837238| 0.09259259259259259|
#|185109|870292| 0.07407407407407407|
#|185109|846624| 0.05555555555555555|
#|185109|961308| 0.05555555555555555|
#|185109|858070| 0.05555555555555555|
#|185109|858904| 0.05555555555555555|
#|185109|871513|0.037037037037037035|
#|185109|839387|0.037037037037037035|
#|185109|899991|0.018518518518518517|
#|185109|936124|0.018518518518518517|
#+------+------+--------------------+

#The most popular track for each user
window = Window.orderBy(col("norm_count"));

userTrackList = userTrackNorm.withColumn("position", rank().over(window))\
    .filter(col("position") < 50)\
    .orderBy(col("id"), col("id2"))\
    .select(col("id"), col("id2"))\
    .take(40);

In [33]:
for val in userTrackList:
    print "%s %s" % val

415763 853951
436158 889948
586043 800288
586043 800317
586043 801522
586043 804741
586043 805880
586043 806233
586043 806439
586043 807873
586043 808328
586043 810571
586043 811212
586043 811848
586043 815635
586043 818116
586043 819591
586043 821062
586043 822375
586043 825775
586043 825997
586043 826725
586043 831955
586043 833018
586043 834780
586043 834887
586043 835312
586043 837744
586043 838944
586043 842614
586043 844606
586043 851992
586043 852304
586043 852734
586043 852863
586043 855577
586043 856643
586043 858618
586043 858992
586043 860220
