# RDD

#### Задание 1
#### Сгруппировать по ключу, просуммировать значения, вывести результат

In [1]:
%pyspark

# Task 1: sum the values of every key and return the totals ordered by key.
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6), (4, 5), (3, 4), (1, 5), (4, 1)])

# reduceByKey adds values up inside each partition before the shuffle, whereas
# groupByKey followed by sum() moves every single value across the cluster
# first. The resulting (key, total) pairs are the same.
(
    rdd
    .reduceByKey(lambda left, right: left + right)
    .sortByKey(ascending=True)
    .collect()
)


#### Задание 2
#### Посчитать частоту встречаемости слов

In [2]:
%pyspark

# Task 2: word-frequency count over a small in-memory corpus.
lines = sc.parallelize(["a ab abc", "a ac abc", "b b ab abc"])

# Tokenise on single spaces, pair every token with 1, then add the ones up
# per token.
tokens = lines.flatMap(lambda line: line.split(' '))
token_ones = tokens.map(lambda token: (token, 1))
counts = token_ones.reduceByKey(lambda total, one: total + one)

output = counts.collect()

for word, count in output:
    print("%s: %i" % (word, count))

#### market.events

#### Задание 3
#### Добавить колонки category_1, category_2, category_3 с категориями различного уровня

In [4]:
%pyspark

from pyspark.sql.functions import *

# Task 3: load the market events and derive one column per category level.
path = "/apps/hive/warehouse/eakotelnikov.db/market_events"
df = spark.read.parquet(path)

df.printSchema()
df.show(5)

# category_code is split on literal dots ('[.]' is a regex character class),
# and the first three pieces become cat_1, cat_2 and cat_3.
split_col = split(df["category_code"], '[.]')
for level, column_name in enumerate(("cat_1", "cat_2", "cat_3")):
    df = df.withColumn(column_name, split_col.getItem(level))

df_with_categories = df

df_with_categories.show(10)


#### Задание 4
#### Вывести топ-3 брендов по количеству просмотров для каждой категории 2-го уровня

In [7]:
%pyspark

from pyspark.sql.functions import *
from pyspark.sql.window import *

# Task 4: top-3 brands by number of "view" events inside every second-level
# category.
event_type_filter = col("event_type") == "view"

# Rows without a brand or without a second-level category cannot take part in
# a "top brands per category" ranking; left in, the NULL brand would be grouped
# and ranked like a real one.
has_brand_and_category = col("brand").isNotNull() & col("cat_2").isNotNull()

df_grouped = (
    df_with_categories
    .where(event_type_filter & has_brand_and_category)
    .groupBy(["cat_1", "cat_2", "brand"])
    .agg(count("event_type").alias("views"))
    .orderBy(col("views").desc())
)

print("Вывод df после группировки")
df_grouped.show(5)

# The window uses the same category key as the grouping above (cat_1 + cat_2).
# Partitioning by cat_2 alone would merge equally named sub-categories of
# different top-level categories and could list one brand twice in a top-3.
# The brand tie-break makes row_number() deterministic when view counts tie.
Window_Spec = (
    Window
    .partitionBy("cat_1", "cat_2")
    .orderBy(col("views").desc(), col("brand").asc())
)

rank_filter = col("rank") <= 3

df_with_ranks = (
    df_grouped
    .withColumn("rank", row_number().over(Window_Spec))
    .where(rank_filter)
    .orderBy(col("views").desc())
)

print("Вывод df после приминения оконной функции")
df_with_ranks.show(5)

print("Вывод результирующей таблицы")

(
    df_with_ranks
    .orderBy(
        col("cat_1").asc(),
        col("cat_2").asc(),
        col("views").desc(),
        col("rank").asc(),
    )
    .show(20)
)

#### Датасет с треками

#### Создание hw_3.tracks для запуска локально (на кластере уже готова)

In [9]:
%pyspark

import pyspark.sql.functions as f
from pyspark.sql.types import *

sch = ArrayType(StringType())

# Read the raw CSV, extract the year into its own column and convert the
# stringified artist list into an array column.
# NOTE: the split delimiter is ', ' (with the space); otherwise the spaces
# would end up inside the values.
tracks = (
    spark.read.option("header", "true")
    .option("escape", '"')
    .option("InferSchema", "true")
    .csv("/datasets/tracks.csv")
    .withColumn("release_year", f.substring("release_date", 1, 4).cast(IntegerType()))
    # Raw string: the same pattern text as before ("[", "]" and "'" are
    # stripped) but without invalid escape sequences in a normal string literal.
    .withColumn("array_artist", f.split(f.regexp_replace(f.col("artists"), r"[\]\[']", ""), ", "))
    .cache()
)

# Separate table with the artist array exploded: one row per (track, artist).
tracks_exp = tracks.select(
    "name",
    "popularity",
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "liveness",
    "valence",
    "release_year",
    "artists",
    f.explode(f.col("array_artist")).alias("name_artist"),
)

tracks_exp.printSchema()

# "if not exists" keeps the cell re-runnable, matching the overwrite mode of
# the table write below; a plain "create database" fails on the second run.
spark.sql("create database if not exists hw_3")
tracks_exp.write.mode("overwrite").saveAsTable("hw_3.tracks")

#### Задание 5
#### Какие артисты выпустили наибольшее число песен из годового топ-100 (по популярности)?

In [10]:
%pyspark

# Load the prepared tracks table and render it with the notebook's table view.
tracks_table_name = "hw_3.tracks"
tracks = spark.table(tracks_table_name)
z.show(tracks)

In [11]:
%pyspark
from pyspark.sql.window import Window
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType



In [12]:
%pyspark

from pyspark.sql.window import Window
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

# Task 5: which artists released the most songs that made a yearly top-100
# (by popularity)?
path = "/apps/hive/warehouse/hw_3.db/tracks"
df = spark.read.parquet(path)
df.printSchema()

print("--------------------------Initial df--------------------------------------------------------------")
df.show(5)

print("--------------------------Initial df count--------------------------------------------------------")
print(df.count())
df = df.distinct()

print("--------------------------df count w/o duplicates-------------------------------------------------")
print(df.count())

popularity_filter_1 = col("popularity") <= 100
popularity_filter_2 = col("popularity") > 0
rank_filter = col("rank") <= 100

# The table holds one row per (track, artist). Ranking those rows directly
# lets a track with N artists occupy N of the 100 places. dense_rank over
# (popularity, name, artists) gives all rows of one track the same position
# and the next track the next position, so the cut-off is made on tracks.
# NOTE(review): tracks are identified by (name, artists) because no id column
# is available in this table - confirm that is unique enough.
Window_Spec = (
    Window
    .partitionBy("release_year")
    .orderBy(col("popularity").desc(), col("name").asc(), col("artists").asc())
)

df_with_ranks = (
    df
    .where(popularity_filter_1 & popularity_filter_2)
    .withColumn("rank", dense_rank().over(Window_Spec))
    .filter(rank_filter)
)

print("--------------------------df with ranks after rank function----------------------------------------")
df_with_ranks.show(20)

# One remaining row per (top-100 track, artist), so counting rows per artist
# counts that artist's top-100 songs.
print("--------------------------final result with top 100 artist from each year--------------------------")
(
    df_with_ranks
    .groupBy(["name_artist"])
    .agg(count("name").alias("count_artist"))
    .orderBy(col("count_artist").desc())
    .show(20)
)


#### Задание 6
#### Вывести топ артистов, которые чаще других попадали в годовой топ-100 песен по популярности

In [14]:
%pyspark

from pyspark.sql.functions import *
from pyspark.sql.window import Window
   
# Task 6: artists that appeared in the yearly top-100 (by popularity) in the
# largest number of years.
path = "/apps/hive/warehouse/hw_3.db/tracks"
df = spark.read.parquet(path)
df.printSchema()

print("--------------------------Initial df--------------------------------------------------------------")
df.show(5)

print("--------------------------Initial df count--------------------------------------------------------")
print(df.count())
df = df.distinct()

print("--------------------------df count w/o duplicates-------------------------------------------------")
print(df.count())

popularity_filter_1 = col("popularity") <= 100
popularity_filter_2 = col("popularity") > 0
rank_filter = col("rank") <= 100

# The table holds one row per (track, artist). Ranking those rows directly
# lets a track with N artists occupy N of the 100 places. dense_rank over
# (popularity, name, artists) gives all rows of one track the same position
# and the next track the next position, so the cut-off is made on tracks.
# NOTE(review): tracks are identified by (name, artists) because no id column
# is available in this table - confirm that is unique enough.
Window_Spec = (
    Window
    .partitionBy("release_year")
    .orderBy(col("popularity").desc(), col("name").asc(), col("artists").asc())
)

df_with_ranks = (
    df
    .where(popularity_filter_1 & popularity_filter_2)
    .withColumn("rank", dense_rank().over(Window_Spec))
    .filter(rank_filter)
)

print("--------------------------df with ranks after rank function----------------------------------------")
df_with_ranks.show(5)

print("--------------------------groupby by year and artist-----------------------------------------------")
df_result = (
    df_with_ranks
    .groupBy(["release_year", "name_artist"])
    .agg(count("name").alias("count_songs"))
    .orderBy(col("count_songs").desc())
)

df_result.show(20)

print("--------------------------final result with the most frequent artists in top 100--------------------")
# After the step above there is one row per (year, artist), so counting years
# per artist gives the number of yearly charts the artist appeared in.
# show() returns None, so it is called separately to keep df_result a DataFrame.
df_result = (
    df_result
    .groupBy(["name_artist"])
    .agg(count("release_year").alias("count_artist"))
    .orderBy(col("count_artist").desc())
)
df_result.show(20)



#### Задание 7.1
#### Какие артисты дольше других несколько лет подряд держались в ежегодном топ-100 песен по популярности?

In [15]:
%pyspark


from pyspark.sql.functions import *
from pyspark.sql.window import Window
#import pyspark.sql.functions as f
   
# Task 7.1: which artists stayed in the yearly top-100 for the longest run of
# consecutive years?
path = "/apps/hive/warehouse/hw_3.db/tracks"
df = spark.read.parquet(path)
df.printSchema()

df.show(5)
df = df.distinct()

# Step 1 - yearly top-100 tracks.
# The table holds one row per (track, artist); row_number over those rows
# would spend several of the 100 places on one multi-artist track and pick
# arbitrary rows on popularity ties. dense_rank over (popularity, name,
# artists) gives all rows of a track the same position instead.
# NOTE(review): tracks are identified by (name, artists) because no id column
# is available in this table - confirm that is unique enough.
Window_Spec = (
    Window
    .partitionBy("release_year")
    .orderBy(col("popularity").desc(), col("name").asc(), col("artists").asc())
)

# One row per (year, artist) that made the chart.
df_result = (
    df
    .withColumn("rank", dense_rank().over(Window_Spec))
    .filter(col("rank") <= 100)
    .select("release_year", "name_artist")
    .distinct()
)

# Step 2 - compare every chart year of an artist with the previous one.
Window_Spec = (
    Window
    .partitionBy("name_artist")
    .orderBy(col("release_year").asc())
)

df_result_with_lag = df_result.withColumn("lag", lag("release_year", 1).over(Window_Spec))

yearDifference = col("release_year") - col("lag")
df_result_with_lag = df_result_with_lag.withColumn("yearDifference", yearDifference)

# counter = 1 when the year directly follows the artist's previous chart year,
# 0 when a streak starts (first year, or after a gap).
df_result_with_counter = df_result_with_lag.withColumn(
    "counter", when(col("yearDifference") == 1, 1).otherwise(0)
)

# Step 3 - number the streaks: every row with counter == 0 opens a new one, so
# a running count of those rows is a streak id within the artist.
# NOTE: sum/max below are the pyspark.sql.functions versions (star import),
# not the Python builtins.
running_window = Window_Spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_result_with_counter = df_result_with_counter.withColumn(
    "streak", sum(1 - col("counter")).over(running_window)
)

# Step 4 - length of each streak (in year-to-year steps), then the longest
# streak per artist. Summing counter over the whole artist, as before, added
# separate streaks together.
final_result = (
    df_result_with_counter
    .groupBy("name_artist", "streak")
    .agg(sum("counter").alias("streak_length"))
    .groupBy("name_artist")
    .agg(max("streak_length").alias("count"))
    .orderBy(col("count").desc())
)

final_result.show(10)


#### Задание 7.2
#### Решение с udf

In [16]:
%pyspark

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

# Window is used below but was never imported in this cell; it only worked
# when an earlier cell had already imported it.
from pyspark.sql.window import Window

path = "/apps/hive/warehouse/hw_3.db/tracks"
df = spark.read.parquet(path)
df.printSchema()

df = df.distinct()

# Yearly top-100 tracks. The table holds one row per (track, artist);
# row_number over those rows would spend several of the 100 places on one
# multi-artist track and pick arbitrary rows on popularity ties. dense_rank
# over (popularity, name, artists) gives all rows of a track the same position.
# NOTE(review): tracks are identified by (name, artists) because no id column
# is available in this table - confirm that is unique enough.
Window_Spec = (
    Window
    .partitionBy("release_year")
    .orderBy(col("popularity").desc(), col("name").asc(), col("artists").asc())
)

df = (
    df
    .withColumn("rank", dense_rank().over(Window_Spec))
    .filter(col("rank") <= 100)
    .drop("rank")
)

# Longest run of consecutive years for one artist
@udf(returnType=IntegerType())
def count_consecutive_years(release_years):
    """Return the length of the longest run of consecutive years.

    The length is measured in year-to-year steps: a single year gives 0,
    ``[2000, 2001, 2002]`` gives 2. Separate runs are not added together;
    ``[2000, 2001, 2005, 2006]`` gives 1, not 2.

    Args:
        release_years: iterable of integer years; duplicates are allowed.

    Returns:
        The step count of the longest run, 0 for empty or missing input.
    """
    if not release_years:
        return 0

    longest_run = 0
    current_run = 0
    prev_year = None
    # Deduplicate first: several tracks in one year must not break a run.
    for year in sorted(set(release_years)):
        if prev_year is not None and year == prev_year + 1:
            current_run += 1
        else:
            # First year or a gap: a new run starts here.
            current_run = 0
        # Plain comparison instead of max(): in this notebook the name max is
        # rebound to the Spark column function by the star import.
        if current_run > longest_run:
            longest_run = current_run
        prev_year = year
    return longest_run


# Collect each artist's chart years, score them with the UDF and keep the
# artists whose score is at least 2, highest score first.
years_per_artist = df.groupBy("name_artist").agg(
    collect_list("release_year").alias("release_years")
)
scored_artists = years_per_artist.withColumn(
    "max_period", count_consecutive_years(col("release_years"))
)
result_top_artists = (
    scored_artists
    .filter(col("max_period") >= 2)
    .select(col("name_artist"), col("max_period"))
    .orderBy(col("max_period").desc())
)

result_top_artists.show()




#### Задание 7.3
#### Через lag

In [17]:
%pyspark


# https://stackoverflow.com/questions/56384625/pyspark-cumulative-sum-with-reset-condition

from pyspark.sql.functions import *
from pyspark.sql.window import Window
import pyspark.sql.functions as f
   
# Task 7.3: longest run of consecutive years in the yearly top-100 per artist,
# computed with lag() and a running sum that resets at every gap.
path = "/apps/hive/warehouse/hw_3.db/tracks"
df = spark.read.parquet(path)
df.printSchema()

print("--------------------------OPTION 1--------------------------")

print("--------------------------Initial df--------------------------")
df.show(5)
print("--------------------------Initial df count--------------------------")
print(df.count())
df = df.distinct()
print("--------------------------df count w/o duplicates--------------------------")
print(df.count())

popularity_filter_1 = col("popularity") <= 100
popularity_filter_2 = col("popularity") > 0

# Restrict to the yearly top-100 tracks first; without this step every year
# in which an artist released anything would count. dense_rank over
# (popularity, name, artists) ranks tracks rather than (track, artist) rows.
# NOTE(review): tracks are identified by (name, artists) because no id column
# is available in this table - confirm that is unique enough.
top_100_window = (
    Window
    .partitionBy("release_year")
    .orderBy(col("popularity").desc(), col("name").asc(), col("artists").asc())
)

df_result = (
    df
    .where(popularity_filter_1 & popularity_filter_2)
    .withColumn("rank", dense_rank().over(top_100_window))
    .where(col("rank") <= 100)
    .select('release_year', 'name_artist')
    .distinct()
    .sort("name_artist", "release_year", ascending=[True, True])
)
z.show(df_result)

# Previous chart year of the same artist.
Window_Spec = Window.partitionBy("name_artist").orderBy(col("release_year").asc())
df_result_with_lag = df_result.withColumn("lag", lag("release_year", 1).over(Window_Spec))
z.show(df_result_with_lag)

yearDifference = col("release_year") - col("lag")
df_result_with_lag = df_result_with_lag.withColumn("yearDifference", yearDifference)
z.show(df_result_with_lag)

# counter = 1 when the year directly follows the previous chart year, else 0.
df_result_with_counter = df_result_with_lag.withColumn(
    "counter", when(col("yearDifference") == 1, 1).otherwise(0)
)
z.show(df_result_with_counter)

# grp: streak number inside the artist (a row with counter == 0 opens a new
# streak). D: running streak length inside (artist, grp).
# Both windows are partitioned by artist and ordered by year. The previous
# global window ordered by monotonically_increasing_id() moved all rows into a
# single partition and relied on the ids following the (artist, year) order.
running_window = Window_Spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
streak_window = (
    Window
    .partitionBy("name_artist", "grp")
    .orderBy(col("release_year").asc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

new_result = (
    df_result_with_counter
    .withColumn("grp", f.sum((f.col("counter") == 0).cast("int")).over(running_window))
    .withColumn("D", f.sum(f.col("counter")).over(streak_window))
    .drop("grp")
)

# The longest streak is the maximum of the running length; count("D") only
# counted the artist's rows.
new_result = (
    new_result
    .groupBy(["name_artist"])
    .agg(f.max("D").alias("count"))
    .orderBy(col("count").desc())
)
z.show(new_result)




#final_result = df_result_with_counter.groupBy(["name_artist"]).agg(count("counter").alias("count")).orderBy(col("count").desc())
#z.show(final_result)



#final_result = df_result_with_counter.groupBy(["name_artist"]).agg(count("counter").alias("count")).orderBy(col("count").desc())
#z.show(final_result)

#df_result.show(10)
#yearDifference = col("release_year") - col("lag")
#df_result_final = df_result.withColumn("yearDifference", yearDifference). \
#                withColumn("Trend", when(col("yearDifference") == 0, 0).when(col("yearDifference") > 0, 1).otherwise(0)).orderBy(col("name_artist").desc(), col("release_year").asc())
                
#df_result_final.show(20)         
                
#.filter(df_result_final.Trend.isNotNull())

#df_result_final.withColumn(
#    "grp", 
#    f.sum((f.col("Trend") == 0).cast("int")).over(Window.orderBy("release_year"))
#).withColumn(
#    "D",
#    f.sum(f.col("Trend")).over(Window.partitionBy("grp").orderBy("release_year"))
#).drop("grp").show()

#.withColumn(
#    "result",
#    f.sum(f.col("Trend")).over(Window.partitionBy("grp").orderBy("release_year"))
#).drop("grp").show()
