In [None]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("pyspark-basics")
    .getOrCreate()
)

# Load the posts CSV from cloud storage; the first row supplies the column names.
# No schema is passed to the reader here; later cells cast the columns they need.
df = spark.read.csv(
    "gs://bucket-name/stackoverflowposts.csv",
    header=True,
)

# Keep only the columns used by the rest of the analysis.
df_subset = df.select("id", "tags", "score", "view_count", "creation_date")

In [None]:
from pyspark.sql import functions as F 

# Add a `post_type` label derived from how the `tags` string begins.
# The branches are evaluated in order, so the first matching prefix wins;
# rows matching none of the prefixes are labelled "others".
# Note that `like('python%')` matches any tags value that merely starts with
# "python", not only the exact tag "python".
df_tagged = df_subset.withColumn(
    'post_type',
    F.when(F.col('tags').like('python%'), 'python')
    .when(F.col('tags').like('mysql%'), 'mysql')
    .when(F.col('tags').like('scala%'), 'scala')
    .otherwise("others"),
)

In [None]:
# Preview the first 10 labelled rows without truncating long cell values.
df_tagged.show(n=10, truncate=False)

In [None]:
# Keep only the posts labelled with one of the three languages of interest,
# dropping everything that was labelled "others".
df_tagged = df_tagged.filter(
    df_tagged["post_type"].isin("python", "mysql", "scala")
)

In [None]:
# Preview the first 10 rows after filtering, without truncating cell values.
df_tagged.show(n=10, truncate=False)

In [None]:
from pyspark.sql.types import TimestampType,IntegerType
from pyspark.sql.functions import year


# Convert the two columns needed for date extraction and aggregation to proper
# types: `creation_date` becomes a timestamp and `view_count` an integer.
df_tagged = (
    df_tagged
    .withColumn("creation_date", F.col("creation_date").cast(TimestampType()))
    .withColumn("view_count", F.col("view_count").cast(IntegerType()))
)

In [None]:
# Print the DataFrame schema (column names and types) to check the result of the casts.
df_tagged.printSchema()

In [None]:
# Derive the calendar year of each post from its creation timestamp.
df_tagged = df_tagged.withColumn(
    "creation_year",
    year(df_tagged["creation_date"]),
)

In [None]:
# Preview the first 5 rows, including the new creation_year column, untruncated.
df_tagged.show(n=5, truncate=False)

In [None]:
# Narrow the frame to the grouping keys and the measure that will be summed.
df_tagged = df_tagged.select("post_type", "creation_year", "view_count")

In [None]:
# Preview the first 10 rows of the narrowed frame without truncating values.
df_tagged.show(n=10, truncate=False)

In [None]:
# Total the view counts for every (post_type, creation_year) combination.
df_grouped = (
    df_tagged
    .groupBy("post_type", "creation_year")
    .agg(F.sum("view_count").alias("total_views"))
)

In [None]:
# Display the aggregated totals using show()'s default limits
# (first 20 rows, long values truncated).
df_grouped.show(n=20, truncate=True)

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,rank

In [None]:

# Window over each creation_year, ordered from most to fewest total views.
# total_views is the only ordering key, so ties within a year are not broken
# by any secondary column.
window_spec = (
    Window
    .partitionBy("creation_year")
    .orderBy(df_grouped["total_views"].desc())
)



In [None]:
# Number the post types within each year by descending total views, then list
# the result sorted by position first and year second (up to 50 rows, untruncated).
(
    df_grouped
    .withColumn("row_number", row_number().over(window_spec))
    .orderBy("row_number", "creation_year")
    .show(n=50, truncate=False)
)

In [None]:
# Rank the post types within each year by descending total views using rank(),
# which gives tied totals the same rank. The column is named "rank" so the
# displayed header matches the function that produced it (it was previously
# labelled "row_number", a leftover from the row_number() variant).
# The result is sorted by rank first and year second; up to 50 rows are shown
# without truncating values.
(
    df_grouped
    .withColumn("rank", rank().over(window_spec))
    .orderBy(["rank", "creation_year"], ascending=True)
    .show(50, False)
)