In [9]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("pyspark-basics")
    .getOrCreate()
)

# Only the header option is set; no schema or inferSchema is requested here,
# so the numeric/date columns are cast explicitly further down.
df = (
    spark.read
    .option("header", True)
    .csv("gs://bucket-name/stackoverflowposts.csv")
)

# Keep just the three columns the analysis needs.
df_subset = df.select("tags", "view_count", "creation_date")

In [10]:
from pyspark.sql import functions as F

# Label every post: 'python' when the tags string begins with "python",
# 'others' otherwise (including rows where tags is null).
# NOTE(review): this is a prefix match on the whole tags string — if tags holds
# several delimiter-separated tags, posts where python is not the first tag are
# labelled 'others'. Confirm that is intended.
df_tagged = df_subset.withColumn(
    "post_type",
    F.when(F.col("tags").like("python%"), "python").otherwise("others"),
)

In [11]:
from pyspark.sql.types import TimestampType,IntegerType
from pyspark.sql.functions import year,month

# Cast the raw columns to proper types, then derive the calendar year and
# month of each post from the (now timestamp-typed) creation_date.
df_tagged = (
    df_tagged
    .withColumn("creation_date", F.col("creation_date").cast(TimestampType()))
    .withColumn("view_count", F.col("view_count").cast(IntegerType()))
    .withColumn("creation_year", year("creation_date"))
    .withColumn("creation_month", month("creation_date"))
)

In [12]:
# Restrict the data to python-labelled posts created in 2015.
df_tagged = df_tagged.where(
    (F.col("creation_year") == 2015) & (F.col("post_type") == "python")
)

In [13]:
# Drop everything except the grouping keys and the measure to aggregate.
df_tagged = df_tagged.select(
    "post_type",
    "creation_year",
    "creation_month",
    "view_count",
)

In [14]:
# One row per (post_type, creation_year, creation_month) holding the summed
# view_count for that month.
df_tagged = (
    df_tagged
    .groupBy("post_type", "creation_year", "creation_month")
    .agg(F.sum("view_count").alias("monthly_views"))
)

In [None]:
# Preview the monthly aggregate (show()'s defaults, spelled out explicitly).
df_tagged.show(n=20, truncate=True)

In [16]:
from pyspark.sql.window import Window

In [18]:

# Cumulative window: within each (post_type, creation_year) partition, ordered
# by month, the frame spans from the first row up to and including the current
# row.
running_sum_window_spec = (
    Window
    .partitionBy("post_type", "creation_year")
    .orderBy("creation_month")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [None]:

# Display the monthly views alongside their running total, in month order,
# without truncating cell contents.
(
    df_tagged
    .withColumn(
        "running_total_views",
        F.sum("monthly_views").over(running_sum_window_spec),
    )
    .orderBy("creation_month")
    .show(n=50, truncate=False)
)



In [20]:

# Trailing window: within each (post_type, creation_year) partition, ordered by
# month, the frame is the current row plus up to two preceding rows. The frame
# counts rows, not calendar months.
running_avg_window_spec = (
    Window
    .partitionBy("post_type", "creation_year")
    .orderBy("creation_month")
    .rowsBetween(-2, Window.currentRow)
)


In [None]:

# Display the monthly views alongside their trailing three-row average, in
# month order, without truncating cell contents.
(
    df_tagged
    .withColumn(
        "running_3monthly_avg_views",
        F.avg("monthly_views").over(running_avg_window_spec),
    )
    .orderBy("creation_month")
    .show(n=50, truncate=False)
)
