In [None]:
!pip install pyspark 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
input_dir = "../input"
print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start a Spark session named "DEMO", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DEMO")
    .getOrCreate()
)

# Import the data into spark

In [None]:
# Load the training CSV: the first row is the header, column types are inferred.
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("../input/train.csv")
)

# Validate the data points


In [None]:
# Print the inferred column names and types.
df.printSchema()

In [None]:
from pyspark.sql.functions import to_date

# Convert the Date column with to_date() so it is stored as a date.
df = df.withColumn("Date", to_date(df["Date"]))

In [None]:
# Re-check the schema after the Date conversion.
df.printSchema()

In [None]:
# (column name, type) pairs for every column.
df.dtypes

In [None]:
# Preview the data; show() prints the first 20 rows by default.
df.show()

# Check Null Values

In [None]:
from pyspark.sql.functions import *

In [None]:
# Count the NULL entries of every column. when() without otherwise() yields
# NULL for rows that do not match, and count() skips NULLs, so each output
# column holds the number of NULLs in the input column of the same name.
null_counts = [count(when(isnull(c), 'c')).alias(c) for c in df.columns]
df.select(null_counts).show()

In [None]:
# Summary statistics (describe) for the store, calendar and sales columns.
df.describe("Store","DayOfWeek","Date","Sales").show()

In [None]:
# Summary statistics (describe) for the customer count and the flag columns.
df.describe("Customers","Open","Promo","StateHoliday","SchoolHoliday").show()

# Take One Year Data for One Store 

In [None]:
# Restrict the data to a single store (Store 1) and a single calendar year (2013).
# year() returns an integer, so it is compared with the integer 2013 instead of
# the string "2013"; this avoids relying on implicit string-to-number coercion.
df1 = df.filter((df["Store"] == 1) & (year(df["Date"]) == 2013))

In [None]:
# Same summary as for the full data, now for the Store 1 / 2013 subset.
df1.describe("Store","DayOfWeek","Date","Sales").show()

In [None]:
# Summary of the customer count and flag columns for the Store 1 / 2013 subset.
df1.describe("Customers","Open","Promo","StateHoliday","SchoolHoliday").show()

In [None]:
# List the distinct StateHoliday values that occur in the subset.
df1.select("StateHoliday").distinct().show()

# Take Open Store Details

In [None]:
# Keep only the rows where Open is non-zero, ordered chronologically.
# The predicate references df1's own column (not the parent df's), so the
# filter is resolved against the DataFrame it is applied to.
df2 = df1.filter(df1["Open"] != 0).orderBy("Date")

In [None]:
# Summary of the store, calendar and sales columns for the open-day rows only.
df2.describe("Store","DayOfWeek","Date","Sales").show()

In [None]:
# Summary of the customer count and flag columns for the open-day rows only.
df2.describe("Customers","Open","Promo","StateHoliday","SchoolHoliday").show()

# Impute the missing data points

In [None]:
from datetime import *

In [None]:
# One single-element row per calendar day, starting at 2013-01-01 and covering
# 365 consecutive days; each row is a list so the result can be passed as rows
# of a one-column table.
base = date(2013,1,1)
new_date_list = [[base + timedelta(days=offset)] for offset in range(365)]

In [None]:
# Peek at the first few generated rows instead of echoing all 365 of them.
new_date_list[:5]

In [None]:
# Turn the generated calendar into a one-column Spark DataFrame named "Date".
test = spark.createDataFrame(data=new_date_list, schema=['Date'])

In [None]:
# Preview the calendar DataFrame.
test.show()

In [None]:
# Left-join the open-day records onto the full calendar so that every date has
# a row; dates without a matching record get NULLs in the other columns.
df3 = test.join(df2, on=["Date"], how="leftouter").orderBy("Date")

In [None]:
# Row count after the join.
df3.count()

In [None]:
# Schema of the joined DataFrame.
df3.printSchema()

In [None]:
# Preview the joined rows (NULLs mark dates that had no record).
df3.show()

In [None]:
# Rows whose Store is NULL get the store id 1; other rows keep their value.
store_is_missing = df3["Store"].isNull()
df3 = df3.withColumn("Store", when(store_is_missing, lit(1)).otherwise(df3["Store"]))

In [None]:
# Fill DayOfWeek only where it is NULL, keeping the values that came with the
# data. Spark's dayofweek() numbers days 1 = Sunday ... 7 = Saturday; the
# expression below shifts that to 1 = Monday ... 7 = Sunday.
# NOTE(review): this assumes the source column uses the Monday = 1 convention
# (as in the Rossmann train.csv) - confirm against the raw data.
monday_based_day = ((dayofweek("Date") + 5) % 7) + 1
df3 = df3.withColumn("DayOfWeek", coalesce(df3["DayOfWeek"], monday_based_day))

In [None]:
# Rows whose Sales is NULL are given 0 sales; other rows keep their value.
sales_is_missing = df3["Sales"].isNull()
df3 = df3.withColumn("Sales", when(sales_is_missing, lit(0)).otherwise(df3["Sales"]))

In [None]:
# Rows whose Customers is NULL are given 0 customers; other rows keep their value.
customers_is_missing = df3["Customers"].isNull()
df3 = df3.withColumn(
    "Customers",
    when(customers_is_missing, lit(0)).otherwise(df3["Customers"]),
)

In [None]:
# Rows whose Open flag is NULL are marked 0; other rows keep their value.
open_is_missing = df3["Open"].isNull()
df3 = df3.withColumn("Open", when(open_is_missing, lit(0)).otherwise(df3["Open"]))

In [None]:
# Rows whose Promo flag is NULL are marked 0; other rows keep their value.
promo_is_missing = df3["Promo"].isNull()
df3 = df3.withColumn("Promo", when(promo_is_missing, lit(0)).otherwise(df3["Promo"]))

In [None]:
# Rows whose SchoolHoliday flag is NULL are marked 0; other rows keep their value.
school_holiday_is_missing = df3["SchoolHoliday"].isNull()
df3 = df3.withColumn(
    "SchoolHoliday",
    when(school_holiday_is_missing, lit(0)).otherwise(df3["SchoolHoliday"]),
)

In [None]:
# Preview the rows after the NULL fills.
df3.show()

In [None]:
# Add the week number of each date.
# NOTE(review): weekofyear() uses ISO-8601 weeks, so 2013-12-30 and 2013-12-31
# are numbered week 1 and will be pooled with early January when grouping by
# this column - confirm that is acceptable.
df3 = df3.withColumn("WeekOfYear", weekofyear(df3["Date"]))

In [None]:
# Preview the rows with the new WeekOfYear column.
df3.show()

# Aggregate the data to week level

In [None]:
# Total Sales per week number, ordered by week; the result column is "sum(Sales)".
df4 = (
    df3.groupBy("WeekOfYear")
    .agg({"Sales": "sum"})
    .orderBy("WeekOfYear")
)

In [None]:
# Preview the weekly totals.
df4.show()



# Generate features from the aggregated data. The features are: 1) cumulative sales of the previous week, 2) cumulative sales of the second-previous week, 3) cumulative sales of the third-previous week.


In [None]:
from pyspark.sql import Window
from pyspark.sql import functions as F

In [None]:
# Attach to every row the total Sales of its own week. All rows of a partition
# share one WeekOfYear value, so the range frame ending at the current row
# spans the whole partition.
week_window = (
    Window.partitionBy('WeekOfYear')
    .orderBy('WeekOfYear')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
df3_cum = df3.withColumn('week_sales', F.sum('Sales').over(week_window))

In [None]:
# Sort by week number, then preview the result.
df3_cum = df3_cum.sort('WeekOfYear')
df3_cum.show()

In [None]:
# Running (year-to-date) total: for each row, the sum of Sales over all rows
# whose week number is <= the row's own week number.
# The window is partitioned by Store (a single store here) rather than by an
# always-false predicate such as `WeekOfYear == 0`, which only disguises a
# single partition; this also matches how the all-stores window is keyed.
# NOTE(review): this is a cumulative total, not a lagged "previous week" value
# - confirm it is the feature the section heading intends.
cumulative_window = (
    Window.partitionBy('Store')
    .orderBy('WeekOfYear')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
df3_cum = df3_cum.withColumn('week2_sales', F.sum('Sales').over(cumulative_window))

In [None]:
# Sort by week number and show the two derived columns side by side.
df3_cum = df3_cum.sort('WeekOfYear')
df3_cum.select("week_sales", "week2_sales").show()

# All Stores

In [None]:
# Keep the records dated on or before 31 December 2013.
# The cut-off has to be a valid yyyy-MM-dd date: a day/month-swapped literal
# such as "2013-31-12" cannot be parsed as a date, and (depending on the Spark
# version) comparing a date column with it yields NULL, i.e. no rows at all.
cutoff = to_date(lit("2013-12-31"))
x = df.filter(df["Date"] <= cutoff)
# Last expression of the cell, so the filtered row count is displayed rather
# than computed and thrown away.
x.count()


# Dates are already continuous for the full dataset

In [None]:
# Number of open days per store, largest first.
open_days = x.filter("open=1").groupBy("Store").agg(count("Date"))
open_days.orderBy("count(Date)", ascending=False).show()

In [None]:
# Add the week number of each date.
# NOTE(review): weekofyear() uses ISO-8601 weeks, so the last days of December
# can be numbered week 1 - confirm that pooling them with January is acceptable.
x = x.withColumn("WeekOfYear", weekofyear(x["Date"]))

# Some stores are open every day

In [None]:
# Total Sales per store and week number; the result column is "sum(Sales)".
x3 = (
    x.groupBy("Store", "WeekOfYear")
    .agg({"Sales": "sum"})
    .orderBy("Store", "WeekOfYear")
)

# Generate features from the aggregated data. The features are: 1) cumulative sales of the previous week, 2) cumulative sales of the second-previous week, 3) cumulative sales of the third-previous week.


In [None]:
# Attach to every row the total Sales of its own (Store, WeekOfYear) group. All
# rows of a partition share one WeekOfYear value, so the range frame ending at
# the current row spans the whole partition.
store_week_window = (
    Window.partitionBy('Store', 'WeekOfYear')
    .orderBy('WeekOfYear')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
x_cum = x.withColumn('week_sales', F.sum('Sales').over(store_week_window))

In [None]:
# Sort by store, then by week number.
x_cum = x_cum.sort('Store', 'WeekOfYear')

In [None]:
# Preview the per-store weekly totals.
x_cum.select("Store","WeekOfYear","week_sales").show()


In [None]:
# Running (year-to-date) total per store: for each row, the sum of Sales over
# all rows of the same store whose week number is <= the row's own week number.
# The window is keyed by Store alone; an extra always-false key such as
# `WeekOfYear == 0` does not change the partitions and only obscures the intent.
# NOTE(review): this is a cumulative total, not a lagged "previous week" value
# - confirm it is the feature the section heading intends.
store_cumulative_window = (
    Window.partitionBy('Store')
    .orderBy('WeekOfYear')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
x3_cum = x_cum.withColumn('week2_sales', F.sum('Sales').over(store_cumulative_window))

In [None]:
# Sort by store, then by week number.
x3_cum = x3_cum.sort('Store', 'WeekOfYear')

In [None]:
# Preview the weekly total next to the running total for each store.
x3_cum.select("Store","WeekOfYear","week_sales","week2_sales").show()