In [None]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
!pip install py7zr

In [None]:
import py7zr
from subprocess import check_output

# Unpack every 7z archive found under the input directory into the working
# directory, then list what ended up there.
extract_dir = "/kaggle/working"

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # os.walk yields every file, not only archives; opening a non-7z file
        # with SevenZipFile would raise and abort the whole loop.
        if not py7zr.is_7zfile(archive_path):
            print("skipping non-7z file:", archive_path)
            continue
        # The context manager closes the archive even if extraction fails.
        with py7zr.SevenZipFile(archive_path, mode='r') as archive:
            archive.extractall(path=extract_dir)

# List the same directory the archives were extracted into.
print(check_output(["ls", extract_dir]).decode("utf8"))

In [None]:
# Load the smaller extracted tables with pandas' default dtypes.
# train.csv is deliberately not read here; it is loaded further down with
# explicit dtypes.
test, stores, items, trans, oil, holiday = map(
    pd.read_csv,
    (
        "../working/test.csv",
        "../working/stores.csv",
        "../working/items.csv",
        "../working/transactions.csv",
        "../working/oil.csv",
        "../working/holidays_events.csv",
    ),
)
print("done")

Since train.csv has about 125 million records, it is best to do some data engineering (reducing its memory footprint) before starting any analysis.

In [None]:
# Baseline memory footprint of the test frame (train is not loaded at this
# point, so only test is measured).
# deep=True makes pandas measure the real size of object (string) columns
# instead of counting only the pointers, so the baseline is not understated.
mem_test = test.memory_usage(index=True, deep=True).sum()
print("test dataset uses ",mem_test/ 1024**2," MB")

test.head()

In [None]:
# Before optimising test.csv, look at its contents: the per-column maximum,
# then the per-column minimum, and finally the current dtypes.
for column_extreme in (test.max, test.min):
    print(column_extreme())
print(test.dtypes)

In [None]:
# Downcast the integer columns of test to the narrowest unsigned dtype that can
# hold their values.
narrow_dtypes = [
    ('store_nbr', np.uint8),   # there are only 54 stores
    # id is a running record number across train and test.
    # NOTE(review): the ranges originally quoted here (1 to 128867502 in train,
    # 128867503 to 125497040 in test) look swapped - verify against the data.
    ('id', np.uint32),
    ('item_nbr', np.uint32),   # item number is unsigned
]
for column, narrow_dtype in narrow_dtypes:
    limits = np.iinfo(narrow_dtype)
    # An integer astype() can wrap out-of-range values around silently, so
    # check the range before casting.
    if not test[column].between(limits.min, limits.max).all():
        raise ValueError(
            "{} has values outside the {} range".format(column, np.dtype(narrow_dtype).name)
        )
    test[column] = test[column].astype(narrow_dtype)

# Converting the date column to date format
test['date'] = pd.to_datetime(test['date'], format="%Y-%m-%d")

# Check memory again. deep=True measures object columns by their real size;
# mem_test is the baseline computed in the earlier memory-check cell.
usage_by_column = test.memory_usage(index=True, deep=True)
print(usage_by_column)
new_mem_test = usage_by_column.sum()
print("test dataset uses ",new_mem_test/ 1024**2," MB after changes")
print("memory saved =",(mem_test-new_mem_test)/ 1024**2," MB")

# NOTE(review): the notebook originally quoted a saving of around 50%, measured
# without deep=True - re-check that figure against the numbers printed above.

In [None]:
# Per-column memory usage of test (the index is included by default).
print(test.memory_usage())

# Look up the representable range of float16 in a single finfo query.
float16_limits = np.finfo(np.float16)
min_value, max_value = float16_limits.min, float16_limits.max
print("range of float16 is",min_value,max_value)

In [None]:
# Compact dtypes applied when reading train.csv; the same mapping is reused by
# the second partial read below.
dtype_dict = dict(
    id=np.uint32,
    store_nbr=np.uint8,
    item_nbr=np.uint32,
    unit_sales=np.float32,
)

# First pass over train.csv: only the columns at positions 0, 2, 3 and 4.
train_part1 = pd.read_csv(
    "../working/train.csv",
    usecols=[0, 2, 3, 4],
    dtype=dtype_dict,
)
print(train_part1.dtypes)

In [None]:
# Second pass over train.csv: the two remaining columns (positions 1 and 5),
# which the code below addresses as 'date' and 'onpromotion'.
# The date column is named explicitly in parse_dates so the selection does not
# depend on how positional indices interact with usecols.
train_part2 = pd.read_csv(
    "../working/train.csv",
    dtype=dtype_dict,
    usecols=[1, 5],
    parse_dates=['date'],
)

# Split the date into compact integer parts. The DatetimeIndex is built once
# and reused instead of re-deriving it for every part.
dates = pd.DatetimeIndex(train_part2['date'])
train_part2['Year'] = dates.year.values.astype(np.uint16)
train_part2['Month'] = dates.month.values.astype(np.uint8)
train_part2['Day'] = dates.day.values.astype(np.uint8)
del train_part2['date']
# Drop the temporary index too, otherwise it keeps the full date column alive.
del dates

# Impute missing onpromotion values with 0.
# NOTE(review): the original comment said -1 while the code filled 0; the code's
# behaviour is kept - confirm which value was intended.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which recent pandas warns about
# and which does not update the frame under Copy-on-Write.
train_part2["onpromotion"] = train_part2["onpromotion"].fillna(0).astype(np.int8)
print(train_part2.head())
print(train_part2.dtypes)

In [None]:
# Join the two partial reads side by side (the pandas counterpart of R's cbind).
train = pd.concat([train_part1.reset_index(drop=True), train_part2], axis=1)
# The partial frames are temporary and no longer needed.
del train_part1, train_part2
# id only identifies rows and is not needed for analysis: take it out of the
# frame but keep the values around.
# NOTE(review): the name `id` shadows the Python builtin of the same name.
id = train.pop('id')
# Report memory per column, then compare the total with the extracted
# train.csv file, which is taken to be approximately 5 GB.
print(train.memory_usage())
mem_train = 5 * 1024**3
new_mem_train = train.memory_usage().sum()
print("Train dataset uses ",new_mem_train/ 1024**2," MB after changes")
print("memory saved is approx",(mem_train-new_mem_train)/ 1024**2," MB")

1.6 GB is a manageable size.

# Now let's look into the dataset



## Further, to make EDA easier, roll up the sales to different levels

 - Day-Store level
 - Day-Item level
 - Store level
 - Item level
 - Day level




 Store-day level sale -- This variable indicates the sale of a particular store over time

 Store-day level count -- This variable gives an indication of the variety/spread of the items sold

 Item-day level sale -- Sale of an item over time
 
 Item-day level count -- This gives an indication of the popularity of the item across the supermarket chain.


In [None]:
# Total unit_sales per day, once per store and once per item.
# NOTE(review): both names are reassigned by the aggregate_level1(train) call
# further down, so this cell repeats work that is redone there.
sale_day_store_level, sale_day_item_level = (
    train.groupby(['Year', 'Month', 'Day', level_key])['unit_sales'].sum()
    for level_key in ('store_nbr', 'item_nbr')
)

In [None]:
def _rollup_unit_sales(df, keys, sum_name, count_name):
    """Sum and count unit_sales per group of `keys`, returned as a flat frame.

    The group keys become ordinary columns, followed by the sum column
    (`sum_name`) and the count column (`count_name`).
    """
    # Group with the default as_index=True and flatten exactly once via
    # reset_index(). Combining as_index=False with reset_index() makes the
    # layout depend on the pandas version (an extra 'index' column can appear).
    rolled = df.groupby(keys)['unit_sales'].agg(['sum', 'count'])
    rolled = rolled.rename(columns={'sum': sum_name, 'count': count_name})
    return rolled.reset_index()


def aggregate_level1(df):
    """Roll unit_sales up to three aggregation levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Year, Month, Day, store_nbr, item_nbr and
        unit_sales.

    Returns
    -------
    tuple of three pandas.DataFrame
        sale_day_store_level : one row per (Year, Month, Day, store_nbr) with
            store_sales (sum of unit_sales) and item_variety (row count).
        sale_day_item_level : one row per (Year, Month, Day, item_nbr) with
            item_sales (sum) and store_spread (row count).
        sale_store_item_level : one row per (Year, store_nbr, item_nbr) with
            item_sales (sum) and entries (row count).
    """
    # day-store level
    sale_day_store_level = _rollup_unit_sales(
        df, ['Year', 'Month', 'Day', 'store_nbr'], 'store_sales', 'item_variety')

    # day-item level
    sale_day_item_level = _rollup_unit_sales(
        df, ['Year', 'Month', 'Day', 'item_nbr'], 'item_sales', 'store_spread')

    # store-item level (per year)
    sale_store_item_level = _rollup_unit_sales(
        df, ['Year', 'store_nbr', 'item_nbr'], 'item_sales', 'entries')

    return sale_day_store_level, sale_day_item_level, sale_store_item_level

In [None]:
import time
# Time the three roll-ups over the full train frame.
start_time = time.time()
(sale_day_store_level,
 sale_day_item_level,
 sale_store_item_level) = aggregate_level1(train)
end_time = time.time()

time_taken = end_time - start_time
print("This block took ",time_taken,"seconds")

In [None]:
# Write each roll-up to a CSV file of the same name in the current directory.
# to_csv keeps its default of writing the index as the first column.
for rollup_frame, csv_name in (
    (sale_day_store_level, "sale_day_store_level.csv"),
    (sale_day_item_level, "sale_day_item_level.csv"),
    (sale_store_item_level, "sale_store_item_level.csv"),
):
    rollup_frame.to_csv(csv_name)