In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib, pickle

In [4]:
from subprocess import check_output

# Show what is available in the shared data folder.
raw_listing = check_output(["ls", "../../../data"])
print(raw_listing.decode("utf8"))

holidays_events.csv
items
items.csv
oil.csv
sample_submission.csv
stores.csv
test.csv
train.csv
transactions.csv



In [3]:
# Any results you write to the current directory are saved as output.
####################################################################
#
# Modified based on the script from ArjanGroen
# Improved the NaN column handling
# https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65/code
#
####################################################################
def reduce_mem_usage(props,prompt=True):
	"""Downcast the numeric columns of `props` to narrower dtypes, in place.

	Parameters
	----------
	props : pandas.DataFrame
		Frame to shrink. It is modified in place and also returned.
	prompt : bool, default True
		When true, print the memory usage before/after and the dtype change
		of every processed column.

	Returns
	-------
	pandas.DataFrame
		The same `props` object, with converted columns.

	Notes
	-----
	* Non-numeric columns (strings, categoricals, datetimes, ...) are skipped.
	* Columns containing missing values are cast to float32.
	* Columns whose every value is a whole number get the narrowest unsigned
	  (min >= 0) or signed integer dtype whose range contains min and max.
	* All remaining columns are cast to float32, which can lose precision.
	"""
	nan_cols=set(props.columns[props.isnull().any()])
	if prompt:
		start_mem_usg = props.memory_usage().sum() / 1024**2
		print("Memory usage of properties dataframe is :",start_mem_usg," MB")
	unsigned_types=(np.uint8,np.uint16,np.uint32,np.uint64)
	signed_types=(np.int8,np.int16,np.int32,np.int64)
	for col in props.columns:
		# Skip every non-numeric dtype, not only `object`: string, category and
		# datetime columns cannot go through the integer test below.
		if not pd.api.types.is_numeric_dtype(props[col].dtype):
			continue
		if prompt:
			# Print current column type
			print("******************************")
			print("Column: ",col)
			print("dtype before: ",props[col].dtype)

		if col in nan_cols:
			# Integer dtypes cannot hold NaN, so these columns stay floating point.
			if prompt:
				print('Column: %s has NAN values'%col)
			new_dtype=np.float32
		else:
			series=props[col]
			mx=series.max()
			mn=series.min()
			# Element-wise comparison: summing the signed differences would let
			# fractional parts cancel out (e.g. 0.5 and -0.5).
			is_int=bool((series==series.astype(np.int64)).all())
			if is_int:
				# Leave the column untouched unless a narrower type fits
				# (also covers the empty column, where min/max are NaN).
				new_dtype=None
				if len(series):
					lo,hi=int(mn),int(mx)
					candidates=unsigned_types if lo>=0 else signed_types
					for candidate in candidates:
						bounds=np.iinfo(candidate)
						if bounds.min<=lo and hi<=bounds.max:
							new_dtype=candidate
							break
			else:
				# Make float datatypes 32 bit
				new_dtype=np.float32

		if new_dtype is not None:
			# Plain column assignment replaces the column. `props.loc[:,col] = ...`
			# writes into the existing array on pandas >= 2.0 and would keep the
			# original dtype.
			props[col]=props[col].astype(new_dtype)

		if prompt:
			# Print new column type
			print("dtype after: ",props[col].dtype)
			print("******************************")

	if prompt:
		# Print final result
		print("___MEMORY USAGE AFTER COMPLETION:___")
		mem_usg = props.memory_usage().sum() / 1024**2
		print("Memory usage is: ",mem_usg," MB")
		print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
	return props

def encoder(data,group=False,allow_nan=False):
	"""Label-encode every column of `data` as integer codes.

	Each column is mapped to the rank of its value among the column's sorted
	unique values (0 for the smallest).

	Parameters
	----------
	data : pandas.DataFrame
		Columns to encode.
	group : bool, default False
		When true (and there are at least two columns), combine the per-column
		codes into a single integer per row using a mixed-radix scheme, with
		columns ordered by increasing number of unique values.
	allow_nan : bool, default False
		Only used with `group=True`. Shifts the codes of the lowest-cardinality
		column by one, so the combined code 0 is never produced (presumably to
		keep 0 free as a "no match" marker for callers — confirm with usage).

	Returns
	-------
	numpy.ndarray or list of numpy.ndarray
		* one column in `data`: its int64 code array (regardless of `group`);
		* `group=True`: one int64 array with the combined code per row;
		* otherwise: a list with one int64 code array per column, in column order.
	"""
	results={}
	paces={}
	cols=data.columns
	for col in cols:
		# `return_inverse` yields the index of each value in the sorted uniques.
		# Rewriting values in place by their index collides when a value equals
		# an index already written (e.g. [-1, 0]), and leaves string columns as
		# object arrays.
		items,codes=np.unique(np.asarray(data[col]),return_inverse=True)
		results[col]=codes.reshape(-1).astype(np.int64)
		if len(cols)<2:
			return results[col]
		paces[col]=len(items)
	if group:
		ordered=sorted(paces.items(), key=lambda d: d[1])
		if allow_nan and ordered:
			col,stp=ordered[0]
			results[col]=results[col]+1
			ordered[0]=(col,stp+1)
		# np.int was removed in NumPy 1.24; use an explicit width instead.
		ary=np.zeros(len(data),dtype=np.int64)
		factor=1
		for col,stp in ordered:
			ary=ary+factor*results[col]
			factor*=stp
		return ary
	return list(results.values())

def dump(obj,file_name,level=6):
	"""Write `obj` to `file_name` with joblib.

	`level` is passed on as joblib's `compress` argument and the pickle
	protocol is fixed to `pickle.HIGHEST_PROTOCOL`. Returns None.
	"""
	joblib.dump(
		value=obj,
		filename=file_name,
		compress=level,
		protocol=pickle.HIGHEST_PROTOCOL,
	)

def load(file_name):
	"""Read back and return the object stored in `file_name` via joblib.load."""
	restored=joblib.load(file_name)
	return restored


In [None]:
# Read the raw training set from the shared data folder.
train_csv='../../../data/train.csv'
data=pd.read_csv(train_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Builds and dumps the calendar, products, stores, transactions, oil and
# holidays tables, then merges them all onto train+test and dumps the result.
# In every dumped table the string `date` is replaced by the calendar's
# integer `day_idx` (renamed to `date`), which is the shared date key.
MIN_DATE='2012-03-02'
MAX_DATE='2017-12-26'
dump_fold=''
read_fold='../../../data/'


def _date_to_day_idx(frame,calendar):
	"""Swap the string `date` column of `frame` for the calendar's integer `day_idx`.

	Left-merges `frame` with the `date`/`day_idx` columns of `calendar`, drops
	the string column and renames `day_idx` to `date`.
	"""
	keyed=frame.merge(calendar[['date','day_idx']],how='left',on='date')
	return keyed.drop('date',axis=1).rename(columns={'day_idx':'date'})


# ---- Calendar -------------------------------------------------------------
print('Building Calendar Dataframe...')
data=pd.DataFrame(data={'date':pd.date_range(MIN_DATE,MAX_DATE)})
data['year']=data['date'].dt.year
data['quarter']=data['date'].dt.quarter
data['month']=data['date'].dt.month
data['day']=data['date'].dt.day
data['dow']=data['date'].dt.weekday
# ISO week number. `.dt.weekofyear` was removed in pandas 2.0, while
# Timestamp.isocalendar() is available in every pandas version.
data['woy']=data['date'].map(lambda day: day.isocalendar()[1])
data['wom']=(data['day']-data['dow']+6)//7
data['doy']=data['date'].dt.dayofyear
data['day_idx']=data.index
data['date']=data['date'].astype(str)
data=reduce_mem_usage(data)
# Keep the (small) calendar in memory instead of re-loading the dump for every merge.
calendar_df=data
print('Building Calendar Dataframe Done!\n')
print('Dumpping Calendar Dataframe...')
dump(data,'%scalendar.dataframe.gz'%dump_fold)
print('Dumpping Calendar Dataframe Done!')

# ---- Products -------------------------------------------------------------
print('Loading initial data dataframe...')
print('loading items data..')
data = pd.read_csv('%sitems.csv'%read_fold)
data['family'],data['class']=encoder(data[['family','class']])
print('Loading full initial dataframe done!')
data=reduce_mem_usage(data)

print('Dumpping Products Dataframe...')
dump(data,'%sproducts.dataframe.gz'%dump_fold)
print('Dumpping Products Dataframe Done!')

# ---- Stores ---------------------------------------------------------------
print('Loading initial data dataframe...')
print('loading stores data..')
data = pd.read_csv('%sstores.csv'%read_fold)
data['type'],data['city'],data['state']=encoder(data[['type','city','state']])
data=data.rename(columns={'type':'store_type'})
print('Loading full initial dataframe done!')
data=reduce_mem_usage(data)

print('Dumpping Stores Dataframe...')
dump(data,'%sstores.dataframe.gz'%dump_fold)
print('Dumpping Stores Dataframe Done!')

# ---- Transactions ---------------------------------------------------------
print('Loading initial data dataframe...')
print('loading transactions data..')
data = pd.read_csv('%stransactions.csv'%read_fold)
data['date']=data['date'].astype(str)
data=reduce_mem_usage(_date_to_day_idx(data,calendar_df))
print('packing data...')
print('Loading initial dataframe done!')

print('Dumpping Data Initial Dataframe...')
dump(data,'%stransactions.dataframe.gz'%dump_fold)
print('Dumpping Data Initial Dataframe Done!')

# ---- Oil price ------------------------------------------------------------
# The message used to say "Calendar" (copy-paste); this step builds the oil table.
print('Building Oil Price Dataframe...')
data=pd.read_csv('%soil.csv'%read_fold)
data=reduce_mem_usage(_date_to_day_idx(data[data['dcoilwtico'].notnull()],calendar_df))
print('Building Oil Price Dataframe Done!\n')

print('Dumpping Oil Price Dataframe...')
dump(data,'%soil.dataframe.gz'%dump_fold)
print('Dumpping Oil Price Dataframe Done!')

# ---- Holidays -------------------------------------------------------------
print('Loading initial data dataframe...')
print('loading holidays_events data..')
data = pd.read_csv('%sholidays_events.csv'%read_fold)
data['date']=data['date'].astype(str)
# Missing/False -> 0, True -> 1. Built as a new column, because writing ints
# into a bool column through .loc is a deprecated upcast in recent pandas.
data['transferred']=(data['transferred']==True).astype(np.uint8)
data=data.rename(columns={'type':'holiday_type'})
print('Loading full initial dataframe done!')

stores = pd.read_csv('%sstores.csv'%read_fold)[['store_nbr','city','state']]
# Local holidays apply to the stores of the named city, regional ones to the
# stores of the named state.
local_holidays=data[data['locale']=='Local'].merge(stores,how='left',left_on='locale_name',right_on='city')
regional_holidays=data[data['locale']=='Regional'].merge(stores,how='left',left_on='locale_name',right_on='state')
holidays = pd.concat([local_holidays,regional_holidays]).drop(['locale_name','city','state'],axis=1)

# National holidays apply to every store: give all stores the constant key
# 'National' so the inner merge pairs each national event with each store.
stores['locale']='National'
national_holidays=data[data['locale']=='National'].merge(stores[['store_nbr','locale']],how='inner',on='locale').drop(['locale_name'],axis=1)
holidays=pd.concat([holidays,national_holidays])

holidays['holiday']=encoder(holidays[['transferred','holiday_type','locale','description']],group=True,allow_nan=True)
holidays=holidays.drop(['holiday_type','locale','description','transferred'],axis=1)
holidays=_date_to_day_idx(holidays,calendar_df)
# A store can have several events on the same day; left-merging those onto
# train/test on (store_nbr, date) would duplicate sales rows. Keep one event
# per key (the first encountered).
holidays=holidays.drop_duplicates(subset=['store_nbr','date'],keep='first')
holidays=reduce_mem_usage(holidays)

print('Dumpping Holidays Dataframe...')
dump(holidays,'%sholidays.dataframe.gz'%dump_fold)
print('Dumpping Holidays Dataframe Done!')

# ---- Train + test ---------------------------------------------------------
print('loading train,test data..')
data = pd.concat([pd.read_csv('%strain.csv'%read_fold),pd.read_csv('%stest.csv'%read_fold)])
# One id per (item, store) pair.
data['unique']=data['item_nbr']*(data['store_nbr'].max()+1)+data['store_nbr']
data['date']=data['date'].astype(str)
# Missing/False -> 0, True -> 1 (same mapping as for `transferred`).
data['onpromotion']=(data['onpromotion']==True).astype(np.uint8)
print('Loading full initial dataframe done!')
data=data.merge(load('%sproducts.dataframe.gz'%dump_fold),how='left',on='item_nbr')
print('products merging is done!')
data = reduce_mem_usage(data)

print('Building data dataframe...')
print('Encapuslating full initial dataframe...')
# Here the whole calendar is merged in (all date features), not just day_idx.
data=data.merge(calendar_df,how='left',on='date').drop('date',axis=1).rename(columns={'day_idx':'date'})
print('calendar merging is done!')
data=data.merge(load('%sholidays.dataframe.gz'%dump_fold),how='left',on=['store_nbr','date'])
# 0 marks "no holiday"; the encoder was called with allow_nan=True.
data['holiday']=data['holiday'].fillna(0)
print('holidays merging is done!')
data=data.merge(load('%stransactions.dataframe.gz'%dump_fold),how='left',on=['store_nbr','date'])
print('transactions merging is done!')
data=data.merge(load('%sstores.dataframe.gz'%dump_fold),how='left',on='store_nbr')
print('stores merging is done!')
data=data.merge(load('%soil.dataframe.gz'%dump_fold),how='left',on='date')
print('oil merging is done!')
print('Encapuslating full initial dataframe done!')
data = reduce_mem_usage(data)
print("Memory usage of dataframe is :%f MB"%(1.0*data.memory_usage().sum()/1024**2))
print('Building data dataframe done!')
print('Dumpping Initial Data Dataframe...')
dump(data,'%sdata.initial.dataframe.gz'%dump_fold)
print('Dumpping Initial Data Dataframe Done!\n\n')

Building Calendar Dataframe...
Memory usage of properties dataframe is : 0.16227722168  MB
******************************
Column:  year
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  quarter
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  month
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  day
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  dow
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  woy
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  wom
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  doy
dtype before:  int64
dtype after:  uint16
***

FileNotFoundError: File b'../../../dataitems.csv' does not exist

In [1]:
import pandas as pd

In [4]:
# Comma-separated copy of the training data.
train1_csv = '../../../data/train1.csv'
data = pd.read_csv(train1_csv, sep=',')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Peek at the first three rows.
data.head(3)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,


In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib, pickle
# NOTE(review): the pipeline cell above dumps 'data.initial.dataframe.gz';
# this reads a differently named file — confirm which dump is intended.
data = joblib.load('data.initial.dataframe1.gz')
data.head(3)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,family,class,perishable,year,quarter,...,wom,doy,date,holiday,transactions,city,state,store_type,cluster,dcoilwtico
0,0,25,103665,7.0,0,5,187,1,2013,1,...,0,1,305,2785,770.0,3,20,13,1,
1,1,25,105574,1.0,0,12,31,0,2013,1,...,0,1,305,2785,770.0,3,20,13,1,
2,2,25,105575,2.0,0,12,31,0,2013,1,...,0,1,305,2785,770.0,3,20,13,1,


In [25]:
# Summary statistics for `data` (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,class,perishable,year,quarter,month,day,dow,woy,wom,doy,date,holiday,transactions,cluster,dcoilwtico
count,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41915070.0,41910770.0,41915070.0,28501670.0
mean,20948420.0,27.0568,771394.6,9.014503,0.01314477,114.9473,0.2441704,2013.688,2.473581,6.392034,15.65919,3.043443,25.81838,2.250167,178.9529,734.0018,183.7587,1934.511,8.83712,90.91991
std,12086900.0,16.54522,399951.7,19.99962,0.1138946,85.99878,0.4295942,0.607648,1.166858,3.647878,8.776189,2.016848,15.86772,1.311237,111.3543,223.8735,694.6188,1054.517,4.693557,16.61651
min,0.0,1.0,96995.0,-1451.0,0.0,0.0,0.0,2013.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,305.0,0.0,203.0,1.0,44.08
25%,10478770.0,11.0,414454.0,2.0,0.0,31.0,0.0,2013.0,1.0,3.0,8.0,1.0,11.0,1.0,74.0,547.0,0.0,1181.0,4.0,90.88
50%,20956440.0,28.0,805311.0,4.0,0.0,81.0,0.0,2014.0,3.0,7.0,16.0,3.0,26.0,2.0,183.0,749.0,0.0,1619.0,9.0,95.83
75%,31430950.0,43.0,1083517.0,10.0,0.0,188.0,0.0,2014.0,4.0,10.0,23.0,5.0,40.0,3.0,278.0,933.0,0.0,2485.0,13.0,101.88
max,41832350.0,54.0,1909770.0,12021.0,1.0,336.0,1.0,2015.0,4.0,12.0,31.0,6.0,52.0,5.0,365.0,1090.0,5539.0,8256.0,17.0,110.62


In [24]:
# Columns to coerce to numeric dtypes (`unit_sales` is deliberately not listed,
# as in the original cell). The right-hand side must select the same columns
# as the left-hand side: `data[[]]` selects none, so nothing was converted.
numeric_cols = [
 'id',
 'store_nbr',
 'item_nbr',
 'onpromotion',
 'family',
 'class',
 'perishable',
 'year',
 'quarter',
 'month',
 'day',
 'dow',
 'woy',
 'wom',
 'doy',
 'date',
 'holiday',
 'transactions',
 'city',
 'state',
 'store_type',
 'cluster',
 'dcoilwtico']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

count    4.191507e+07
mean     2.013688e+03
std      6.076480e-01
min      2.013000e+03
25%      2.013000e+03
50%      2.014000e+03
75%      2.014000e+03
max      2.015000e+03
Name: year, dtype: float64

In [21]:
# Column names of the merged frame, as a plain list.
list(data.columns)

['id',
 'store_nbr',
 'item_nbr',
 'unit_sales',
 'onpromotion',
 'family',
 'class',
 'perishable',
 'year',
 'quarter',
 'month',
 'day',
 'dow',
 'woy',
 'wom',
 'doy',
 'date',
 'holiday',
 'transactions',
 'city',
 'state',
 'store_type',
 'cluster',
 'dcoilwtico']