# Sample Data

## Load Sample Data

In [1]:
# import packages
import numpy as np
import pandas as pd
import janitor # !pip install pyjanitor==0.23.1

In [2]:
sample_data_dir = 'data/sample_data/'


def _read_clean(file_name, delimiter):
    """Load one file from ``sample_data_dir`` and normalise its column names.

    ``skiprows=[1]`` drops the line immediately after the header line.
    NOTE(review): presumably that line is not a data record (e.g. a
    separator or type row) -- confirm, otherwise the first record of every
    file is silently discarded.
    """
    frame = pd.read_csv(sample_data_dir + file_name, sep=delimiter, skiprows=[1])
    # clean_names() is the pyjanitor DataFrame method (see `import janitor`).
    return frame.clean_names()


# Comma-delimited sample extracts.
# NOTE(review): these three frames come back with an `unnamed_0` column
# (visible in the previews below); it looks like a saved row index -- confirm
# whether it is needed before it is carried into later joins.
individual_sample = _read_clean('individual_sample.csv', ',')
sales_sample = _read_clean('sales_data_sample.csv', ',')
vehicle_sample = _read_clean('vehicle_sample.csv', ',')

# Pipe-delimited lookup tables.
product = _read_clean('product.csv', '|')
store = _read_clean('store.csv', '|')

In [3]:
# (rows, columns) of the individual table, followed by a five-row preview.
print(individual_sample.shape)
individual_sample.head(n=5)

(99999, 5)


Unnamed: 0,unnamed_0,mzb_indiv_id,email_optin_ind,ah1_res_bus_indc,supp1_bus_pander
0,2302009,263098627,Y,R,N
1,12836285,369527724,Y,R,N
2,16337753,552976498,Y,R,N
3,14239232,464651569,Y,R,N
4,3117446,267341408,Y,R,N


In [4]:
# (rows, columns) of the product table, followed by a five-row preview.
print(product.shape)
product.head(n=5)

(56841, 13)


Unnamed: 0,article_id,prod_group_code,prod_group_desc,category_code,category_desc,segment_code,segment_desc,class_code,class_desc,discount_flag,cross_section,aspect_ratio,rim_size
0,20257,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,37.0,S/T Performance Tires,N,225,60,16
1,53872,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,37.0,S/T Performance Tires,N,215,60,14
2,20192,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,37.0,S/T Performance Tires,N,195,65,15
3,45500,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,37.0,S/T Performance Tires,N,175,70,13
4,20001,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,37.0,S/T Performance Tires,N,205,70,14


In [5]:
# (rows, columns) of the sales table, followed by a five-row preview.
print(sales_sample.shape)
sales_sample.head(n=5)

(4000504, 9)


Unnamed: 0,unnamed_0,store_id,tran_id,date,article_id,indiv_id,vehicle_id,units,sales
0,1,337934,990994590,2018-07-23,7001647,318422234.0,944814728,0.0,-8.55
1,2,337934,990994590,2018-07-23,7001715,318422234.0,944814728,0.0,-50.45
2,3,337934,990994590,2018-07-23,7096547,318422234.0,944814728,0.0,10.0
3,4,337934,990994590,2018-07-23,7001647,318422234.0,944814728,0.0,0.0
4,5,337934,990994590,2018-07-23,7001715,318422234.0,944814728,0.0,0.0


In [6]:
# (rows, columns) of the store table, followed by a five-row preview.
print(store.shape)
store.head(n=5)

(2813, 4)


Unnamed: 0,store_id,state_code,zip_code,msa
0,244240,NE,68134,
1,244233,MO,64151,
2,227602,KY,40272,
3,227603,KY,40219,
4,232365,IN,46142,


In [7]:
# (rows, columns) of the vehicle table, followed by a five-row preview.
print(vehicle_sample.shape)
vehicle_sample.head(n=5)

(99999, 6)


Unnamed: 0,unnamed_0,vehicle_id,make,model,sub_model,model_year
0,4616962,949893706,NISSAN,MURANO SV,SV,2014
1,12535557,953720646,CHEVROLET,MONTE CARLO LT,LT,2005
2,8953545,949554501,MERCEDES-BENZ,C300 4MATIC LUXURY,4Matic Luxury,2010
3,15025103,879337349,DODGE,GRAND CARAVAN SXT,SXT,2005
4,11657057,946122136,DODGE,DURANGO SLT,SLT 4x4,1999


## Join Sample Data

In [8]:
# Cast the store join key to string on both sides so the store merge compares
# like with like, and keep ZIP codes as text rather than numbers.
store['store_id'] = store['store_id'].astype(str)
sales_sample['store_id'] = sales_sample['store_id'].astype(str)
# If read_csv parsed zip_code as an integer, leading zeros are already gone
# (e.g. 02134 -> 2134) and a plain string cast does not bring them back, so
# left-pad to five characters. Values of five or more characters are unchanged.
# NOTE(review): assumes 5-digit US ZIP codes -- confirm against the source file.
store['zip_code'] = store['zip_code'].astype(str).str.zfill(5)

In [9]:
# Give the individual table the same key name (`indiv_id`) that the sales
# table uses, so the two can be joined with a plain `on=`.
individual_sample = individual_sample.rename(
    columns={'mzb_indiv_id': 'indiv_id'},
)

In [10]:
# Start from the sales rows and left-join each lookup table in turn, so every
# sales row is kept whether or not a match exists.
# NOTE(review): sales_sample, individual_sample and vehicle_sample each carry
# an `unnamed_0` column, which pandas disambiguates with _x/_y suffixes here.
mega_table = (
    sales_sample
    .merge(product, how='left', on='article_id')
    .merge(store, how='left', on='store_id')
    .merge(individual_sample, how='left', on='indiv_id')
    .merge(vehicle_sample, how='left', on='vehicle_id')
)

In [11]:
# check the dimensions of the mega table
print(mega_table.shape)
print(sales_sample.shape)
# mega_table is built by left-joining lookup tables onto sales_sample, so its
# row count must equal that of sales_sample; extra rows would mean a lookup
# table has duplicate join keys. Enforce this instead of eyeballing the prints.
assert mega_table.shape[0] == sales_sample.shape[0], (
    'left joins changed the row count: '
    f'{mega_table.shape[0]} != {sales_sample.shape[0]}'
)

(4000504, 33)
(4000504, 9)


In [12]:
# Column-count sanity check: the mega table should have every column of the
# five source tables, minus one per join because each of the four merges is
# keyed on a single shared column that appears only once in the result.
source_tables = (individual_sample, product, sales_sample, store, vehicle_sample)
expected_n_columns = sum(len(table.columns) for table in source_tables) - 4
len(mega_table.columns) == expected_n_columns


True

In [13]:
# Five-row preview of the joined table.
mega_table.head(n=5)

Unnamed: 0,unnamed_0_x,store_id,tran_id,date,article_id,indiv_id,vehicle_id,units,sales,prod_group_code,...,msa,unnamed_0_y,email_optin_ind,ah1_res_bus_indc,supp1_bus_pander,unnamed_0,make,model,sub_model,model_year
0,1,337934,990994590,2018-07-23,7001647,318422234.0,944814728,0.0,-8.55,4.0,...,HAGERSTOWN,11239886,Y,R,N,,,,,
1,2,337934,990994590,2018-07-23,7001715,318422234.0,944814728,0.0,-50.45,4.0,...,HAGERSTOWN,11239886,Y,R,N,,,,,
2,3,337934,990994590,2018-07-23,7096547,318422234.0,944814728,0.0,10.0,4.0,...,HAGERSTOWN,11239886,Y,R,N,,,,,
3,4,337934,990994590,2018-07-23,7001647,318422234.0,944814728,0.0,0.0,4.0,...,HAGERSTOWN,11239886,Y,R,N,,,,,
4,5,337934,990994590,2018-07-23,7001715,318422234.0,944814728,0.0,0.0,4.0,...,HAGERSTOWN,11239886,Y,R,N,,,,,
