In [1]:
# (Re)load IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell runs, so edits to imported code are picked up.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai.basics import *
from sklearn import preprocessing

In [3]:
# Root directory holding the raw competition CSVs. The default is the original
# location; set the BIMBO_DATA_DIR environment variable to run the notebook
# against a different directory without editing this cell.
data_path = Path(os.environ.get('BIMBO_DATA_DIR', '/home/jupyter/data/bimbo'))
# Scratch directory for intermediate parquet files. It is relative to the
# current working directory, not to data_path.
os.makedirs('tmp', exist_ok=True)

In [4]:
# Show what is in the data directory. NOTE(review): `ls` is not a standard
# pathlib.Path method; presumably it comes from the fastai star import - confirm.
data_path.ls()

[PosixPath('/home/jupyter/data/bimbo/test.csv'),
 PosixPath('/home/jupyter/data/bimbo/zips'),
 PosixPath('/home/jupyter/data/bimbo/producto_tabla.csv'),
 PosixPath('/home/jupyter/data/bimbo/cliente_tabla.csv'),
 PosixPath('/home/jupyter/data/bimbo/nn-200-100'),
 PosixPath('/home/jupyter/data/bimbo/models'),
 PosixPath('/home/jupyter/data/bimbo/sample_submission.csv'),
 PosixPath('/home/jupyter/data/bimbo/town_state.csv'),
 PosixPath('/home/jupyter/data/bimbo/tmp'),
 PosixPath('/home/jupyter/data/bimbo/train.csv')]

In [5]:
# Read the three lookup tables from the data directory, in this order:
# geography (town/state), clients, products.
df_geo, df_client, df_product = (
    pd.read_csv(data_path/csv_name)
    for csv_name in ('town_state.csv', 'cliente_tabla.csv', 'producto_tabla.csv')
)

In [6]:
# Geography map: combine town and state into one "Town, State" string, encode
# each distinct string as an integer label, drop exact duplicate rows and cache
# the result as parquet.
le_g = preprocessing.LabelEncoder()
df_geo = df_geo.assign(Town_State=df_geo['Town'] + ', ' + df_geo['State'])
df_geo = (
    df_geo
    .assign(geo_lbl=le_g.fit_transform(df_geo['Town_State']))
    .drop_duplicates()
)
df_geo.to_parquet('tmp/geo_map.parquet')

In [7]:
# Client map: one integer label per client name, cached as parquet.
#
# Cliente_ID is the key label_df joins on, so this table must hold exactly one
# row per Cliente_ID. Dropping only fully identical rows is not enough: an ID
# that appears with two different spellings of the name would survive twice and
# duplicate every matching row in the left merge. (The competition's data notes
# describe such repeated IDs in cliente_tabla - verify against the file.)
# Keep the first occurrence of each ID, then encode the remaining names.
le_c = preprocessing.LabelEncoder()
df_client = df_client.drop_duplicates(subset='Cliente_ID', keep='first')
df_client = df_client.assign(
    client_lbl=le_c.fit_transform(df_client['NombreCliente'])
)
df_client.to_parquet('tmp/client_map.parquet')

In [8]:
# Product map: encode each distinct product name as an integer label, drop
# exact duplicate rows and cache the result as parquet.
le_p = preprocessing.LabelEncoder()
df_product = (
    df_product
    .assign(product_lbl=le_p.fit_transform(df_product['NombreProducto']))
    .drop_duplicates()
)
df_product.to_parquet('tmp/product_map.parquet')

In [9]:
def label_df(df):
    """Replace the raw ID columns of `df` with their integer label columns.

    Uses the module-level lookup frames `df_geo`, `df_client` and `df_product`
    to map `Agencia_ID` -> `geo_lbl`, `Cliente_ID` -> `client_lbl` and
    `Producto_ID` -> `product_lbl` through left merges, so no input row is lost.
    IDs that are missing from a lookup table get NaN in the label column.

    Args:
        df: DataFrame containing `Agencia_ID`, `Cliente_ID`, `Producto_ID`
            and `Semana` columns.

    Returns:
        A new DataFrame (the input is not modified) without the three ID
        columns, with exact duplicate rows removed, sorted by
        `client_lbl`, `product_lbl`, `geo_lbl`, `Semana`.
    """
    lookups = (
        (df_geo, 'Agencia_ID', 'geo_lbl'),
        (df_client, 'Cliente_ID', 'client_lbl'),
        (df_product, 'Producto_ID', 'product_lbl'),
    )
    for table, key, label in lookups:
        # A left merge emits one output row per matching lookup row, so a key
        # repeated in the lookup would silently multiply the input rows.
        # Keep a single (first) label per key to make the join many-to-one.
        mapping = table[[key, label]].drop_duplicates(subset=key)
        df = df.merge(mapping, how='left', on=key)

    df = df.drop(columns=['Agencia_ID', 'Cliente_ID', 'Producto_ID'])
    # NOTE(review): once the IDs are gone, rows that differed only by ID but
    # share the same labels collapse here - confirm that is intended.
    df = df.drop_duplicates()
    return df.sort_values(by=['client_lbl', 'product_lbl', 'geo_lbl', 'Semana'])

In [10]:
# Load the raw test set. The first run parses the CSV and caches it as parquet;
# later runs read the cache. This cell defines `df_test`, so it must work on a
# fresh kernel: read through `pd.read_parquet` (DataFrame has no `read_parquet`
# method, and `df_test` does not exist before this cell).
test_raw_path = Path('tmp/test_raw.parquet')
if test_raw_path.exists():
    df_test = pd.read_parquet(test_raw_path)
else:
    df_test = pd.read_csv(data_path/'test.csv')
    df_test.to_parquet(test_raw_path)

In [11]:
# Swap the raw IDs for label columns, shrink the column dtypes and cache the
# labelled test set as parquet.
# NOTE(review): the unsigned integer targets assume every value is non-null and
# within range - confirm that no ID is missing from the lookup tables.
df_test = label_df(df_test)
df_test = df_test.astype(dict(
    Semana=np.uint8,
    Canal_ID=np.uint8,
    Ruta_SAK=np.uint16,
    geo_lbl=np.uint16,
    client_lbl=np.uint32,
    product_lbl=np.uint16,
))
df_test.to_parquet('tmp/test_labeled.parquet')

In [12]:
# Load the raw training set. The first run parses the CSV and caches it as
# parquet; later runs read the cache. This cell defines `df_train`, so it must
# work on a fresh kernel: read through `pd.read_parquet` (DataFrame has no
# `read_parquet` method, and `df_train` does not exist before this cell).
train_raw_path = Path('tmp/train_raw.parquet')
if train_raw_path.exists():
    df_train = pd.read_parquet(train_raw_path)
else:
    df_train = pd.read_csv(data_path/'train.csv')
    df_train.to_parquet(train_raw_path)

In [13]:
# Swap the raw IDs for label columns, shrink the column dtypes and cache the
# labelled training set as parquet.
# NOTE(review): the uint16 targets hold at most 65535, and the cast is not
# range-checked here - confirm the maxima of the unit-count columns (sales,
# returns, demand) fit before relying on this cast.
df_train = label_df(df_train)
df_train = df_train.astype(dict(
    Semana=np.uint8,
    Canal_ID=np.uint8,
    Ruta_SAK=np.uint16,
    Venta_uni_hoy=np.uint16,
    Venta_hoy=np.float32,
    Dev_uni_proxima=np.uint16,
    Dev_proxima=np.float32,
    Demanda_uni_equil=np.uint16,
    geo_lbl=np.uint16,
    client_lbl=np.uint32,
    product_lbl=np.uint16,
))
df_train.to_parquet('tmp/train_labeled.parquet')

In [14]:
# Keep a random 10% of the labelled training rows for quick experiments and
# cache it as parquet. The fixed random_state makes the subset identical on
# every run; without it, each Restart & Run All would save a different sample.
# From here on `df_train` refers to this sample, not the full training set.
df_train = df_train.sample(frac=0.1, random_state=42).reset_index(drop=True)
df_train.to_parquet('tmp/train_small.parquet')

In [15]:
# Preview the first rows of the sampled, labelled training frame.
df_train.head()

Unnamed: 0,Semana,Canal_ID,Ruta_SAK,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,geo_lbl,client_lbl,product_lbl
0,4,11,3913,4,41.919998,0,0.0,4,162,92721,591
1,4,1,2105,2,23.059999,0,0.0,2,11,230616,1806
2,6,1,1206,5,44.599998,0,0.0,5,78,78833,1551
3,5,1,2811,10,52.799999,13,68.639999,0,79,230616,1008
4,6,1,1131,2,13.54,0,0.0,2,229,206884,603
