# Ecommerce Categories

In [19]:
import numpy as np, pandas as pd
import os, sys
import math
import shutil
import zipfile
import string
import random
import json

# Paths and Variables

In [20]:
# Base name shared by every output file this notebook writes.
dataset_name = 'ecommerce_categories'

In [21]:
# The raw CSV is read from `input_dir`; generated files go to `output_dir`.
input_dir = './raw'
output_dir = './processed'

# Output locations, all derived from `dataset_name`.
outp_fname = os.path.join(output_dir, dataset_name + '.csv')
outp_train_fname = os.path.join(output_dir, dataset_name + '_train.csv')
outp_test_fname = os.path.join(output_dir, dataset_name + '_test.csv')
outp_test_key_fname = os.path.join(output_dir, dataset_name + '_test_key.csv')
outp_infer_instances = os.path.join(output_dir, dataset_name + '_infer_req.json')

In [22]:
# File name of the raw CSV, resolved relative to `input_dir` when loading.
inp_fname = 'ecommerceDataset.csv'

In [23]:
# Column names assigned on load (the raw CSV is read with header=None).
cols = ["category", "description"]

# Read data into a DataFrame

In [24]:
# Load the header-less raw CSV and label its columns with `cols`.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path, names=cols, header=None)
print(data.shape)
data.head()

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [25]:
# Inspect dtypes and per-column non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     50425 non-null  object
 1   description  50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [26]:
# Column roles referenced by the cells below.
id_col = "id"  # row identifier; inserted later if the frame does not have it
target_col = "category"  # label column; removed from the test split and kept in the test key
text_col = "description"  # product text; not referenced again in the visible cells

In [27]:
# Number of rows per category (class distribution of the target).
data[target_col].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

# Prepare Data

### Drop NaN rows

In [28]:
# Drop every row that has a missing value in any column.
data = data.dropna()

In [29]:
# Re-check the non-null counts after dropping rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50424 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     50424 non-null  object
 1   description  50424 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


# Insert Id Column

In [30]:
# Add a sequential integer id as the first column unless one already exists;
# the guard keeps this cell safe to re-run.
if id_col not in data.columns:
    N = len(data)
    row_ids = np.arange(N)  # 0 .. N-1, in the current row order
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())

   id   category                                        description
0   0  Household  Paper Plane Design Framed Wall Hanging Motivat...
1   1  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...
2   2  Household  SAF 'UV Textured Modern Art Print Framed' Pain...
3   3  Household  SAF Flower Print Framed Painting (Synthetic, 1...
4   4  Household  Incredible Gifts India Wooden Happy Birthday U...


# Shuffle Data

In [31]:
# shuffle data
# frac=1 samples every row, i.e. a full permutation; the fixed random_state
# makes the resulting order reproducible.
data = data.sample(frac=1, random_state=42)
print(data.shape)
data.head()

(50424, 3)


Unnamed: 0,id,category,description
35847,35847,Clothing & Accessories,BREGEO Men's Cotton Casual Blazer This one but...
13005,13005,Household,HealthSense Chef-Mate KS 50 Digital Kitchen Sc...
26164,26164,Books,Think & Grow Rich About the Author NAPOLEON HI...
38330,38330,Clothing & Accessories,ayushicreationa Women's Cotton Sports Padded B...
45344,45343,Electronics,BlueRigger High Speed Micro HDMI to HDMI Cable...


# Utility to save a DataFrame as a zipped CSV file

In [32]:
def save_df_to_zipped_csv(df, ftype=None, out_dir=None, name=None):
    """Write `df` as a single CSV member inside a zip archive.

    The archive is named `<name><suffix>.zip` and contains one member,
    `<name><suffix>.csv`, where `<suffix>` is `_<ftype>` when `ftype` is
    given and empty otherwise. The DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise.
    ftype : str, optional
        Tag appended to the base name (e.g. "train").
    out_dir : str, optional
        Destination directory. Defaults to the notebook-level `output_dir`.
        It is created if it does not exist yet.
    name : str, optional
        Base file name. Defaults to the notebook-level `dataset_name`.
    """
    # Fall back to the notebook configuration when no override is passed.
    if out_dir is None:
        out_dir = output_dir
    if name is None:
        name = dataset_name

    suffix = '' if ftype is None else f'_{ftype}'
    base_name = f'{name}{suffix}'

    # to_csv does not create missing directories, so make sure the target
    # exists (an empty path means "current directory": nothing to create).
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    compression_opts = dict(method='zip', archive_name=f'{base_name}.csv')
    df.to_csv(
        os.path.join(out_dir, f'{base_name}.zip'),
        index=False,
        compression=compression_opts,
    )

# Save Main Data File

In [33]:
# Uncompressed alternative, kept for reference (currently disabled):
# data.to_csv(outp_fname, index=False)

# Write the full shuffled dataset as a zipped CSV (no ftype tag, so the
# archive is named after `dataset_name` alone).
save_df_to_zipped_csv(data)

# Train Test Split

In [34]:
from sklearn.model_selection import train_test_split
test_size = 0.1

# Hold out a `test_size` fraction of the rows; the fixed seed keeps the
# split reproducible.
data_train, data_test = train_test_split(data, random_state=42, test_size=test_size)

# Keep the (id, label) pairs of the held-out rows as the answer key, then
# strip the label from the test set itself.
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(target_col, axis=1)
print(data_train.shape, data_test.shape, data_test_key.shape)

# Uncompressed alternative, kept for reference (currently disabled):
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)


(45381, 3) (5043, 2) (5043, 2)


In [35]:
# Write each split as its own zipped CSV, tagged with the split name.
for split_frame, split_tag in (
    (data_train, "train"),
    (data_test, "test"),
    (data_test_key, "test_key"),
):
    save_df_to_zipped_csv(split_frame, split_tag)

# JSON inference request instance

In [36]:
# Build a one-row sample inference request from the first row of the test set.
first_test_row = data_test.iloc[0].to_dict()

# json.dump cannot serialise NumPy scalars (e.g. an int64 id), and depending
# on the pandas version Series.to_dict may leave them in place, so convert
# any that remain to native Python values explicitly.
instance = {
    field: (value.item() if isinstance(value, np.generic) else value)
    for field, value in first_test_row.items()
}
infer_req_instance_dict = {"instances": [dict(instance)]}
print(infer_req_instance_dict)

# ensure_ascii=False writes non-ASCII characters as-is, hence the explicit
# utf8 encoding on the file handle.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'id': 37627, 'description': "Dada Shopy Comfort Fit Rayon Cotton Pant Palazzo for Women combo 2 (Black::White) Beautifully crafted regular wear rayon cotton pant palazzo to add grace to your normal attire. This can be carried upon formal's, casual or ethnic occation. Fabric:- Rayon Cotton Color:- Two color Size:- Free Size (Waist - Min 28 to Max 40 intches, Length - 41 intch, Hips - 43 intch) Fit :- Regular Fit"}]}
