In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import traceback
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
transactions_by_customer = pd.read_csv('transactions_by_customer_DW.csv')

### Label encode age_range, family_size, income_bracket

In [3]:
transactions_by_customer['age_range'] = transactions_by_customer['age_range'].astype(str)
le = preprocessing.LabelEncoder()
transactions_by_customer['age_range'] = le.fit_transform(transactions_by_customer['age_range'].values)
transactions_by_customer['family_size'] = transactions_by_customer['family_size'].astype(str)
transactions_by_customer['family_size'] = le.fit_transform(transactions_by_customer['family_size'].values)
transactions_by_customer['income_bracket'] = transactions_by_customer['income_bracket'].astype(str)
transactions_by_customer['income_bracket'] = le.fit_transform(transactions_by_customer['income_bracket'].values)

### One-hot encode marital_status, day, month, year, weekday, brand_type, rented

In [4]:
transactions_by_customer['marital_status'] = pd.get_dummies(transactions_by_customer['marital_status'], prefix='marital_status')
weekday_ohe = pd.get_dummies(transactions_by_customer['weekday'])
month_ohe = pd.get_dummies(transactions_by_customer['month'])
day_ohe = pd.get_dummies(transactions_by_customer['day'])
year_ohe = pd.get_dummies(transactions_by_customer['year'])

transactions_by_customer = pd.concat([transactions_by_customer, day_ohe, month_ohe, year_ohe, weekday_ohe], axis=1) #ddding one-hot encoded columns
transactions_by_customer.drop(['date', 'weekday', 'month', 'year', 'day'], axis=1, inplace=True) #dropping original columns

transactions_by_customer['brand_type'] = pd.get_dummies(transactions_by_customer['brand_type'])
transactions_by_customer['rented'] = pd.get_dummies(transactions_by_customer['rented'])

### One-hot encode category

In [5]:
transactions_by_customer['category'].value_counts()

Grocery                   558555
Pharmaceutical             93734
Packaged Meat              37105
Natural Products           33469
Dairy, Juices & Snacks     13428
Meat                       13221
Bakery                     11437
Fuel                        8637
Prepared Food               8011
Seafood                     4104
Skin & Hair Care            3014
Miscellaneous               1985
Flowers & Plants             997
Alcohol                      694
Garden                       294
Salads                       217
Travel                       131
Restauarant                  129
Vegetables (cut)              65
Name: category, dtype: int64

In [6]:
transactions_by_customer['category'] = transactions_by_customer['category'].str.strip()

map_category = {'Vegetables (cut)':'Grocery', 'Skin & Hair Care': 'Pharmaceutical', 'Packaged Meat':'Meat', 'Seafood':'Meat', \
                'Bakery':'Fresh Food', 'Prepared Food':'Fresh Food', 'Restaurant':'Fresh Food', 'Alcohol':'Miscellaneous', \
                'Fuel':'Miscellaneous', 'Flowers & Plants':'Miscellaneous', 'Garden': 'Miscellaneous', 'Travel':'Miscellaneous', \
                'Grocery':'Grocery', 'Pharmaceutical':'Pharmaceutical', 'Natural Products': 'Natural Products', 'Meat':'Meat', \
                'Dairy, Juices & Snacks':'Dairy & Snacks', 'Miscellaneous':'Miscellaneous', 'Salads':'Grocery'}
transactions_by_customer['category'] = transactions_by_customer['category'].map(map_category)
transactions_by_customer['category'].value_counts()

Grocery             558837
Pharmaceutical       96748
Meat                 54430
Natural Products     33469
Fresh Food           19448
Dairy & Snacks       13428
Miscellaneous        12738
Name: category, dtype: int64

In [7]:
category_ohe = pd.get_dummies(transactions_by_customer['category'])
transactions_by_customer = pd.concat([transactions_by_customer, category_ohe], axis=1) #ddding one-hot encoded columns
transactions_by_customer.drop(['category'], axis=1, inplace=True) #dropping original column
transactions_by_customer.head()

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,age_range,marital_status,rented,family_size,...,Thursday,Tuesday,Wednesday,Dairy & Snacks,Fresh Food,Grocery,Meat,Miscellaneous,Natural Products,Pharmaceutical
0,1,4315,1,201.97,0.0,0.0,5,1,1,1,...,1,0,0,0,0,0,0,0,0,1
1,1,4577,2,120.4,0.0,0.0,5,1,1,1,...,0,0,1,0,0,1,0,0,0,0
2,1,4796,1,106.5,0.0,0.0,5,1,1,1,...,1,0,0,0,0,1,0,0,0,0
3,1,4953,1,142.12,0.0,0.0,5,1,1,1,...,0,1,0,0,0,1,0,0,0,0
4,1,4953,1,142.12,0.0,0.0,5,1,1,1,...,1,0,0,0,0,1,0,0,0,0


In [8]:
transactions_by_customer.head()

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,age_range,marital_status,rented,family_size,...,Thursday,Tuesday,Wednesday,Dairy & Snacks,Fresh Food,Grocery,Meat,Miscellaneous,Natural Products,Pharmaceutical
0,1,4315,1,201.97,0.0,0.0,5,1,1,1,...,1,0,0,0,0,0,0,0,0,1
1,1,4577,2,120.4,0.0,0.0,5,1,1,1,...,0,0,1,0,0,1,0,0,0,0
2,1,4796,1,106.5,0.0,0.0,5,1,1,1,...,1,0,0,0,0,1,0,0,0,0
3,1,4953,1,142.12,0.0,0.0,5,1,1,1,...,0,1,0,0,0,1,0,0,0,0
4,1,4953,1,142.12,0.0,0.0,5,1,1,1,...,1,0,0,0,0,1,0,0,0,0


In [9]:
transactions_by_customer.columns

Index([     'customer_id',          'item_id',         'quantity',
          'selling_price',   'other_discount',  'coupon_discount',
              'age_range',   'marital_status',           'rented',
            'family_size',   'no_of_children',   'income_bracket',
                  'brand',       'brand_type',             'week',
                        1,                  2,                  3,
                        4,                  5,                  6,
                        7,                  8,                  9,
                       10,                 11,                 12,
                       13,                 14,                 15,
                       16,                 17,                 18,
                       19,                 20,                 21,
                       22,                 23,                 24,
                       25,                 26,                 27,
                       28,                 29,                

In [10]:
transactions_by_customer.shape

(789227, 74)

In [11]:
transactions_by_customer.to_csv('transactions_by_customer_FE.csv')