### Load Data

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
path = r"D:\OnedriveWork\OneDrive - Microsoft\Documents\data\bf_data"

In [3]:
raw = pd.read_csv(path + '/train.csv')

In [4]:
raw.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [5]:
raw.shape

(550068, 12)

In [6]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [7]:
raw.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

### Data Engineering

In [8]:
df = raw

In [9]:
#Impute missing values
df.Product_Category_2.fillna(-1, inplace=True)
df.Product_Category_3.fillna(-1, inplace=True)

In [10]:
df.User_ID = df.User_ID.astype(str)
df.Occupation = df.Occupation.astype(str)
df.Marital_Status = df.Marital_Status.astype(str)
df.Product_Category_1 = df.Product_Category_1.astype(str)
df.Product_Category_2 = df.Product_Category_2.astype(str)
df.Product_Category_3 = df.Product_Category_3.astype(str)

In [11]:
target_col = 'Purchase'
cat_cols = [col for col in df.columns if col != target_col]

In [12]:
unique_col_count = df[cat_cols].nunique()
unique_col_count

User_ID                       5891
Product_ID                    3631
Gender                           2
Age                              7
Occupation                      21
City_Category                    3
Stay_In_Current_City_Years       5
Marital_Status                   2
Product_Category_1              20
Product_Category_2              18
Product_Category_3              16
dtype: int64

In [13]:
def cat2idx(dataset, cat_cols):
    data = dataset.copy()
    cat2idx_dict = dict()
    
    for col in cat_cols:
        unique_cat = data[col].unique()
        cat2idx_map = {o:i for i, o in enumerate(unique_cat)}
        data[col] = data[col].apply(lambda x: cat2idx_map[x])
        cat2idx_dict[col] = cat2idx_map
        
    return data, cat2idx_dict

In [14]:
df_indexed, index_map = cat2idx(df, cat_cols)

In [15]:
df_indexed.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,0,0,0,0,0,0,0,0,0,0,8370
1,0,1,0,0,0,0,0,0,1,1,1,15200
2,0,2,0,0,0,0,0,0,2,0,0,1422
3,0,3,0,0,0,0,0,0,2,2,0,1057
4,1,4,1,1,1,1,1,0,3,0,0,7969


In [18]:
from feather import write_dataframe

In [19]:
write_dataframe(df_indexed, 'training_data.feather')

In [20]:
df_indexed.to_csv('training_data.csv')

In [21]:
with open('training_data.pkl', 'wb') as f:
    pickle.dump(df_indexed, f)

In [22]:
with open('index_map.pkl', 'wb') as f:
    pickle.dump(index_map, f)

In [97]:
with open('outcome_var.pkl', 'wb') as f:
    pickle.dump(target_col, f)

### Train Test Split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df_indexed[cat_cols].values, df_indexed[target_col].values, test_size=0.2, random_state=42)

In [25]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((440054, 11), (440054,), (110014, 11), (110014,))

### Model Creation

In [54]:
from tensorflow.keras.layers import Input, Embedding, concatenate, Flatten, Dense, Dropout, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop

In [55]:
input_models=[]
output_embeddings=[]

In [56]:
for cat_col in cat_cols:
    cat_emb_name = cat_col + "_Embedding"
    cat_input_name = cat_col + "_Input"
    emb_size = int(min(np.ceil((unique_col_count[cat_col])/2), 50))
    
    input_model = Input(shape=(1,), name=cat_input_name)
    output_model = Embedding(unique_col_count[cat_col], emb_size, input_length=1, name=cat_emb_name)(input_model)
    output_model = Reshape(target_shape=(emb_size,))(output_model)
    
    input_models.append(input_model)
    output_embeddings.append(output_model)

In [57]:
x = concatenate(output_embeddings)
x = Dense(512, activation='relu')(x)
x = Dropout(rate=0.7)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(rate=0.4)(x)
x = Dense(1)(x)

In [58]:
model = Model(inputs=input_models, outputs=x)

In [59]:
opt = Adam(lr=0.01)

In [60]:
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mse'])

In [61]:
X_train.shape

(440054, 11)

### Model Training

In [62]:
X_train_list = []
X_test_list = []

for i, _ in enumerate(cat_cols):
    X_train_list.append(X_train[:, i])
    X_test_list.append(X_test[:, i])

In [63]:
model.fit(X_train_list, y_train, validation_data=(X_test_list, y_test), epochs=2 , batch_size=256)

Train on 440054 samples, validate on 110014 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1919c43e6a0>

In [64]:
from sklearn.metrics import mean_squared_error
pred = model.predict(X_test_list)
score = np.sqrt(mean_squared_error(y_test,pred))
print (score)

2536.6626988708767


In [65]:
pred = np.squeeze(model.predict([i[0:15] for i in X_test_list]))
pd.DataFrame({'pred': pred, 'actual': y_test[0:15]})

Unnamed: 0,pred,actual
0,16099.805664,19142
1,16589.945312,15513
2,8449.321289,7802
3,14199.564453,15455
4,11426.116211,4492
5,7970.996094,6965
6,8507.952148,8763
7,16898.96875,19347
8,8002.360352,7017
9,13751.893555,15594


In [66]:
model.save('bf_emb.hd5')

### Baseline models

In [321]:
import random
from sklearn.ensemble import RandomForestRegressor
random.seed(42)
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [322]:
from sklearn.metrics import mean_squared_error
pred=rf.predict(X_test)
score = np.sqrt(mean_squared_error(y_test,pred))
print (score)

2861.9028388980832


In [345]:
pd.DataFrame({'pred': rf.predict(X_test[0:15, :]), 'actual': y_test[0:15]})

Unnamed: 0,pred,actual
0,13194.5,19142
1,15181.8,15513
2,8697.0,7802
3,7385.6,15455
4,14361.9,4492
5,6979.8,6965
6,7570.2,8763
7,17905.1,19347
8,5948.8,7017
9,14204.8,15594


In [309]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization

In [310]:
m = Sequential()
m.add(Dense(512, input_dim=11, activation= "relu"))
m.add(Dropout(rate=0.7))
m.add(Dense(256, activation= "relu"))
m.add(Dropout(rate=0.4))
m.add(Dense(1))
m.summary() #Print model Summary

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 512)               6144      
_________________________________________________________________
dropout_24 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_43 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_25 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 257       
Total params: 137,729
Trainable params: 137,729
Non-trainable params: 0
_________________________________________________________________


In [311]:
m.compile(loss= "mean_squared_error" , optimizer="rmsprop", metrics=["mse"])
m.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10 , batch_size=512)

Train on 440054 samples, validate on 110014 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc7f49be048>

In [312]:
from sklearn.metrics import mean_squared_error
pred = m.predict(X_test)
score = np.sqrt(mean_squared_error(y_test,pred))
print (score)

4926.683431094741


In [252]:
X_test[0, :], y_test[0]

(array([943, 476,   1,   4,  14,   1,   2,   0,   1,   3,   5]), 19142)

In [255]:
X_test.shape

(110014, 11)

In [265]:
m.predict(X_test[0:15, :].reshape(15, -1)), y_test[0:15]

(array([[10606.161 ],
        [13206.655 ],
        [ 6366.4194],
        [11040.15  ],
        [12848.5205],
        [ 7699.681 ],
        [10147.699 ],
        [ 7533.976 ],
        [ 9200.857 ],
        [ 8683.802 ],
        [11316.114 ],
        [12062.061 ],
        [12124.043 ],
        [ 5723.922 ],
        [ 3677.109 ]], dtype=float32),
 array([19142, 15513,  7802, 15455,  4492,  6965,  8763, 19347,  7017,
        15594,  4649, 19549,  1762,  6186,  3663]))

### Prediction

#### Create test value

In [119]:
raw.iloc[1, :]

User_ID                         1000001
Product_ID                    P00248942
Gender                                F
Age                                0-17
Occupation                           10
City_Category                         A
Stay_In_Current_City_Years            2
Marital_Status                        0
Product_Category_1                    1
Product_Category_2                  6.0
Product_Category_3                 14.0
Purchase                          15200
Name: 1, dtype: object

In [121]:
user_id, product_id, gender, age, occupation, city_category, stay_in_current_city_years, marital_status, pc_1, pc_2, pc_3, purchase = raw.iloc[1, :]

In [122]:
test_val = dict(User_ID=user_id, 
     Product_ID=product_id, 
     Gender=gender, 
     Age=age, 
     Occupation=occupation, 
     City_Category=city_category, 
     Stay_In_Current_City_Years=stay_in_current_city_years, 
     Marital_Status=marital_status, 
     Product_Category_1=pc_1, 
     Product_Category_2=pc_2, 
     Product_Category_3=pc_3, 
     Purchase=purchase)

In [118]:
with open('test_val.pkl', 'wb') as f:
    pickle.dump(test_val, f)

#### Making predictions

In [107]:
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

In [132]:
model = load_model('bf_emb.hd5')

In [123]:
with open('index_map.pkl', 'rb') as f:
    index_map = pickle.load(f)

In [124]:
len(index_map)

11

In [125]:
with open('outcome_var.pkl', 'rb') as f:
    target_col = pickle.load(f)

In [126]:
target_col

'Purchase'

In [127]:
with open('test_val.pkl', 'rb') as f:
    test_val = pickle.load(f)

In [128]:
test_val

{'User_ID': '1000001',
 'Product_ID': 'P00248942',
 'Gender': 'F',
 'Age': '0-17',
 'Occupation': '10',
 'City_Category': 'A',
 'Stay_In_Current_City_Years': '2',
 'Marital_Status': '0',
 'Product_Category_1': '1',
 'Product_Category_2': '6.0',
 'Product_Category_3': '14.0',
 'Purchase': 15200}

In [130]:
test_data = [[index_map[key][value]] for key, value in test_val.items() if key!=target_col]

In [131]:
test_data

[[0], [1], [0], [0], [0], [0], [0], [0], [1], [1], [1]]

In [133]:
pred = int(np.squeeze(model.predict(test_data)))

In [134]:
{'predicted': pred, 'actual': test_val['Purchase']}

{'predicted': 17110, 'actual': 15200}