# Resale Price Prediction

In [None]:
# import the libraries
import re
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from keras import backend as K
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Input, Dropout
from keras.models import Model

from utils import *

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

# Two style sheets are applied in sequence; where they set the same option,
# the second call wins.
plt.style.use('fivethirtyeight')
plt.style.use('seaborn-notebook')

# Render inline figures at retina (high-DPI) resolution.
%config InlineBackend.figure_format = 'retina'
# Reload modified modules before each cell executes, so edits to local
# helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Load the cleaned data set.
# NOTE(review): unpickling can execute arbitrary code; only read this file
# if it comes from a trusted source.
df = pd.read_pickle('./data/df_cleaned.p')

In [None]:
# Mapping handed to prepare_data: source column name -> notebook column name.
colname_map = {
    'PRC': 'BRAND',
    'PARTNO': 'PARTNO',
    'UNIT RESALE': 'UNITRESALE',
    'ORIG ORDER QTY': 'ORDERQTY',
    'NEW UNIT COST': 'UNITCOST',
}
df = prepare_data(df, colname_map)

In [None]:
# Preview the frame returned by prepare_data.
df.head()

Unnamed: 0,BRAND,PARTNO,QUANTITY,UNITRESALE,UNITCOST
0,2,53001176-1 REV A,25-49,209.66,107.65
1,2,53001176-1 REV A,25-49,209.66,99.51
2,2,61-82477-8,25-49,76.75,60.0
3,2,AA1208K08,25-49,66.12,50.8
4,2,AA1208K08X,50-99,66.21,52.0


In [None]:
# Transform the price column and keep the fitted lambda; the lambda is later
# passed to scipy's inv_boxcox to map values back to the original scale.
# NOTE(review): scale_price is defined outside this notebook -- presumably a
# Box-Cox transform that replaces UNITRESALE with RESALE; confirm in utils.
df, fitted_lambda = scale_price(df)

In [None]:
# Preview the frame returned by scale_price.
df.head()

Unnamed: 0,BRAND,PARTNO,QUANTITY,UNITCOST,RESALE
0,2,61-82477-8,25-49,60.0,3.850272
1,2,AA1208K08,25-49,50.8,3.733058
2,2,AA1208K08X,50-99,52.0,3.734132
3,2,AA67006-4KA,50-99,13.9,2.559783
4,2,AA67006-4KA,25-49,13.5,2.686291


In [None]:
# Single-character presence features: binary indicators for each character
# that appears in at least 100 distinct part numbers.
CV1 = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 1),
    binary=True,
    min_df=100,
    max_df=1.0,
    stop_words=None,
)

# Fit on de-duplicated part numbers so document frequencies are counted per
# distinct part number; transform every row of the frame.
CV1.fit(df['PARTNO'].drop_duplicates().tolist())
X1 = CV1.transform(df['PARTNO'].tolist())
X1

<432960x45 sparse matrix of type '<class 'numpy.int64'>'
	with 3493797 stored elements in Compressed Sparse Row format>

In [None]:
# Uncomment to inspect the character vocabulary learned by CV1.
# CV1.vocabulary_

In [None]:
# Character n-gram presence features (2 to 6 characters): binary indicators
# for n-grams found in at least 100 distinct part numbers and in at most 80%
# of them.
CV2 = CountVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
    binary=True,
    min_df=100,
    max_df=0.8,
    stop_words=None,
)

# Fit on de-duplicated part numbers; transform every row of the frame.
CV2.fit(df['PARTNO'].drop_duplicates().tolist())
X2 = CV2.transform(df['PARTNO'].tolist())
X2

<432960x5430 sparse matrix of type '<class 'numpy.int64'>'
	with 9427277 stored elements in Compressed Sparse Row format>

In [None]:
def tokenizer(text):
  """Split a part number into lowercase alphabetic and numeric tokens.

  The text is lowercased; every character outside ``a-z`` / ``0-9`` acts as
  a separator, and a token boundary is also placed wherever a run of letters
  meets a run of digits, e.g. ``"AA1208K08X"`` becomes
  ``["aa", "1208", "k", "08", "x"]``.

  Args:
    text: Part-number string.

  Returns:
    List of tokens in order of appearance; empty when ``text`` contains no
    ASCII letters or digits.
  """
  # Maximal letter-only or digit-only runs: a single pass that both splits
  # at letter/digit boundaries and discards every other character.
  return re.findall(r'[a-z]+|[0-9]+', text.lower())

In [None]:
# Word-level TF-IDF over the tokens produced by `tokenizer`, using 1- to
# 5-token n-grams found in at least 100 distinct part numbers and in at most
# half of them.
CV3 = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenizer,
    ngram_range=(1, 5),
    binary=False,
    min_df=100,
    max_df=0.5,
    stop_words=None,
)

# Fit on de-duplicated part numbers; transform every row of the frame.
CV3.fit(df['PARTNO'].drop_duplicates().tolist())
X3 = CV3.transform(df['PARTNO'].tolist())
X3

<432960x1007 sparse matrix of type '<class 'numpy.float64'>'
	with 1715717 stored elements in Compressed Sparse Row format>

In [None]:
# One-hot encode the two categorical columns in a single fit-and-transform
# step.
ohecols = ['BRAND', 'QUANTITY']
enc = OneHotEncoder()
X4 = enc.fit_transform(df[ohecols])
X4

<432960x577 sparse matrix of type '<class 'numpy.float64'>'
	with 865920 stored elements in Compressed Sparse Row format>

In [None]:
# Concatenate the four sparse feature blocks column-wise. In the recorded
# run: 45 (X1) + 5430 (X2) + 1007 (X3) + 577 (X4) = 7059 columns.
X = hstack([X1, X2, X3, X4])
X

<432960x7059 sparse matrix of type '<class 'numpy.float64'>'
	with 15502711 stored elements in COOrdinate format>

In [None]:
# Regression target as a single-column 2-D array (one row per record).
Y = df['RESALE'].values.reshape(-1, 1)

In [None]:
# Hold out 10% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.1,
    random_state=40,
)
print("Training Records {}, Testing Records: {}".format(
    *(part.shape[0] for part in (X_train, X_test))))

Training Records 389664, Testing Records: 43296


In [None]:
# Training hyper-parameters.
batch_size = 2048
epochs = 100  # same number of epochs the training cell runs

# Feed-forward regressor on the sparse feature matrix:
# input -> Dense(512, relu) -> Dropout(0.5) -> Dense(10, relu) -> output.
inputs = Input(shape=(X_train.shape[1],), sparse=True)
hidden = Dense(512, activation='relu')(inputs)
hidden = Dropout(0.5)(hidden)
hidden = Dense(10, activation='relu')(hidden)
# One output unit per target column, with no activation.
outputs = Dense(y_train.shape[1])(hidden)
model = Model(inputs=inputs, outputs=outputs)
# Optimise mean squared error; mean absolute error is reported alongside.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 7059)]            0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               3614720   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                5130      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 3,619,861
Trainable params: 3,619,861
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train from mini-batch generators over the sparse matrices.
# - The epoch count is taken from the `epochs` hyper-parameter rather than a
#   separate hard-coded literal, so the declared setting is the one used.
# - `workers` / `use_multiprocessing` are intentionally not passed: -1 is not
#   a documented value for `workers`, and a plain Python generator is not
#   safe to share across worker processes.
# NOTE(review): the held-out test split is also used as validation data, so
# the test metrics reported later come from data seen during training
# monitoring.
history = model.fit(
    nn_batch_generator(X_train, y_train, batch_size),
    steps_per_epoch=len(y_train) // batch_size,
    validation_data=nn_batch_generator(X_test, y_test, batch_size),
    validation_steps=len(y_test) // batch_size,
    epochs=epochs,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Persist the trained model to disk.
# NOTE(review): presumably requires the ./models directory to exist already.
model.save('./models/model_201203.h5')

In [None]:
# Persist the per-epoch training metrics recorded by model.fit.
hist_df = pd.DataFrame(history.history)
hist_csv_file = './outputs/history.csv'
# Hand pandas the path so it opens the file with the newline handling it
# expects; a text-mode handle opened without newline='' can yield blank
# lines between rows on Windows.
hist_df.to_csv(hist_csv_file)

In [None]:
from scipy.special import inv_boxcox
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

# Predict in the transformed space, then invert the Box-Cox transform on both
# the true and the predicted values so the metrics are on the original scale.
y_pred = model.predict(X_test).flatten()
a = inv_boxcox(y_test.flatten(), fitted_lambda)  # true values
b = inv_boxcox(y_pred.flatten(), fitted_lambda)  # predicted values
for label, metric in (('r2_score: ', r2_score),
                      ('median_absolute_error: ', median_absolute_error),
                      ('mean_absolute_error', mean_absolute_error)):
    print(label, metric(a, b))
out2 = pd.DataFrame({'y_true': a, 'y_pred': b})

r2_score:  0.7725025811056968
median_absolute_error:  0.49407594919204767
mean_absolute_error 3.357431710265042


In [None]:
# Compare the first few true and predicted values after inverting the
# Box-Cox transform.
out2.head()

Unnamed: 0,y_true,y_pred
0,4.65,10.811751
1,7.6,7.666917
2,1.1,0.746361
3,0.72,0.291657
4,41.4,31.236202


In [None]:
# Recover the test-set rows of `df` by repeating the split with the same
# test_size and random_state used for X / Y, so they line up row-for-row
# with `out2`.
# `.assign` builds a new frame, so no columns are written into an object
# derived from `df`, and IPython's `_` (last output) is left untouched.
out1 = train_test_split(df, test_size=0.1, random_state=40)[1].assign(
    RESALE=out2.y_true.values,  # overwrite transformed target with original-scale value
    PRED=out2.y_pred.values,
)
out1.to_csv('./outputs/result.csv', index=False)

In [None]:
# Spot-check 10 random test rows. No random_state is passed, so the rows
# shown change on every run.
out1.sample(10)

Unnamed: 0,BRAND,PARTNO,QUANTITY,UNITCOST,RESALE,PRED
174877,155,TLM-6X1C-12,2500-4999,0.09,0.15,0.170021
39616,59,MS3498-9,500-999,1.12,1.5,1.150005
18036,30,8106-A-0440-17,500-999,0.17,0.4,0.61341
406828,662,250-0201-01,50-99,3.01,5.0,4.733656
78736,78,SC-16-SB,5-9,59.05,84.35,91.922928
116973,116,NAS6206-18,1-4,2.18,25.0,24.017282
59365,63,MS21087-4,25-49,3.5,5.22,7.015954
256983,350,0326010.HXP,10-24,0.4,4.5,4.300601
211941,212,MS91528-1F2B,25-49,2.55,4.4,4.558315
334493,525,M24243/1A404,5000-9999,0.12,0.26,0.203509
