In [1]:
import pandas as pd
import numpy as np
import collections
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import re
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from num2words import num2words
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
import pickle
import time

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def decontractions(phrase):
    """Expand common English contractions in ``phrase``.

    Each rule is a (regex, replacement) pair applied with ``re.sub`` in the
    order listed. The irregular forms ("won't", "can't") come first so the
    generic "n't" rule does not mangle them. Every rule exists twice: once
    for the straight apostrophe (') and once for the typographic one (’).

    Args:
        phrase: Text to expand.

    Returns:
        The text with every matching contraction replaced.
    """
    rules = (
        # Irregular negations, both apostrophe styles.
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"won\’t", "will not"),
        (r"can\’t", "can not"),
        # Generic suffixes written with a straight apostrophe.
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # Generic suffixes written with a typographic apostrophe.
        (r"n\’t", " not"),
        (r"\’re", " are"),
        (r"\’s", " is"),
        (r"\’d", " would"),
        (r"\’ll", " will"),
        (r"\’t", " not"),
        (r"\’ve", " have"),
        (r"\’m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def preprocess(text):
    """Normalise raw text to lower-case alphanumerics and spaces.

    Steps, in order: lower-case the input, expand contractions with
    ``decontractions``, replace every run of characters other than ASCII
    letters, digits or space with a single space, then drop any character
    listed in ``string.punctuation``.

    Args:
        text: Raw text.

    Returns:
        The normalised string.
    """
    expanded = decontractions(text.lower())
    alnum_only = re.sub('[^A-Za-z0-9 ]+', ' ', expanded)
    # Delete punctuation characters one by one (same effect as filtering
    # each character against string.punctuation).
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return alnum_only.translate(drop_punctuation)

def removeStopWords(text):
    """Remove English stop words from ``text``.

    The text is tokenised with ``word_tokenize``; tokens present in NLTK's
    English stop-word list are dropped and the remaining tokens are joined
    with single spaces. Matching is case-sensitive (tokens are compared
    as-is).

    Args:
        text: Text to filter.

    Returns:
        The surviving tokens joined by a single space.
    """
    stop_words = set(stopwords.words("english"))
    # Test membership against the set directly: converting it to a list for
    # every token (as before) made each lookup a linear scan.
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(kept_tokens)

def lemmatizer(text, spacydata):
    """Replace every token of ``text`` with its lemma.

    Args:
        text: Text to lemmatise.
        spacydata: Callable that takes the text and returns an iterable of
            tokens, each exposing a ``lemma_`` attribute (predict_data
            passes a loaded spaCy pipeline).

    Returns:
        The lemmas joined by single spaces.
    """
    return ' '.join(token.lemma_ for token in spacydata(text))

def remove_special_characters(text):
    """Strip every character that is not on the allowed list.

    Allowed characters: ASCII letters, digits, whitespace and the
    punctuation marks ``. , ! ? / : ; " '``. Everything else is deleted
    (not replaced by a space).

    Args:
        text: Text to clean.

    Returns:
        The text with disallowed characters removed.
    """
    # The upper-case range must be `A-Z`. The previous `A-z` range spans the
    # ASCII code points between 'Z' and 'a' as well, which silently
    # whitelisted `[`, `\`, `]`, `^`, `_` and the backtick.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

def num_to_word(text):
    """Spell out whitespace-delimited integer tokens with ``num2words``.

    Only tokens made entirely of decimal digits and bounded by whitespace
    (or the string edges) are converted; digits embedded in other tokens
    (e.g. ``a1``) are left untouched. All other characters, including the
    original whitespace, are preserved.

    Args:
        text: Text possibly containing integer tokens.

    Returns:
        The text with each integer token replaced by its spelled-out form.
    """
    def _spell(match):
        return num2words(int(match.group()))

    # Replace whole tokens only. The previous str.replace() approach worked
    # on substrings, so "1 10" became "one one0" and "a1" became "aone".
    # `\d` matches only characters int() can parse, unlike str.isdigit(),
    # which also accepts e.g. superscript digits and made int() raise.
    return re.sub(r'(?<!\S)\d+(?!\S)', _spell, text)

def remove_extra_whitespace_tabs(text):
    """Collapse whitespace runs to single spaces and trim both ends.

    The pattern replaces any leading whitespace (possibly empty) and every
    later whitespace run with one space; the final ``strip`` then removes
    the space left at either end.

    Args:
        text: Text to normalise.

    Returns:
        The text with single-space separators and no surrounding whitespace.
    """
    collapsed = re.sub(r'^\s*|\s\s*', ' ', text)
    return collapsed.strip()

def contains_bundle(data):
    """Flag whether ``data`` contains 'bundles' or 'bundle'.

    Uses the ``in`` operator, so for a string this is a substring test and
    for a container it is a membership test.

    Args:
        data: String (or container) to search.

    Returns:
        1 if either keyword is found, otherwise 0.
    """
    found = any(keyword in data for keyword in ('bundles', 'bundle'))
    return 1 if found else 0
    
def get_sentiment_score(data):
    """Bucket the sentiment of ``data`` into an integer class.

    The 'compound' value returned by
    ``SentimentIntensityAnalyzer.polarity_scores`` is mapped as follows:

        compound >= 0.5          -> 3
        -0.5 <= compound < 0.5   -> 2
        compound < -0.5          -> 1

    The sentinel string 'miss' (used for missing text) maps to 0.

    Args:
        data: Text to score, or the sentinel 'miss'.

    Returns:
        An int in {0, 1, 2, 3}.
    """
    # Handle the sentinel first so the analyzer is not constructed when
    # there is nothing to score.
    if data == 'miss':
        return 0

    compound = SentimentIntensityAnalyzer().polarity_scores(data)['compound']
    if compound >= 0.5:
        return 3
    if compound >= -0.5:
        return 2
    # Everything below -0.5 ends up here; returning explicitly guarantees an
    # int result instead of falling through to an implicit None.
    return 1

In [3]:
def get_preprocessed_data(data, do_preprocess=False, spacydata=None):
    """Normalise one raw field value, mapping missing values to 'miss'.

    A value counts as missing when it is null according to ``pd.isna``
    (None, NaN, pd.NA), empty after stripping, or equal to
    'no description yet' ignoring case.

    Args:
        data: Raw field value (a string or a null marker).
        do_preprocess: When True, run the full text pipeline (preprocess,
            remove_special_characters, num_to_word,
            remove_extra_whitespace_tabs, lemmatizer). When False, only
            lower-case the stripped value.
        spacydata: Callable handed to ``lemmatizer``; required when
            ``do_preprocess`` is True and the value is not missing.

    Returns:
        The normalised string, or 'miss' for missing values.

    Raises:
        TypeError: If ``do_preprocess`` is True, the value is not missing
            and ``spacydata`` is None.
    """
    if isinstance(data, str):
        data = data.strip()

    # The null check must come first: `pd.NA == ''` evaluates to pd.NA, and
    # taking its truth value raises TypeError before pd.isna is reached.
    if pd.isna(data) or data == '' or data.lower() == 'no description yet':
        return 'miss'

    if not do_preprocess:
        return data.lower()

    if spacydata is None:
        # Fail up front with a clear message; otherwise the pipeline runs
        # and then dies inside lemmatizer with "'NoneType' object is not
        # callable" (also a TypeError).
        raise TypeError("spacydata is required when do_preprocess=True")

    data = preprocess(data)
    data = remove_special_characters(data)
    data = num_to_word(data)
    data = remove_extra_whitespace_tabs(data)
    return lemmatizer(data, spacydata)

In [4]:
def encode_data(data, encoder):
    """Encode ``data`` with a fitted encoder.

    The input is converted to a NumPy array and reshaped into a single
    column (shape ``(-1, 1)``) before being passed to ``encoder.transform``.

    Args:
        data: Scalar or sequence of values to encode.
        encoder: Object exposing ``transform(array)``.

    Returns:
        Whatever ``encoder.transform`` returns for the column array.
    """
    column = np.array(data).reshape(-1, 1)
    return encoder.transform(column)

def tokanize_data(data, tokanizer):
    """Convert one text into an array of token sequences.

    The text is wrapped in a one-element list so the tokenizer sees a batch
    of size one.

    Args:
        data: Text to tokenise.
        tokanizer: Object exposing ``texts_to_sequences(list_of_texts)``.

    Returns:
        ``np.array`` built from the tokenizer's output for ``[data]``.
    """
    sequences = tokanizer.texts_to_sequences([data])
    return np.array(sequences)

In [5]:
def form_predict_data(data):
    """Assemble the dict of named inputs handed to the model.

    The two text fields are padded with ``pad_sequences`` (``name`` to
    length 30, ``item_description`` to length 70). Every other field is
    converted to a NumPy array reshaped to a single row (``(1, -1)``).

    Args:
        data: Mapping holding the already encoded/tokenised fields listed
            below.

    Returns:
        A dict with the same keys, each mapped to its model-ready array.
    """
    x = {
        'name': pad_sequences(data['name'], maxlen=30),
        'item_description': pad_sequences(data['item_description'], maxlen=70),
    }

    single_row_fields = (
        'item_condition_id',
        'brand_name',
        'sub_l1',
        'sub_l2',
        'sub_l3',
        'shipping',
        'contains_bundle',
        'item_description_score',
    )
    for field in single_row_fields:
        x[field] = np.array(data[field]).reshape(1, -1)
    return x

In [6]:
def predict(data, model):
    """Run the model and clamp negative predictions to zero.

    Args:
        data: Input passed unchanged to ``model.predict``.
        model: Object exposing ``predict(data)`` whose result supports
            ``.flatten().tolist()``.

    Returns:
        A flat list of predictions in which every value below 0 is replaced
        by 0.0; other values are returned as-is.
    """
    raw_predictions = model.predict(data).flatten().tolist()
    return [0.0 if value < 0 else value for value in raw_predictions]

In [11]:
def predict_data(data, model_dir='/Applied AI/Assignments/29.SelfCase Study - 1/Best Model'):
    """Predict the price of a single product record and print it.

    Pipeline: load the spaCy pipeline, the Keras model and the pickled
    encoders/tokenizer; normalise the text and categorical fields; derive
    the ``contains_bundle`` and ``item_description_score`` features; encode
    and tokenise; build the model input; run the model.

    All artifacts are re-loaded on every call.

    Args:
        data: Mapping with the keys 'name', 'item_description',
            'brand_name', 'sub_l1', 'sub_l2', 'sub_l3',
            'item_condition_id' and 'shipping'. It is copied with
            ``data.copy()`` before fields are reassigned, so the caller's
            mapping is left unchanged.
        model_dir: Directory containing the model and pickle artifacts.
            Defaults to the location previously hard-coded here.

    Returns:
        The predicted price (first element of ``predict``'s result). The
        same value, rounded to two decimals, is also printed.
    """
    sp = spacy.load('en_core_web_sm')

    model = tf.keras.models.load_model(model_dir + '/mercari_lbl_cnn_lstm_model.h5', compile=False)

    # NOTE(security): pickle.load can execute arbitrary code contained in
    # the file. Only point model_dir at artifacts you produced yourself.
    with open(model_dir + '/mercari_category_encoder.pkl', "rb") as fp:
        category_encoder = pickle.load(fp)

    with open(model_dir + '/mercari_brand_encoder.pkl', "rb") as fp:
        brand_encoder = pickle.load(fp)

    with open(model_dir + '/mercari_name_desc_tokenizer.pkl', "rb") as fp:
        name_desc_tokenizer = pickle.load(fp)

    # Work on a copy so the caller's record keeps its raw values.
    pre_data = data.copy()

    # Free-text fields go through the full text pipeline (with lemmatisation).
    for field in ('name', 'item_description'):
        pre_data[field] = get_preprocessed_data(pre_data[field], True, sp)
    # Categorical fields are only stripped, lower-cased and null-filled.
    for field in ('brand_name', 'sub_l1', 'sub_l2', 'sub_l3'):
        pre_data[field] = get_preprocessed_data(pre_data[field])

    # Derived features are computed from the cleaned text, before it is
    # replaced by token ids below.
    pre_data['contains_bundle'] = contains_bundle(pre_data['name'] + ' ' + pre_data['item_description'])
    pre_data['item_description_score'] = get_sentiment_score(pre_data['item_description'])

    # The three category levels share one encoder; brand has its own.
    for field in ('sub_l1', 'sub_l2', 'sub_l3'):
        pre_data[field] = encode_data(pre_data[field], category_encoder)
    pre_data['brand_name'] = encode_data(pre_data['brand_name'], brand_encoder)

    for field in ('name', 'item_description'):
        pre_data[field] = tokanize_data(pre_data[field], name_desc_tokenizer)

    model_inputs = form_predict_data(pre_data)

    predicted_result = predict(model_inputs, model)[0]

    print('Predicted price of the product is ','"', round(predicted_result, 2),'"')
    # Also hand the value back so callers can use it programmatically.
    return predicted_result

In [12]:
# Load the training set and turn its first row into a plain dict to use as a
# sample record.
df = pd.read_csv('../Dataset/train.tsv', sep='\t')
given_data = df.head(1).to_dict('index')[0]

# Attach the three category levels that predict_data reads as separate keys.
given_data.update(sub_l1='Men', sub_l2='shoes', sub_l3='athletic training')

In [13]:
# Display the sample record that is passed to predict_data in a later cell.
given_data

{'train_id': 0,
 'name': 'MLB Cincinnati Reds T Shirt Size XL',
 'item_condition_id': 3,
 'category_name': 'Men/Tops/T-shirts',
 'brand_name': nan,
 'price': 10.0,
 'shipping': 1,
 'item_description': 'No description yet',
 'sub_l1': 'Men',
 'sub_l2': 'shoes',
 'sub_l3': 'athletic training'}

In [14]:
# Time one end-to-end prediction, including loading the artifacts.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted.
start = time.perf_counter()

predict_data(given_data)

end = time.perf_counter()
print('Time taken to execute:', round(end - start, 2))

Predicted price of the product is  " 15.34 "
Time taken to execute: 3.6


In [16]:
# Change the sample record in place to describe a different product, then
# display it.
given_data.update({
    'name': 'puma wear',
    'item_condition_id': 1,
    'brand_name': 'puma',
    'shipping': 0,
})
given_data

{'train_id': 0,
 'name': 'puma wear',
 'item_condition_id': 1,
 'category_name': 'Men/Tops/T-shirts',
 'brand_name': 'puma',
 'price': 10.0,
 'shipping': 0,
 'item_description': 'No description yet',
 'sub_l1': 'Men',
 'sub_l2': 'shoes',
 'sub_l3': 'athletic training'}

In [17]:
# Time one end-to-end prediction, including loading the artifacts.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted.
start = time.perf_counter()

predict_data(given_data)

end = time.perf_counter()
print('Time taken to execute:', round(end - start, 2))

Predicted price of the product is  " 21.68 "
Time taken to execute: 1.71
