# Price regression: LSTM/Conv1D text branches on pre-trained word vectors + one-hot side features

In [1]:
import numpy as np
import pandas as pd
import re

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor

from nltk.tokenize import word_tokenize
import gensim.models

from scipy.sparse import hstack
from sklearn.metrics import mean_squared_log_error

from prettytable import PrettyTable

import joblib
import gc

import warnings
warnings.filterwarnings('ignore')
import pickle

In [2]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the training table from the mounted Drive folder.
# Later cells read these columns from it: 'price', 'name', 'brand_name', 'item_description',
# 'category_name', 'shipping' and 'item_condition_id'.
data_new = pd.read_csv("/content/drive/My Drive/new/data_new.csv")
#df_test = pd.read_csv('/content/drive/My Drive/new/test_stg2.tsv', sep='\t')
#test_ids = df_test['test_id'].values

In [4]:
# Regression target: log1p of the price column (predictions are mapped back with np.expm1 on export).
Y=  np.log1p(data_new['price'])

In [5]:
# Hold out 10% of the rows for validation.
# The seed is fixed so that a restarted kernel reproduces exactly the same split: the tokenizers
# and one-hot encoders further down are fitted on X_train, and a model reloaded from disk only
# lines up with them when X_train contains the same rows in the same order.
X_train, X_cv, y_train, y_cv = train_test_split(data_new, Y, test_size=0.10, random_state=42)
#X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33)


In [6]:
# Drop the references to the full frame and target; the remaining cells only use the split pieces.
del data_new
del Y

In [7]:
#Convert No description yet to ''
def handle_text(X_train):
  '''
  Task:   Builds the two text fields that are fed to the tokenizers.
          'name' becomes "<name> <brand_name>" and 'text' becomes
          "<item_description> <name> <category_name>" (using the combined name);
          both are then passed through decontracted().
  Input:  DataFrame with the columns 'name', 'brand_name', 'item_description'
          and 'category_name'.
          NOTE: the frame is modified in place - 'name' and 'item_description' are
          overwritten and a 'text' column is added. Later cells read X_train['text']
          and X_cv['text'], so this side effect is relied upon.
  Output: Tuple (name, text) of pandas Series taken from the modified frame
  '''
  text_columns = ['name', 'brand_name', 'item_description', 'category_name']

  # Blank out missing values in the text columns only; filling the whole frame would also
  # put '' into numeric columns and silently turn them into object columns.
  X_train[text_columns] = X_train[text_columns].fillna('')

  # Treat the "No description yet" placeholder as an empty description, whatever its casing.
  X_train['item_description'] = X_train['item_description'].str.replace(
      '^no description yet$', '', case=False, regex=True)

  # The brand is appended to the name first, so 'text' below contains the brand as well.
  X_train['name'] = X_train['name'] + " " + X_train['brand_name']
  X_train['text'] = X_train['item_description'] + " " + X_train['name'] + " " + X_train['category_name']

  for column in ('name', 'text'):
    X_train[column] = X_train[column].apply(decontracted)

  return X_train['name'], X_train['text']

In [8]:
# Ref: AAIC Notebook for Donors' Choose
def decontracted(sent):
    '''
    Task:   This Function changes common short forms like can't, won't to can not, will not resp. (Decontraction)
            This is done to ensure uniformity in the whole text
            Matching is case-sensitive and only covers the contractions listed below.
    Input:  Raw Text (string)
    Output: Decontracted Text (string)
    '''
    expansions = (
        ("aren't", "are not"),
        ("didn't", "did not"),
        ("can't", "can not"),
        ("couldn't", "could not"),
        ("won't", "will not"),       # was expanded to "would not", contradicting the docstring
        ("wouldn't", "would not"),
        ("haven't", "have not"),
        ("shouldn't", "should not"),
        ("doesn't", "does not"),
        ("don't", "do not"),
        ("mustn't", "must not"),
        ("needn't", "need not"),
    )

    # The patterns are plain literals and no expansion contains an apostrophe, so the
    # replacements cannot interact and their order does not matter.
    for short_form, long_form in expansions:
        sent = sent.replace(short_form, long_form)

    return sent

In [9]:
# Build the name/text series for the training rows (this also adds/overwrites those columns on X_train).
name_train,text_train=handle_text(X_train)

In [10]:
# Same preparation for the validation rows (this also adds/overwrites those columns on X_cv).
name_cv,text_cv=handle_text(X_cv)

In [11]:
# Load the pre-trained word vectors: a pickled mapping that is indexed by word below.
# NOTE: despite its name, `model_fasttext` holds the content of the `glove_vectors` pickle;
# the commented-out line is the alternative loader for the fastText .vec file.
#model_fasttext = KeyedVectors.load_word2vec_format('/content/drive/My Drive/new/wiki-news-300d-1M.vec', binary=False)
# storing variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# make sure you have the glove_vectors file
# SECURITY: pickle.load can execute arbitrary code - only load a file you created or trust.
# The path is absolute, like every other path in this notebook, so the cell does not depend
# on the kernel's current working directory.
with open('/content/drive/My Drive/new/glove_vectors', 'rb') as f:
    model_fasttext = pickle.load(f)

# Set of known words, used for membership tests while building the embedding matrices.
glove_words = set(model_fasttext.keys())

In [12]:

def one_hot_encoder(train, valid):
    '''
    This function returns the One Hot Encoded 'shipping' and 'item_condition_id' features
    for the given train and CV data
    Input ->
        train : DataFrame with the columns 'shipping' and 'item_condition_id'; the encoders are fitted on it
        valid : DataFrame with the same two columns; it is only transformed with the fitted encoders
    Output -> Tuple (X_train, X_valid) of CSR matrices, shipping columns first, then item-condition columns
    Task   -> This function converts the raw values (integers/strings) into one hot encoded vectors using
              sklearn's OneHotEncoder(). The fitted encoders are published as the globals
              vectorizer3 (shipping) and vectorizer4 (item condition) because
              one_hot_encoder_test() reuses them for unseen data.
    '''
    global  vectorizer3, vectorizer4

    # handle_unknown='ignore': a category that was not seen while fitting is encoded as an
    # all-zero row instead of making transform() raise on validation/test data.
    vectorizer3 = OneHotEncoder(dtype=np.float32, handle_unknown='ignore')
    vectorizer4 = OneHotEncoder(dtype=np.float32, handle_unknown='ignore')

    # OneHotEncoder expects 2-D input, hence the reshape to a single column.
    train_shipvec = vectorizer3.fit_transform(train['shipping'].values.reshape(-1, 1))
    train_conditionvec = vectorizer4.fit_transform(train['item_condition_id'].values.reshape(-1, 1))
    X_train = hstack((train_shipvec, train_conditionvec)).tocsr()

    valid_shipvec = vectorizer3.transform(valid['shipping'].values.reshape(-1, 1))
    valid_conditionvec = vectorizer4.transform(valid['item_condition_id'].values.reshape(-1, 1))
    X_valid = hstack((valid_shipvec, valid_conditionvec)).tocsr()

    return X_train, X_valid

In [13]:

#df_test['name'] = df_test['name'].apply(lambda x : decontracted(x))
#df_test['text'] = df_test['text'].apply(lambda x : decontracted(x))

In [None]:
#Defining some special regexes which would be used in the function text_preprocessing() to clean the text
# Raw strings throughout: '\d' and '\.' inside a normal string literal are invalid escape
# sequences (a warning today, an error in future Python versions).
# Any run of characters other than letters, digits and the period
regex_special_chars = re.compile(r'[^A-Za-z0-9.]+')
# A period that has no digit directly before it AND no digit directly after it
regex_decimal_digits = re.compile(r'(?<!\d)\.(?!\d)')
# Any run of white space
regex_white_space = re.compile(r'\s+')

In [None]:
def text_preprocessing(sent):
    '''
    Input  -> Raw text (string)
    Output -> Cleaned Text (string)
    Task   -> Cleans the text so that it is suitable for Bag of Words/TF-IDF vectorization:
              escaped line breaks and special characters become spaces, stray periods are
              dropped, white space is collapsed, the text is lower-cased, stop words are
              removed and the remaining words are lemmatized.

    NOTE(review): WordNetLemmatizer and stop_words are not imported/defined in any cell
    shown in this notebook; both must exist in the kernel before this function is called.
    '''
    # The literal two-character sequences backslash-r and backslash-n become spaces
    for escaped in ('\\r', '\\n'):
        sent = sent.replace(escaped, ' ')

    # Special characters (everything except letters, digits and the period), then periods
    # that have no digit on either side, then runs of white space
    # Ref: https://stackoverflow.com/questions/6599646/remove-decimal-point-when-not-between-two-digits
    for pattern in (regex_special_chars, regex_decimal_digits, regex_white_space):
        sent = pattern.sub(' ', sent)

    # Trim both ends and lower-case before splitting into words
    words = sent.strip().lower().split()

    # Lemmatization maps the inflected forms of a word to one common base form
    lem = WordNetLemmatizer()
    return " ".join(lem.lemmatize(word) for word in words if word not in stop_words)

In [14]:
from keras.utils import to_categorical

import tensorflow as tf

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.optimizers import Adam
from keras.models import Sequential
from tqdm import tqdm
import warnings
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from keras.initializers import he_normal
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Flatten
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM,Bidirectional
from keras.layers.core import Dense, Dropout
from keras.models import Model, load_model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ReduceLROnPlateau

In [None]:
#https://subscription.packtpub.com/book/application_development/9781782167853/1/ch01lvl1sec10/tokenizing-sentences-into-words
# Word-level tokenizer for the `name` input, fitted on the training names only.
# (A `global` statement at cell level has no effect, so none is used here.)
t_1 = Tokenizer()
t_1.fit_on_texts(name_train)

# +1 on top of the number of distinct words; this sizes the embedding matrix and Embedding layer
vocab_size = len(t_1.word_index) + 1
print('Total unique words in the x_train',vocab_size)

# Turn train and validation names into sequences of word indices
encoded_train, encoded_cv = (
    t_1.texts_to_sequences(texts) for texts in (name_train, name_cv)
)
#encoded_test = t.texts_to_sequences(df_test['name'])

Total unique words in the x_train 111887


In [None]:
# Bring every encoded name to a fixed length of max_length (zeros are appended: padding='post');
# this has to match the shape of the `name` Input layer.
max_length = 300
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
padded_cv = pad_sequences(encoded_cv, maxlen=max_length, padding='post')
#padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
print("length of padded_train data",len(padded_train))
# This line reports the validation set; it used to be labelled "padded_test"
print("length of padded_cv data",len(padded_cv))

length of padded_train data 1333494
length of padded_test data 148167


In [None]:
# for train
# Row i holds the pre-trained vector of the word with tokenizer index i;
# words that are not in glove_words keep their all-zero row.
embedding_matrix_train = np.zeros((vocab_size, 300))
for word, i in t_1.word_index.items():
    if word not in glove_words:
        continue
    embedding_matrix_train[i] = model_fasttext[word]

In [None]:
# Text branch for the `name` input:
#   frozen pre-trained embedding -> LSTM(20) -> Conv1D(32) -> LSTM(10) -> Conv1D(16) -> LSTM(10) -> Flatten
# NOTE(review): MaxPooling2D is imported here but not used in any visible cell.
#embedding layer name
from keras.layers import Conv1D, MaxPooling2D
# 300 = padded sequence length (max_length used for padded_train / padded_cv)
name = Input(shape=(300,), name='name')

# Embedding(vocab_size, 300, ...): the second argument is the vector width (columns of
# embedding_matrix_train); trainable=False keeps the pre-trained weights fixed.
# vocab_size must still be the value computed from t_1 when this cell runs.
x = Embedding(vocab_size, 300, weights=[embedding_matrix_train],trainable = False, input_length=300)(name)
# Every LSTM returns the full sequence so that the following Conv1D / LSTM receives 3-D input
lstm_out = LSTM(20,return_sequences=True)(x)
con1 = Conv1D(32, kernel_size = 3, activation = 'relu', name = 'block_1')(lstm_out)
lstm_out2 = LSTM(10,return_sequences=True)(con1)
con2 = Conv1D(16, kernel_size = 3, activation = 'relu', name = 'block_2')(lstm_out2)
lstm_out3 = LSTM(10,return_sequences=True)(con2)
# Flatten the remaining sequence so it can be concatenated with the other branches
flatten_x = Flatten()(lstm_out3)

In [None]:
#https://subscription.packtpub.com/book/application_development/9781782167853/1/ch01lvl1sec10/tokenizing-sentences-into-words
# Word-level tokenizer for the combined `text` field, fitted on the training rows only.
# (A `global` statement at cell level has no effect, so none is used here.)
t_2 = Tokenizer()
t_2.fit_on_texts(X_train['text'])

# NOTE: this rebinds vocab_size, which held the size of the `name` vocabulary until now
vocab_size = len(t_2.word_index) + 1
print('Total unique words in the x_train',vocab_size)

# Turn train and validation texts into sequences of word indices
encoded_train_desc, encoded_cv_desc = (
    t_2.texts_to_sequences(frame['text']) for frame in (X_train, X_cv)
)
#encoded_test_desc = t.texts_to_sequences(df_test['text'])

Total unique words in the x_train 244185


In [None]:
# for train embedding
# Row i holds the pre-trained vector of the word with index i in the `text` tokenizer;
# words that are not in glove_words keep their all-zero row.
embedding_matrix_train_desc = np.zeros((vocab_size, 300))
for word, i in t_2.word_index.items():
    if word not in glove_words:
        continue
    embedding_matrix_train_desc[i] = model_fasttext[word]

In [None]:
#train padding
# Bring every encoded text to a fixed length of max_length (zeros are appended: padding='post');
# this has to match the shape of the `desc` Input layer.
max_length = 300
padded_train_desc = pad_sequences(encoded_train_desc, maxlen=max_length, padding='post')
padded_cv_desc = pad_sequences(encoded_cv_desc, maxlen=max_length, padding='post')
#padded_test_desc = pad_sequences(encoded_test_desc, maxlen=max_length, padding='post')
print("length of padded_train data",len(padded_train_desc))
#print("length of padded_test data",len(padded_test_desc))

length of padded_train data 1333494


In [None]:
# Text branch for the combined `text` field:
#   frozen pre-trained embedding -> LSTM(20) -> LSTM(20) -> Conv1D(32) -> LSTM(20) -> Conv1D(16) -> LSTM(20) -> Flatten
#embedding layer description
# 300 = padded sequence length (max_length used for padded_train_desc / padded_cv_desc)
desc = Input(shape=(300,), name='desc')

# vocab_size must be the value computed from t_2 when this cell runs; trainable=False keeps
# the pre-trained weights fixed.
y = Embedding(vocab_size, 300, weights=[embedding_matrix_train_desc],trainable = False, input_length=300)(desc)
# Every LSTM returns the full sequence so that the following Conv1D / LSTM receives 3-D input
lstm_out6 = LSTM(20,return_sequences=True)(y)
lstm_out7=LSTM(20,return_sequences=True)(lstm_out6)
con6 = Conv1D(32, kernel_size = 3, activation = 'relu', name = 'block_3')(lstm_out7)
lstm_out8 = LSTM(20,return_sequences=True)(con6)
con7 = Conv1D(16, kernel_size = 3, activation = 'relu', name = 'block_4')(lstm_out8)
lstm_out9 = LSTM(20,return_sequences=True)(con7)
# Flatten the remaining sequence so it can be concatenated with the other branches
flatten_y = Flatten()(lstm_out9)


In [None]:

# NOTE(review): none of X_tr_shipping / X_cv_shipping / X_tr_item_condition / X_cv_item_condition
# is defined in any visible cell, and the recorded output of this cell is a NameError.
# The one-hot features actually used by the model come from one_hot_encoder() (X_tr / X_cv),
# so this cell looks like a leftover - confirm and remove.
X_tr_shipping=X_tr_shipping.todense()
X_cv_shipping=X_cv_shipping.todense()

X_tr_item_condition=X_tr_item_condition.todense()
X_cv_item_condition=X_cv_item_condition.todense()

#X_test_shipping=X_test_shipping.todense()
#X_test_item_condition=X_test_item_condition.todense()

NameError: ignored

In [15]:
# One-hot encode shipping and item condition for train and validation.
# NOTE(review): this rebinds X_cv from the validation DataFrame to a sparse matrix, so every cell
# that reads X_cv['text'] has to run before this one, and this cell cannot be re-run on its own.
X_tr,X_cv=one_hot_encoder(X_train, X_cv)

In [None]:
type(X_tr)

scipy.sparse.csr.csr_matrix

In [None]:
# Dense branch for the one-hot features; the input width is the number of one-hot columns in X_tr.
other = Input(shape=(X_tr.shape[1],), name="other")

# Three fully connected ReLU layers (256 -> 64 -> 64); dropout is intentionally left disabled.
out = Dense(256, activation='relu')(other)
#out =Dropout(0.1)(out)     ## performance is better without dropouts
out = Dense(64, activation='relu')(out)
#out = Dropout(0.1)(out)
out = Dense(64, activation='relu')(out)


In [16]:
#Concatenate the name embedding, the description embedding and the other features
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate
from keras.layers import Flatten
from keras.regularizers import l2
from keras.layers import concatenate
import keras as k



In [None]:
# Join the flattened name branch, the flattened description branch and the dense "other" branch.
con_lay = concatenate([flatten_x,flatten_y,out])

In [None]:
# Densify the one-hot features for Keras. toarray() returns a plain ndarray, whereas
# todense() returns the legacy np.matrix type that NumPy discourages.
X_tr = X_tr.toarray()
X_cv = X_cv.toarray()

In [None]:
# Regression head on top of the concatenated features (name branch, description branch, other)
# Layer 1
m_3 = Dense(256, activation = 'relu', kernel_regularizer = l2(0.01))(con_lay)
#m_3 = Dropout(0.3)(m_3)

# Layer 2
m_3 = Dense(32, activation = 'relu', kernel_regularizer = l2(0.01))(m_3)
#m_3 = Dropout(0.3)(m_3)

# Layer 3 (note: `x` is rebound here; it previously held the name-embedding tensor)
x = Dense(64, activation='relu',kernel_initializer="he_normal",kernel_regularizer=l2(0.001))(m_3)
# Single linear output unit: the model is fitted on y_train, i.e. on log1p(price)
final_output = Dense(1,kernel_initializer="he_normal")(x)

# The order of `inputs` fixes the order of the arrays passed to fit() / predict()
model5 = Model(inputs=[name,desc,other], outputs=[final_output])
# summary() prints the table itself and returns None, so wrapping it in print() only
# added a stray "None" line to the output
model5.summary()



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
desc (InputLayer)               [(None, 300)]        0                                            
__________________________________________________________________________________________________
name (InputLayer)               [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     73264200    desc[0][0]                       
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 300)     33599100    name[0][0]                       
______________________________________________________________________________________________

In [None]:
# Model inputs in the order declared in Model(inputs=[name, desc, other]).
train_2 = [padded_train,padded_train_desc,X_tr]
cv_2=[padded_cv,padded_cv_desc,X_cv]


In [17]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping

In [None]:
# NOTE(review): datetime is imported here but not used in any visible cell.
import datetime
# NOTE(review): the checkpoint target is the Drive folder itself (the same folder that holds the
# data files); the training log shows the model assets being written to ".../new/assets".
path="/content/drive/My Drive/new/"
# Keep only a single checkpoint: the one with the lowest validation MSE seen so far
# (monitor='val_mse', mode='min', save_best_only=True).
checkpoint = ModelCheckpoint(path,
                            monitor='val_mse',
                            verbose=1,
                            save_best_only=True,
                            mode='min')

# TensorBoard logs are written next to the data, under logs/scalars/
logdir = "/content/drive/My Drive/new/logs/scalars/" 
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

In [None]:
# Minimise the mean squared error on log1p(price); 'mse' is also tracked as a metric because the
# checkpoint callback monitors 'val_mse'.
# NOTE(review): EarlyStopping and ReduceLROnPlateau are imported above but not passed in
# `callbacks`; in the recorded log val_mse only moves from 0.21150 (epoch 26) to 0.21048 (epoch 49).
model5.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
model5.fit(train_2, y_train,epochs=50,validation_data=(cv_2, y_cv),batch_size=1024,callbacks=[tensorboard_callback,checkpoint])

Epoch 1/50

Epoch 00001: val_mse improved from inf to 0.27790, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 2/50

Epoch 00002: val_mse improved from 0.27790 to 0.25849, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 3/50

Epoch 00003: val_mse improved from 0.25849 to 0.24291, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 4/50

Epoch 00004: val_mse improved from 0.24291 to 0.24005, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 5/50

Epoch 00005: val_mse improved from 0.24005 to 0.23362, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 6/50

Epoch 00006: val_mse did not improve from 0.23362
Epoch 7/50

Epoch 00007: val_mse improved from 0.23362 to 0.22624, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 8/50

Epoch 00008: val_mse improved from 0.22624 to 0.22534, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 9/50

Epoch 00009: val_mse improved from 0.22534 to 0.22257, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 10/50

Epoch 00010: val_mse did not improve from 0.22257
Epoch 11/50

Epoch 00011: val_mse improved from 0.22257 to 0.21951, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 12/50

Epoch 00012: val_mse improved from 0.21951 to 0.21886, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 13/50

Epoch 00013: val_mse did not improve from 0.21886
Epoch 14/50

Epoch 00014: val_mse did not improve from 0.21886
Epoch 15/50

Epoch 00015: val_mse did not improve from 0.21886
Epoch 16/50

Epoch 00016: val_mse improved from 0.21886 to 0.21624, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 17/50

Epoch 00017: val_mse did not improve from 0.21624
Epoch 18/50

Epoch 00018: val_mse improved from 0.21624 to 0.21621, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 19/50

Epoch 00019: val_mse improved from 0.21621 to 0.21463, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 20/50

Epoch 00020: val_mse did not improve from 0.21463
Epoch 21/50

Epoch 00021: val_mse improved from 0.21463 to 0.21389, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 22/50

Epoch 00022: val_mse improved from 0.21389 to 0.21341, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 23/50

Epoch 00023: val_mse improved from 0.21341 to 0.21303, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 24/50

Epoch 00024: val_mse did not improve from 0.21303
Epoch 25/50

Epoch 00025: val_mse did not improve from 0.21303
Epoch 26/50

Epoch 00026: val_mse improved from 0.21303 to 0.21150, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 27/50

Epoch 00027: val_mse did not improve from 0.21150
Epoch 28/50

Epoch 00028: val_mse did not improve from 0.21150
Epoch 29/50

Epoch 00029: val_mse did not improve from 0.21150
Epoch 30/50

Epoch 00030: val_mse did not improve from 0.21150
Epoch 31/50

Epoch 00031: val_mse did not improve from 0.21150
Epoch 32/50

Epoch 00032: val_mse did not improve from 0.21150
Epoch 33/50

Epoch 00033: val_mse did not improve from 0.21150
Epoch 34/50

Epoch 00034: val_mse did not improve from 0.21150
Epoch 35/50

Epoch 00035: val_mse improved from 0.21150 to 0.21143, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 36/50

Epoch 00036: val_mse did not improve from 0.21143
Epoch 37/50

Epoch 00037: val_mse did not improve from 0.21143
Epoch 38/50

Epoch 00038: val_mse did not improve from 0.21143
Epoch 39/50

Epoch 00039: val_mse improved from 0.21143 to 0.21111, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 40/50

Epoch 00040: val_mse did not improve from 0.21111
Epoch 41/50

Epoch 00041: val_mse did not improve from 0.21111
Epoch 42/50

Epoch 00042: val_mse did not improve from 0.21111
Epoch 43/50

Epoch 00043: val_mse did not improve from 0.21111
Epoch 44/50

Epoch 00044: val_mse improved from 0.21111 to 0.21085, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 45/50

Epoch 00045: val_mse did not improve from 0.21085
Epoch 46/50

Epoch 00046: val_mse improved from 0.21085 to 0.21064, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 47/50

Epoch 00047: val_mse did not improve from 0.21064
Epoch 48/50

Epoch 00048: val_mse improved from 0.21064 to 0.21063, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 49/50

Epoch 00049: val_mse improved from 0.21063 to 0.21048, saving model to /content/drive/My Drive/new/




INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/new/assets


Epoch 50/50

Epoch 00050: val_mse did not improve from 0.21048


<tensorflow.python.keras.callbacks.History at 0x7fdd5a9df908>

In [None]:
# save model
# NOTE(review): this saves model5 as it is after the last epoch; the recorded log says epoch 50
# did not improve on the best val_mse, so this file presumably differs from the best checkpoint
# written by the ModelCheckpoint callback - confirm which one should be used for prediction.
model5.save("/content/drive/My Drive/new/model6.h5")
print("Saved model to disk")

Saved model to disk


In [18]:
# Load the tab-separated test table and keep its ids for the submission frame.
X_test = pd.read_csv('/content/drive/My Drive/new/test_stg2.tsv', sep='\t')
test_ids = X_test['test_id'].values

In [None]:
# Predictions for the validation inputs.
# NOTE(review): these are not scored in any visible cell, and y_pred is overwritten by the
# test predictions further down.
y_pred=model5.predict(cv_2)

In [22]:
# Same text preparation for the test rows (this also adds/overwrites those columns on X_test).
name_test,text_test=handle_text(X_test)

In [19]:

def one_hot_encoder_test(valid):
    '''
    This function returns the One Hot Encoded vectors for unseen (e.g. test) data
    Input ->
        valid : DataFrame with the columns 'shipping' and 'item_condition_id'
    Output -> CSR matrix with the shipping columns first, then the item-condition columns
    Task   -> Nothing is fitted here: the data is only transformed with the encoders stored in
              the globals vectorizer3 (shipping) and vectorizer4 (item condition), so
              one_hot_encoder() must have been called before this function.
    '''
    encoded_parts = [
        encoder.transform(valid[column].values.reshape(-1, 1))
        for encoder, column in ((vectorizer3, 'shipping'), (vectorizer4, 'item_condition_id'))
    ]
    return hstack(encoded_parts).tocsr()


In [20]:
# One-hot encode the test rows with the already fitted encoders and densify them for Keras.
# toarray() returns a plain ndarray, whereas todense() returns the legacy np.matrix type.
X_te = one_hot_encoder_test(X_test).toarray()

In [23]:
max_length=300

# The tokenizers are re-fitted on the training texts before the test texts are encoded.
# NOTE(review): the resulting word indices only match the ones the saved model was trained with
# if name_train / text_train are identical to the training-time series - confirm, or persist
# the fitted tokenizers together with the model.
# (A `global` statement at cell level has no effect, so none is used here.)
#https://subscription.packtpub.com/book/application_development/9781782167853/1/ch01lvl1sec10/tokenizing-sentences-into-words
t_1 = Tokenizer()
t_1.fit_on_texts(name_train)
t_2 = Tokenizer()
t_2.fit_on_texts(text_train)

# Encode the test texts and bring them to the fixed input length (zeros appended at the end)
encoded_test_desc = t_2.texts_to_sequences(text_test)
encoded_test = t_1.texts_to_sequences(name_test)
padded_test_desc, padded_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (encoded_test_desc, encoded_test)
)


In [None]:
# The integer-encoded sequences are no longer referenced after this point (only their padded
# versions are used). These names exist only if the training cells ran in this kernel.
del encoded_cv
del encoded_cv_desc
del encoded_train
del encoded_train_desc

In [None]:

# Drop the train/validation input lists; only the test inputs are used from here on.
del train_2
del cv_2

In [None]:
gc.collect()

90

In [25]:
# Test inputs in the same order as the model inputs: name sequences, text sequences, one-hot features.
test_2 = [padded_test,padded_test_desc,X_te]

In [None]:
# Predictions for the test rows, on the log1p(price) scale the model was fitted on.
y_pred=model5.predict(test_2)

In [None]:
# Build the submission frame. The model was fitted on log1p(price), so expm1 maps the
# predictions back to prices. The model ends in Dense(1), so predict() yields one column per
# row; ravel() flattens it so that the 'price' column is built from a 1-D array.
# NOTE(review): the id column is called 'test_ids' here while the test file calls it 'test_id' -
# confirm which name the consumer of submit.csv expects.
df = pd.DataFrame({
    'test_ids': test_ids,
    'price': np.expm1(y_pred).ravel(),
})

In [None]:
# index=False: without it to_csv also writes the row index as an extra, unnamed first column.
df.to_csv("/content/drive/My Drive/new/submit.csv", index=False)

In [None]:
print("file saved")

In [24]:
from keras.models import load_model
# load model
# Reload the model saved as model6.h5; this makes prediction possible in a kernel where the
# training cells were not run.
model =  load_model('/content/drive/My Drive/new/model6.h5')
# summarize model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
desc (InputLayer)               [(None, 300)]        0                                            
__________________________________________________________________________________________________
name (InputLayer)               [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 300, 300)     73264200    desc[0][0]                       
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 300)     33599100    name[0][0]                       
______________________________________________________________________________________________

In [31]:
#model1=tf.keras.models.load_model('/content/drive/My Drive/new/model6.h5')
# NOTE(review): the recorded output of this cell is an InvalidArgumentError. Presumably some
# word indices in test_2 fall outside the embedding tables of the reloaded model, which would
# happen if the tokenizers were re-fitted on different training rows than the model was trained
# with - confirm against the full traceback.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])
y_pred=model.predict(test_2,batch_size=1024)

InvalidArgumentError: ignored