<a href="https://colab.research.google.com/github/sheensta/retail_products_ensemble_deep_learning/blob/main/re_training_on_full_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Required models (from model_evaluations.ipynb):
#y_NLP1 + y_CNN + y_preds_rf + y_resnet + y_preds_XGB
#y_NLP1, y_CNN, y_preds_rf, y_resnet
#Image models: resnet transfer learning
#NLP models: simple NLP, CNN, random forest, XGB

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV

import xgboost as xgb
from xgboost import XGBClassifier

import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import(GlobalMaxPooling1D, Conv1D, Embedding, Dense, Conv2D, MaxPool2D, Flatten, Dropout, BatchNormalization, GlobalAveragePooling2D)
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.applications.resnet50 import preprocess_input
from keras.applications.resnet50 import ResNet50

from gensim.models import word2vec
import nltk

import joblib

In [None]:
# Image data: the cleaned product table plus the image array stored alongside it.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_clean.csv')
# Iterating the loaded array over its first axis yields one entry per row of `df`.
# NOTE(review): assumes the .npy file is row-aligned with df_clean.csv -- confirm.
image_arrays = np.load('/content/drive/MyDrive/Colab Notebooks/np_img_array.npy')
df['array'] = list(image_arrays)

In [None]:
#Resnet_transfer
# Train a classification head on top of a frozen, ImageNet-pretrained ResNet50
# using every image in `df`, then save the fitted model to Drive.

# Stack the per-row image arrays into one batch and apply ResNet50 preprocessing.
X = preprocess_input(np.array(list(df['array'])))

# Encode the category strings as integer ids, then one-hot them.
# `le` and the one-hot `y` are module-level names that later cells reuse.
le = LabelEncoder()
num_classes = 21
y = to_categorical(le.fit_transform(df['categories']), num_classes)

base_model = ResNet50(include_top=False,
                      input_shape=(100, 100, 3),
                      weights='imagenet')

# Freeze the backbone so only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

model = Sequential()
model.add(base_model)
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.3))
# The output width follows `num_classes` so it cannot drift from the one-hot labels.
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Light geometric augmentation; every normalisation / whitening option is off.
augs_gen = ImageDataGenerator(
        featurewise_center=False,
        samplewise_center=False,
        featurewise_std_normalization=False,
        samplewise_std_normalization=False,
        zca_whitening=False,
        rotation_range=10,
        zoom_range=0.1,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        vertical_flip=False)

# NOTE(review): with all feature-wise options disabled this fit() looks
# unnecessary (no statistics are needed) -- confirm before removing.
augs_gen.fit(X)

# NOTE(review): steps_per_epoch is hard-coded; presumably ceil(len(X) / 128)
# for the original dataset -- verify if the data changes.
history = model.fit(augs_gen.flow(X, y, batch_size=128), steps_per_epoch=402, epochs=10, verbose=1)
model.save('/content/drive/MyDrive/models/full_models/FULL_resnet2.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#NLP data_DL
# Load the text table and force the cleaned description column to `str`
# (non-string cells, including missing values, become their string form).
# Note: this rebinds `df`, replacing the image table loaded earlier.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_NLP.csv').astype(
    {'description_clean': str})
# `corpus` is the Series of cleaned descriptions used by all text models below.
corpus = df['description_clean']

In [None]:
#Simple NLP
# Embedding -> Flatten -> softmax classifier over padded token-id sequences.
# `y` is the one-hot label matrix built in the ResNet cell.
# NOTE(review): assumes df_NLP.csv rows are in the same order as df_clean.csv -- confirm.

# Fit the tokenizer on the whole corpus; only the 5000 most frequent words
# are kept when texts are converted to id sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus)

# +1 because index 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1

# Convert to id sequences, then pad / truncate each one to a fixed length.
maxlen = 250
X = pad_sequences(
    tokenizer.texts_to_sequences(list(df['description_clean'])),
    padding='post',
    maxlen=maxlen,
)

# define the model
model = Sequential([
    Embedding(vocab_size, 100, input_length=maxlen),
    Flatten(),
    Dense(21, activation='softmax'),
])
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X, y, batch_size=128, epochs=6, verbose=1)
model.save('/content/drive/MyDrive/models/full_models/FULL_NLP1.h5')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
#CNN NLP
# 1-D convolutional text classifier. Reuses `X`, `y`, `vocab_size` and `maxlen`
# from the Simple NLP cell, so that cell must have been run first.
model = Sequential([
    Embedding(vocab_size, 100, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),   # 128 filters over 5-token windows
    GlobalMaxPooling1D(),                # strongest response per filter
    Dense(21, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X, y, batch_size=128, epochs=6, verbose=1)
model.save('/content/drive/MyDrive/models/full_models/FULL_NLP_CNN.h5')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
#NLP data_RF/XGB
# Train a Word2Vec model on the description corpus; its vectors are averaged
# per document further down to build features for the RF / XGB models.
feature_size = 100    # Word vector dimensionality
window_context = 30   # Context window size
min_word_count = 1    # Minimum word count
sample = 1e-3         # Downsample setting for frequent words
w2v_epochs = 50       # Number of training passes over the corpus

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in corpus]

# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`. Try the
# current names first and fall back to the 3.x names, so the cell runs on
# either major version. An unknown keyword raises TypeError before any
# training starts, so the fallback costs nothing.
try:
    w2v_model = word2vec.Word2Vec(tokenized_corpus, vector_size=feature_size,
                                  window=window_context, min_count=min_word_count,
                                  sample=sample, epochs=w2v_epochs)
except TypeError:
    w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
                                  window=window_context, min_count=min_word_count,
                                  sample=sample, iter=w2v_epochs)

def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object
        Either an object exposing word vectors under ``.wv`` (e.g. a gensim
        ``Word2Vec`` model) or anything indexable by word directly (keyed
        vectors, a plain dict).
    vocabulary : container of str
        Words that have a vector; tokens not in it are skipped.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)``; all zeros when no token
        of the document is in ``vocabulary``.
    """
    # Indexing the Word2Vec model itself (`model[word]`) is deprecated in
    # gensim 3.x and removed in 4.x; go through `.wv` when it is available.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector += vectors[word]

    # Leave the zero vector untouched rather than dividing by zero.
    if nwords:
        feature_vector /= nwords

    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector row per tokenized document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : object
        Trained model exposing its keyed vectors as ``model.wv``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array built from one ``average_word_vectors`` result per document.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; support both.
    vocab_words = getattr(keyed_vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    # A set gives O(1) membership tests in the per-token loop.
    vocabulary = set(vocab_words)

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

# One averaged Word2Vec vector per document: the feature matrix for RF / XGB.
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)

X = w2v_feature_array

# `y` is still the one-hot matrix created with to_categorical() in the ResNet
# cell. Fitted on that 2-D target, RandomForestClassifier would be trained as
# a multi-output problem, and XGBClassifier expects a 1-D label vector.
# Collapse it to the integer class ids (the same ids `le` produced).
# The ndim guard keeps the cell idempotent when it is re-run.
# NOTE: after this cell `y` is 1-D; re-run the ResNet cell before re-running
# the Keras text models, which need the one-hot form.
y = np.asarray(y)
if y.ndim == 2:
    y = y.argmax(axis=1)



In [None]:
# Random forest on the averaged Word2Vec features.
# The grid is empty, so GridSearchCV evaluates a single default-parameter
# candidate with 10-fold CV and then refits it on all of (X, y); that refit
# (`best_estimator_`) is what gets saved.
param_grid = [{}]
rf = GridSearchCV(RandomForestClassifier(random_state=42),  # seeded for reproducible refits
                  param_grid,
                  # `random_state` only takes effect with shuffle=True; recent
                  # scikit-learn raises ValueError when it is set without it.
                  # The splitter object is passed directly instead of a
                  # one-shot `.split()` generator.
                  cv=KFold(n_splits=10, shuffle=True, random_state=42),
                  verbose=1)
rf.fit(X, y)
joblib.dump(rf.best_estimator_, '/content/drive/MyDrive/models/full_models/FULL_NLP_rf.pkl')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 10.7min finished


['/content/drive/MyDrive/models/full_models/FULL_NLP_rf.pkl']

In [None]:
# XGBoost on the averaged Word2Vec features; same single-candidate CV + refit
# pattern as the random forest cell.
# - The estimator is named `xgb_clf` so it no longer shadows the `xgb` module
#   imported at the top of the notebook.
# - 'multiclass:softmax' is not an objective name XGBoost documents; the
#   documented multi-class objectives are 'multi:softmax' and 'multi:softprob'.
xgb_clf = XGBClassifier(objective='multi:softprob')

param_grid = [{}]
clf_xgb = GridSearchCV(xgb_clf,
                       param_grid,
                       # `random_state` only takes effect with shuffle=True;
                       # recent scikit-learn raises ValueError when it is set
                       # without it.
                       cv=KFold(n_splits=10, shuffle=True, random_state=42),
                       verbose=1)
clf_xgb.fit(X, y)
joblib.dump(clf_xgb.best_estimator_, '/content/drive/MyDrive/models/full_models/FULL_NLP_xgb.pkl')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 127.6min finished


['/content/drive/MyDrive/models/full_models/FULL_NLP_xgb.pkl']