<a href="https://colab.research.google.com/github/sheensta/retail_products_ensemble_deep_learning/blob/main/Google%20Colab%20Notebooks/NLP_Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn

In [None]:
# Load the pre-cleaned product table used by the NLP experiments
# (path is under /content/drive, so presumably Google Drive must be mounted first).
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_NLP.csv')
# Force every description to str; missing values read as NaN become the text 'nan'.
df['description_clean'] = df['description_clean'].astype(str)
# `corpus` is the Series of description strings reused by the word2vec and Keras sections below.
corpus = df['description_clean']

Word Similarity with word2vec

In [None]:
from gensim.models import word2vec
import nltk

In [None]:
# Hyper-parameters passed to Word2Vec below.
feature_size = 100    # Word vector dimensionality
window_context = 30          # Context window size
min_word_count = 1   # Minimum word count (1 keeps every token)
sample = 1e-3   # Downsample setting for frequent words

# Split each description into word / punctuation tokens.
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in corpus]

# Train word2vec on the tokenized descriptions for 50 passes.
# NOTE(review): `size=` and `iter=` are the gensim < 4.0 keyword names
# (renamed `vector_size=` / `epochs=` in gensim 4) — confirm the pinned gensim version.
# NOTE(review): no `seed`/`workers` is passed, so vectors may differ between runs.
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object
        Either a trained model exposing its keyed vectors as ``model.wv``
        (e.g. gensim ``Word2Vec``) or any mapping-like object that returns a
        vector for ``obj[token]``.
    vocabulary : container of str
        Tokens that have a vector; tokens outside it are skipped.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)``. All zeros when none of
        the tokens is in ``vocabulary``.
    """
    # Index the keyed vectors rather than the model itself: `model[word]` is
    # deprecated in gensim 3.x and removed in gensim 4. Falling back to `model`
    # keeps callers that already pass a KeyedVectors / dict working.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector += vectors[word]

    # Avoid dividing by zero for documents without any known token.
    if nwords:
        feature_vector /= nwords

    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    """Turn a tokenized corpus into a 2-D array of averaged word vectors.

    `corpus` is an iterable of token lists, `model` a trained word2vec model
    whose `wv.index2word` lists its vocabulary, and `num_features` the vector
    dimensionality. Returns one row per document, each produced by
    `average_word_vectors`.
    """
    known_tokens = set(model.wv.index2word)
    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_tokens, num_features))
    return np.array(rows)

# One averaged word2vec vector per description: rows follow `tokenized_corpus`,
# columns are the `feature_size` embedding dimensions.
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)

In [None]:
# Sanity check: (number of documents, feature_size) is expected here.
pd.DataFrame(w2v_feature_array).shape

In [None]:
# Qualitative check of the embedding: for a few category-like probe words,
# list the 5 nearest neighbours returned by most_similar.
# NOTE(review): a probe word missing from the model vocabulary would raise KeyError —
# presumably all of these occur in the corpus.
similar_words = {search_term: [item[0] for item in w2v_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['pet','electronics','beauty','industrial','baby','arts','outdoors']}
similar_words

In [None]:
#@title
# Train/test split of the averaged word2vec features for the scikit-learn baselines.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras
# Label encoding / one-hot encoding is disabled here: the scikit-learn
# classifiers below are fit on the raw category values.
#le = LabelEncoder()
#le.fit(df['categories'])
#y = list(le.transform(df['categories']))

#num_classes = 21
y = list(df['categories'])
X = w2v_feature_array
# 70/30 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
#y_train = keras.utils.to_categorical(y_train, num_classes)
#y_test = keras.utils.to_categorical(y_test, num_classes)

In [None]:
# Persist the averaged word2vec features together with their category labels.
# Note: from here on `w2v_feature_array` is a DataFrame (later cells rely on that name).
w2v_feature_array = pd.DataFrame(X)
# pd.concat only accepts pandas objects; `y` is a plain Python list, so wrap it
# in a Series that shares the feature frame's index before concatenating.
labels = pd.Series(y, index=w2v_feature_array.index, name='categories')
a = pd.concat([w2v_feature_array, labels], axis=1)
a.to_csv('/content/drive/MyDrive/Colab Notebooks/w2v_feature_array_evaluation.csv', index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV

# Logistic-regression baseline on the word2vec features.
# The grid is a single empty dict, so GridSearchCV only cross-validates the
# default estimator and then refits it on all of X_train.
param_grid = [{}]
# shuffle=True is required for random_state to be meaningful; recent
# scikit-learn raises ValueError when random_state is set without it.
# The splitter is passed directly instead of a one-shot generator from .split().
lg = GridSearchCV(LogisticRegression(),
                  param_grid,
                  cv=KFold(n_splits=10, shuffle=True, random_state=42),
                  verbose=1)
y_preds_lg = lg.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the logistic-regression predictions on the test split.
report_lg = classification_report( y_test, y_preds_lg)
print(report_lg)

In [None]:
# Score of the refit best estimator on the test split (mean accuracy with the default scorer).
lg.score(X_test, y_test)

In [None]:
# Random-forest baseline on the word2vec features; the single empty grid
# means only the default estimator is cross-validated and then refit.
param_grid = [{}]
# shuffle=True is required for random_state to be meaningful; recent
# scikit-learn raises ValueError when random_state is set without it.
rf = GridSearchCV(RandomForestClassifier(),
                  param_grid,
                  cv=KFold(n_splits=10, shuffle=True, random_state=42),
                  verbose=1)
# Class probabilities (one column per class), kept for the CSV export further down.
y_preds_rf = rf.fit(X_train, y_train).predict_proba(X_test)

In [None]:
# Number of samples in the held-out split.
len(X_test)

In [None]:
from sklearn.metrics import classification_report
# `y_preds_rf` holds predict_proba output (a probability matrix), which
# classification_report cannot compare against class labels. Use the label
# predictions of the fitted search object for the report instead.
report_rf = classification_report(y_test, rf.predict(X_test))
print(report_rf)

In [None]:
# Export the random-forest test-set predictions to Drive without an index column
# (presumably consumed by a later ensemble step — confirm).
pd.DataFrame(y_preds_rf).to_csv('/content/drive/MyDrive/Colab Notebooks/y_preds_rf.csv', index = False)

In [None]:
import joblib
# Serialize the refit best random forest.
# NOTE(review): relative path — the file lands in the current working directory,
# whereas the XGBoost estimator below is saved under /content/drive/MyDrive/models/.
joblib.dump(rf.best_estimator_, 'NLP_rf.pkl')

In [None]:
# Sanity check: shape of the label predictions of the refit random forest on the test split.
rf.best_estimator_.predict(X_test).shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras

# Integer-encode the category labels (computed once; the original cell
# transformed them twice with identical results).
le = LabelEncoder()
y = list(le.fit_transform(df['categories']))

# Width of the one-hot label vectors; must match the Dense(21) output layers used below.
num_classes = 21

X = w2v_feature_array
# test_size=0.3 with the same seed matches the other train/test splits in this
# notebook, so predictions exported from different models refer to the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
# One-hot encode the integer labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
X_train = np.array(X_train)

In [None]:
#xgboost hyperparameter tuning
import xgboost as xgb
from scipy import stats
from xgboost import XGBClassifier
# GridSearchCV is imported here as well so the cell does not depend on an earlier one.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import f1_score
X = pd.DataFrame(w2v_feature_array)
# 'multiclass:softmax' is not an XGBoost objective; 'multi:softprob' is the
# multi-class objective that yields per-class probabilities (needed by predict_proba).
# NOTE(review): this rebinds `xgb`, shadowing the module alias imported above.
xgb = XGBClassifier(objective = 'multi:softprob')

# XGBClassifier expects a 1-D vector of class indices; y_train was one-hot
# encoded by the preceding cell, so collapse it back when it is 2-D.
y_train_labels = np.argmax(y_train, axis=1) if np.ndim(y_train) > 1 else np.asarray(y_train)

# Single empty grid: cross-validate the default configuration, then refit on all of X_train.
param_grid = [{}]
# shuffle=True is required for random_state to be meaningful; recent
# scikit-learn raises ValueError when random_state is set without it.
clf_xgb = GridSearchCV(xgb,
                       param_grid,
                       cv=KFold(n_splits=10, shuffle=True, random_state=42),
                       verbose=1)
y_preds_xbg = clf_xgb.fit(X_train, y_train_labels).predict_proba(X_test)

In [None]:
# Export the XGBoost test-set probabilities to Drive.
# NOTE(review): written with the default index column, unlike y_preds_rf.csv (index=False) — confirm which the reader expects.
pd.DataFrame(y_preds_xbg).to_csv('/content/drive/MyDrive/Colab Notebooks/y_preds_xgb.csv')

In [None]:
# Serialize the refit best XGBoost estimator to Drive (relies on `joblib` imported in an earlier cell).
joblib.dump(clf_xgb.best_estimator_, '/content/drive/MyDrive/models/NLP_XGB.pkl')

In [None]:
# classification_report compares class labels, but y_test is one-hot encoded
# and y_preds_xbg is a probability matrix: reduce both to class indices first.
# (The arg-max column equals the encoded label as long as every class index
# 0..num_classes-1 was present during training — presumably true here.)
y_test_labels = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)
y_pred_labels = np.argmax(y_preds_xbg, axis=1)
report_xgb = classification_report(y_test_labels, y_pred_labels)
# Print the XGBoost report (the original printed the random-forest report here).
print(report_xgb)

In [None]:
import joblib
# Serialize the whole fitted GridSearchCV object (not only best_estimator_)
# to a relative path in the current working directory.
filename = 'NLP_xgb.sav'
joblib.dump(clf_xgb, filename)



Keras Word Preprocessing

In [None]:
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

# Raw description strings are the inputs for the Keras models.
X = list(df['description_clean'])
# Integer-encode the category labels.
le = LabelEncoder()
le.fit(df['categories'])
y = list(le.transform(df['categories']))
# 70/30 split with a fixed seed (train_test_split comes from an earlier cell's import).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

import keras
# Width of the one-hot label vectors; matches the Dense(21) output layers below.
num_classes = 21
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Keep only the 5000 most frequent words when converting text to sequences.
# NOTE(review): the tokenizer is fit on the full `corpus`, i.e. including the
# descriptions that end up in the test split.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus)

# Replace each description by its list of integer word indices.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Embedding input dimension: every word in word_index plus one extra slot.
# NOTE(review): word_index is not truncated by num_words, so this is
# presumably larger than the 5000 indices that can actually occur — confirm.
vocab_size = len(tokenizer.word_index) + 1 

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to every Keras model below.
maxlen = 250
# padding='post' appends the padding at the end; maxlen=250 fixes the length
# (longer sequences are cut by pad_sequences' default truncation setting).
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
#TRAINING A SIMPLE DEEP LEARNING MODEL (accuracy similar to Random Forest)

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
# Import Embedding from the public `keras.layers` namespace; the private
# `keras.layers.embeddings` module path is not available in newer Keras releases.
from keras.layers import Embedding

# define the model: learned 100-d embedding -> flatten -> 21-way softmax
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(Flatten())
model.add(Dense(21, activation='softmax'))
# compile the model (labels are one-hot, hence categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None"
model.summary()

In [None]:
# NOTE(review): repeats the compile call from the previous cell with identical arguments.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 6 epochs in batches of 128. The test split is passed as validation
# data, so the reported val_* metrics are measured on the test set.
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_data=(X_test, y_test))

In [None]:
# Save the trained model to Drive in HDF5 format (note the space in the file name).
model.save('/content/drive/MyDrive/models/NLP_custom_trainedsimple DL.h5')

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Map each GloVe token to its 100-d float32 vector.
embeddings_dictionary = dict()
# `with` guarantees the file is closed even if parsing a line fails.
with open('/content/drive/MyDrive/Colab Notebooks/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        # Skip blank lines instead of failing on records[0].
        if not records:
            continue
        # Each line is "<token> <v1> <v2> ...".
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Weight matrix for the Embedding layers: row i holds the GloVe vector of the
# word whose Tokenizer index is i. Rows stay all-zero for words without a
# GloVe entry (and for any index not present in word_index).
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
#Embedding with pre-trained glove model

# Same flatten + softmax architecture as the simple model, but the embedding
# is initialised from the GloVe matrix and left trainable (fine-tuned).
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(21, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# summary() prints the layer table itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
model.summary()


In [None]:
# Train the GloVe-initialised model for 6 epochs; the test split doubles as validation data.
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_data=(X_test, y_test))

In [None]:
# Save the GloVe-embedding model to Drive in HDF5 format.
model.save('/content/drive/MyDrive/models/NLP_GloVeEmbedding.h5')

In [None]:
#using a CNN custom
from keras.layers import Conv1D 
from keras.layers import GlobalMaxPooling1D

# Learned embedding -> one Conv1D (128 filters, kernel 5) -> global max pooling -> 21-way softmax.
model = Sequential()

model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(21, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 6 epochs, batches of 128; validation metrics are computed on the test split.
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_data=(X_test, y_test))

In [None]:
# Save the custom-embedding CNN to Drive in HDF5 format.
model.save('/content/drive/MyDrive/models/NLP_custom_CNN.h5')

In [None]:
#using a CNN with GloVe embedding
from keras.layers import Conv1D 
from keras.layers import GlobalMaxPooling1D

model = Sequential()

# Embedding initialised from the GloVe matrix and frozen (trainable=False).
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

# Same convolutional head as the custom-embedding CNN.
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(21, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 20 epochs here (the other CNN runs use 6); validation metrics come from the test split.
history = model.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_data=(X_test, y_test))

In [None]:
# Save the GloVe CNN to Drive in HDF5 format.
model.save('/content/drive/MyDrive/models/NLP_GloVe_CNN.h5')

In [None]:
#Using LSTM model with custom embedding
from keras.layers import LSTM

# Learned embedding -> LSTM with 128 units -> 21-way softmax.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(LSTM(128))
model.add(Dense(21, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# No batch_size is passed here, so Keras' default batch size applies
# (the other runs in this notebook pass one explicitly).
history = model.fit(X_train, y_train, epochs=10, verbose=1, validation_data=(X_test, y_test))

In [None]:
#Using LSTM model with GloVe embedding

# Frozen GloVe embedding -> LSTM with 128 units -> 21-way softmax
# (LSTM is imported in the previous cell).
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(LSTM(128))
model.add(Dense(21, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 10 epochs, batches of 128; validation metrics are computed on the test split.
history = model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_data=(X_test, y_test))

Create model

In [None]:
# NOTE(review): despite its name this is just an alias of X_train (the padded
# index sequences when the cells are run in order), not an embedding weight matrix.
train_embedding_weights = X_train

In [None]:
from keras.layers import concatenate
from keras.layers import Input
from keras.layers import MaxPooling1D
from keras.layers import Dropout
from keras.models import Model

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable=False, extra_conv=False):
    """Build and compile a 1-D convolutional text classifier with a 21-way softmax output.

    Parameters
    ----------
    embeddings, num_words, embedding_dim, trainable
        Accepted for interface compatibility but not read by the body: the
        embedding layer is sized from the module-level ``vocab_size`` with a
        fixed width of 100, and no pretrained weights are loaded.
    max_sequence_length : int
        Length of the integer input sequences; sizes both the Input layer and
        the Embedding layer.
    extra_conv : bool
        If True, the classifier head is fed by the multi-kernel branch
        (kernel sizes 3, 4 and 5, concatenated). Otherwise it is fed by the
        single kernel-size-3 convolution.

    Returns
    -------
    keras.models.Model
        Model compiled with categorical cross-entropy, the 'adadelta'
        optimizer and an accuracy metric. The model summary is printed as a
        side effect.
    """
    # Use the function's own sequence length for input_length so it always
    # agrees with the Input layer below (the original read the global `maxlen`).
    embedding_layer = Embedding(vocab_size, 100, input_length=max_sequence_length)
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Multi-kernel branch in the style of the Yoon Kim model
    # (https://arxiv.org/abs/1408.5882): parallel convolutions with kernel
    # sizes 3, 4 and 5, each max-pooled, then concatenated along axis 1.
    convs = []
    filter_sizes = [3,4,5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)

    l_merge = concatenate([convs[0],convs[1],convs[2]],axis=1)

    # Simpler alternative branch: a single kernel-size-3 convolution followed
    # by max pooling with pool_size=3.
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
    pool = MaxPooling1D(pool_size=3)(conv)

    if extra_conv==True:
        # Multi-kernel branch.
        x = Dropout(0.5)(l_merge)
    else:
        # Single-convolution branch.
        x = Dropout(0.5)(pool)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    # Output layer: softmax over the 21 product categories (single-label,
    # multi-class), paired with categorical cross-entropy below.
    preds = Dense(21,activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])
    model.summary()
    return model


In [None]:
# Build the single-convolution variant (extra_conv keeps its default of False).
# NOTE(review): ConvNet's body never reads `embeddings`, `num_words`,
# `embedding_dim` or `trainable`, so train_embedding_weights, 35987, 250 and
# False have no effect on the model built here.
model = ConvNet(train_embedding_weights, maxlen, 35987, 250, False)

In [None]:
# Train the ConvNet for 50 epochs in batches of 64; validation metrics are computed on the test split.
history = model.fit(X_train, y_train, epochs=50, batch_size=64,validation_data=(X_test,y_test))

In [None]:
#Trying another model - also 5%, very slow to train

from keras.models import Sequential
# Embedding is taken from the public `keras.layers` namespace; the extra import
# from the private `keras.layers.embeddings` path was redundant and is not
# available in newer Keras releases.
from keras.layers import Dense,Embedding,LSTM,GRU

# Learned embedding -> GRU with 32 units (input and recurrent dropout 0.2) -> 21-way softmax.
# NOTE(review): this cell only builds and compiles the model; it contains no fit call.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(GRU(units = 32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(21, activation = 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
#another CNN model, custom
# Learned embedding -> Conv1D (32 filters, kernel 8) -> max pooling (2) -> flatten
# -> Dense(10, relu) -> 21-way softmax.
# Conv1D and MaxPooling1D come from imports in earlier cells.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(21, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 6 epochs in batches of 128; validation metrics are computed on the test split.
history = model.fit(X_train, y_train, epochs=6, batch_size=128,validation_data=(X_test,y_test))

In [None]:
#another CNN model, GloVe
# Same architecture as the previous CNN, but the embedding is initialised from
# the GloVe matrix and left trainable (fine-tuned during training).
model = Sequential()
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(21, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 10 epochs in batches of 256; validation metrics are computed on the test split.
history = model.fit(X_train, y_train, epochs=10, batch_size=256,validation_data=(X_test,y_test))