# Amazon Luxury Product Analysis

## Loading the data

In [3]:
import pandas as pd
import numpy as np

In [4]:
import os

# Raw CSV with the preprocessed reviews. The GitHub access token is no longer embedded in
# the notebook: when one is required, export it as GITHUB_RAW_TOKEN before starting the kernel.
DATA_URL = "https://raw.githubusercontent.com/DanishDahaka/dsba_winter2020/master/Project/Luxury_Beauty/Preprocessing/Data/hair.csv"
github_token = os.environ.get("GITHUB_RAW_TOKEN")
data_url = f"{DATA_URL}?token={github_token}" if github_token else DATA_URL

df = pd.read_csv(data_url)
# Drop the cluster column.
df = df.drop("cluster", axis=1)
# Keep a single row per distinct review text.
df = df.drop_duplicates(subset=["text_generation_preprocessed"])
# Encode the sentiment label numerically: negative -> 0, positive -> 1.
df['target_sentiment_binary'] = df.target_sentiment_binary.map({'negative' : 0, 'positive' : 1})

In [5]:
# Dimensions (rows, columns) of the frame after dropping the cluster column and duplicate texts.
df.shape

(68368, 4)

In [6]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0.1,Unnamed: 0,text_generation_preprocessed,text_preprocessed_for_sentiment,target_sentiment_binary
0,34729,this is my favorite hand cream. it is not grea...,"['favorit', 'hand', 'cream', 'greasi', 'nice',...",1
2,35638,"best hand cream ever. one at work, one in my p...","['best', 'hand', 'cream', 'work', 'purs', 'nig...",1
4,44680,keep this by my kitchen sink. after every wash...,"['kitchen', 'sink', 'wash', 'add', 'hand', 'lo...",1
5,44745,find this soap to be gentle and quite excellen...,"['soap', 'gentl', 'excel', 'qualityfin', 'soap...",1
6,45096,love this hand cream. work in job where am con...,"['love', 'hand', 'cream', 'work', 'job', 'cons...",1


In [7]:
# Same dimension check as before; df has not been modified since the previous shape cell.
df.shape

(68368, 4)

## Word Embedding (Tf-idf) - Binomial target

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on every row, i.e. before the train/test split
# that follows, so test documents also contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
review_tokens = df['text_preprocessed_for_sentiment']
X = vectorizer.fit_transform(review_tokens)

# Target vector: the 0/1 sentiment label as a plain NumPy array.
y = df["target_sentiment_binary"].to_numpy()

In [9]:
## split dataset (Binomial target)
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed; stratifying keeps the class ratio equal in both partitions.
split_options = dict(
    test_size=0.3,
    random_state=42,
    stratify=df['target_sentiment_binary'],
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
#From sparse to dense data
#toarray() returns a plain numpy.ndarray; todense() would return a numpy.matrix,
#which recent scikit-learn releases reject as estimator input.
X_test_A = X_test.toarray()

In [11]:
#From sparse to dense data
#toarray() returns a plain numpy.ndarray; todense() would return a numpy.matrix,
#which recent scikit-learn releases reject as estimator input.
X_train_A = X_train.toarray()

Motivation for function: https://stackoverflow.com/questions/41538692/using-sparse-matrices-with-keras-and-tensorflow 

## Neural Network (Keras)

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from keras.models import Sequential #Our model type
from keras.layers import Dense #Type of layer

#Combining Keras with Sklearn (enables us to do K-fold crossvalidation and things like that - not a necessary package)
#The wrapper module is absent from recent Keras releases. It is optional here, so a
#missing module must not abort the remaining imports of this cell.
try:
    from keras.wrappers.scikit_learn import KerasClassifier
except ImportError:
    KerasClassifier = None #Not referenced by any other cell of this notebook

from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

ModuleNotFoundError: No module named 'keras'

In [None]:
#When working with a neural network, we need one target column per output class.
#The sentiment target has two classes (0 = negative, 1 = positive), so the labels are
#one-hot encoded into two dummy columns with to_categorical, which does the same as the
#one hot encoder from sklearn. num_classes is fixed so that both partitions always get
#exactly two columns, matching the 2-unit softmax output layer.

dummy_y_train = to_categorical(y_train, num_classes=2)
dummy_y_test = to_categorical(y_test, num_classes=2)

Motivation for the `batch_generator` function below: https://stackoverflow.com/questions/41538692/using-sparse-matrices-with-keras-and-tensorflow 

In [None]:
#Keras NN model cannot handle sparse matrix directly. 
#The data has to be dense array or matrix, 
#but transforming the whole training data to dense array won’t fit into my RAM. 
#So I had to define a function, which generates iterable generator object, 
#so that it can be fed to NN model.
def batch_generator(X_data, y_data, batch_size):
    """Endlessly yield dense ``(X_batch, y_batch)`` mini-batches.

    Rows are served in their stored order, ``batch_size`` at a time. After the
    last row has been served the generator starts again from the first row, so
    it never terminates by itself. Only the rows of the current batch are
    converted to a dense array.

    Parameters
    ----------
    X_data : sparse matrix or array
        Feature matrix, indexable as ``X_data[rows, :]``.
    y_data : array
        Targets, one entry (or row) per sample of ``X_data``.
    batch_size : int
        Number of rows per batch; the final batch of a pass is smaller when
        the sample count is not a multiple of ``batch_size``.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` where ``X_batch`` is a ``numpy.ndarray``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    samples_per_epoch = X_data.shape[0]
    # Whole number of batches per pass, rounded up for a trailing partial batch.
    number_of_batches = -(-samples_per_epoch // batch_size)
    index = np.arange(np.shape(y_data)[0])
    counter = 0
    while True:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:]
        if hasattr(X_batch, "toarray"):
            # Densify only the current batch of a sparse matrix.
            X_batch = X_batch.toarray()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.asarray(X_batch), y_batch
        # ">=" rather than ">": with ">" an extra, empty batch was produced whenever
        # the sample count was an exact multiple of batch_size.
        if counter >= number_of_batches:
            counter = 0

Documentation for EarlyStopping: https://keras.io/api/callbacks/early_stopping/ 

In [None]:
# Number of input features, i.e. the number of columns of the TF-IDF training matrix.
n_cols = X_train.shape[1]

# Stop training once the monitored 'accuracy' metric has failed to improve for 2 epochs.
early_stopping_monitor = EarlyStopping(patience=2, monitor='accuracy')

In [None]:
from keras.callbacks import ModelCheckpoint

# define the checkpoint
# The file name template embeds the epoch number and the loss of the saved epoch.
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Write a checkpoint only when the monitored loss reaches a new minimum.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): callbacks_list is not passed to the model.fit call in this notebook, which
# receives only the early-stopping callback - confirm whether checkpointing is intended.
callbacks_list = [checkpoint]

fit_generator documentation: 

https://www.kite.com/python/docs/keras.Model.fit_generator

https://www.pyimagesearch.com/2018/12/24/how-to-use-keras-fit-and-fit_generator-a-hands-on-tutorial/

In [None]:
%%time
model = Sequential()

model.add(Dense(100,activation='relu', input_shape = (n_cols,)))
model.add(Dense(50,activation='relu'))
model.add(Dense(2,activation='softmax')) 
model.compile(optimizer = 'adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(batch_generator(X_train, dummy_y_train, 20),
                    epochs = 20,
                    verbose=1,
                    steps_per_epoch = X_train.shape[1]/20, #Very important so it doesent run forever
                    callbacks = [early_stopping_monitor]
                    )

In [None]:
# Print a summary of the Keras model's layers.
model.summary()

In [None]:
# load_model is only needed by the commented-out reload cell that follows.
from keras.models import load_model
# Save the trained model to 'ann_model.h5' (path relative to the working directory).
model.save('ann_model.h5')

In [None]:
#from keras.models import load_model
#model = load_model('ann_model.h5')

In [None]:
#Training accuracy per epoch (Binomial model)
accuracy_per_epoch = history.history['accuracy']
plt.plot(accuracy_per_epoch)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('model accuracy')
plt.show()

Evaluation documentation: https://keras.io/api/models/model_training_apis/ 

In [None]:
#Evaluation of Binomial model
#No `steps` limit: with steps=10 and batch_size=20 the evaluation stopped after
#10 batches, i.e. after the first 200 test rows. Now the complete test set is scored.
model.evaluate(X_test_A, dummy_y_test, verbose=1, batch_size=20)

### Evaluation of Binomial model

In [None]:
## encode y
#Position -> label lookup over the sorted distinct test labels, and its inverse
dic_y_mapping = dict(enumerate(np.unique(y_test)))
inverse_dic = {label: position for position, label in dic_y_mapping.items()}
#Replace every test label by its position in the sorted label list
y_test = np.array([inverse_dic[label] for label in y_test])

In [None]:
## test
#Class probabilities for every test row, then the label of the most probable class
predicted_prob = model.predict(X_test_A)
predicted = [dic_y_mapping[class_idx] for class_idx in np.argmax(predicted_prob, axis=1)]

In [None]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
sns.set()

cf_matrix_bi = confusion_matrix(y_test, predicted)

#Lets look at it graphically:
#Every cell is annotated with its name, its absolute count and its share of all test rows.
group_names = ["TN", "FP", "FN", "TP"]
cell_counts = cf_matrix_bi.flatten()
cell_shares = cell_counts / np.sum(cf_matrix_bi)
group_counts = [format(count, "0.0f") for count in cell_counts]
group_percentages = [format(share, ".2%") for share in cell_shares]
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

fig, ax = plt.subplots()
sns.heatmap(cf_matrix_bi, annot=labels, fmt="", cmap="Blues", ax=ax)
#From here on `labels` holds the tick labels instead of the cell annotations
labels = ['Negative', 'Positive']
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted data')
ax.set_ylabel('True labels')
plt.show()

In [None]:
from sklearn.metrics import classification_report

#Predicted class probabilities, reduced to the index of the most probable class per row
y_pred = model.predict(X_test_A, batch_size=64, verbose=1)
y_pred_bool = y_pred.argmax(axis=1)

ann_report = classification_report(y_test, y_pred_bool)
print(ann_report)

In [None]:
from sklearn.metrics import roc_curve, auc

#Roc_curve, scored with the second probability column (class index 1)
positive_class_scores = y_pred[:,1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)

#AUC
roc_auc = auc(fpr, tpr)

#Plot
plt.figure(figsize=[7, 6])
ann_legend = "ANN Neural Network (AUC={:0.2f})".format(roc_auc)
plt.plot(fpr, tpr, label=ann_legend, linewidth=2)

#Diagonal reference line
plt.plot([0, 1], [0, 1], color="green", linestyle="--", label="Random (AUC=0.50)")
plt.xlabel("FPR", fontsize=13)
plt.ylabel("TPR", fontsize=13)
plt.legend(loc="lower right", fontsize=13)
plt.show()

## ANN Sklearn (MLP)

Documentation for MLP: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html?highlight=mlp#sklearn.neural_network.MLPClassifier 

In [None]:
from sklearn.neural_network import MLPClassifier
mlp_bi = MLPClassifier(
    #One tuple entry per hidden layer, each entry being that layer's neuron count:
    #2 hidden layers with 100 neurons each. The previous (100, 2) created a second
    #hidden layer of only 2 neurons.
    hidden_layer_sizes=(100, 100),
    activation="relu", #For the hidden layers
    solver='adam',
    batch_size=200,
    max_iter=5, #Number of epochs
    verbose=True, #Print progress
    early_stopping=True, #terminate training when validation score is not improving
    n_iter_no_change=2, #Maximum number of epochs to not meet tol improvement.
    validation_fraction=0.1, #default
    random_state=42 
)

#out_activation_ is not assigned by hand: fit() sets this attribute itself from the target
#type (logistic for a binary target), so a value assigned before fitting is overwritten.

In [None]:
# Train the MLP on the dense training features and the 0/1 sentiment labels.
mlp_bi.fit(X_train_A, y_train)

In [None]:
from sklearn.metrics import classification_report

labels = ["Negative", "Positive"]
#Same report for the training partition first and the test partition second
for y_true, X_part in ((y_train, X_train_A), (y_test, X_test_A)):
    print(classification_report(y_true, mlp_bi.predict(X_part), target_names=labels))

In [None]:
from sklearn.metrics import roc_curve, auc

#Roc_curve, scored with the second probability column returned by predict_proba
mlp_scores = mlp_bi.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, mlp_scores)

#AUC
roc_auc = auc(fpr, tpr)

#Plot
plt.figure(figsize=[7, 6])
mlp_legend = "MLP Neural Network (AUC={:0.2f})".format(roc_auc)
plt.plot(fpr, tpr, label=mlp_legend, linewidth=2)

#Diagonal reference line
plt.plot([0, 1], [0, 1], color="green", linestyle="--", label="Random (AUC=0.50)")
plt.xlabel("FPR", fontsize=13)
plt.ylabel("TPR", fontsize=13)
plt.legend(loc="lower right", fontsize=13)
plt.show()

In [None]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
sns.set()

mlp_test_predictions = mlp_bi.predict(X_test)
cf_matrix = confusion_matrix(y_test, mlp_test_predictions)

#Lets look at it graphically:
#Annotation per cell: name, absolute count, share of all test rows (one text line each).
group_names = ["TN", "FP", "FN", "TP"]
matrix_total = np.sum(cf_matrix)
group_counts = [format(count, "0.0f") for count in cf_matrix.flatten()]
group_percentages = [format(share, ".2%") for share in cf_matrix.flatten() / matrix_total]
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

fig, ax = plt.subplots()
sns.heatmap(cf_matrix, annot=labels, fmt="", cmap="Blues", ax=ax)
#From here on `labels` holds the tick labels instead of the cell annotations
labels = ['Negative', 'Positive']
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted data')
ax.set_ylabel('True labels')
plt.show()

Inspiration for pickle: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/ 

In [None]:
#No installation needed: `pickle` is part of the Python standard library and can be imported directly.

In [None]:
#import pickle

In [None]:
# save the model to disk
#filename = 'mlp_model_bi.sav'
#pickle.dump(mlp_bi, open(filename, 'wb'))

In [None]:
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))