In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import scikitplot as skplt

from keras import callbacks
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.models import Sequential
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 10)

plt.rc('figure', figsize=(10, 7))

num_epoch = 5

# Data Structure

In [None]:
# Load the raw reviews and discard the header-less leading column, which pandas
# reads in under the name "Unnamed: 0".
raw_csv_path = '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
data = pd.read_csv(raw_csv_path).drop(columns="Unnamed: 0")
data

In [None]:
# Lower-cased unique department names; rows with a missing department are skipped.
department_list = list(map(str.lower, data['Department Name'].dropna().unique()))
department_list

In [None]:
# Lower-cased unique class names; rows with a missing class are skipped.
class_list = list(map(str.lower, data['Class Name'].dropna().unique()))
class_list

In [None]:
# Department names, class names and a few hand-picked garment words merged into one array.
extra_garment_terms = ['dress', 'petite', 'petit', 'skirt', 'shirt', 'jacket', 'intimate', 'blouse', 'coat', 'sweater']
department_and_class = np.concatenate([department_list, class_list, extra_garment_terms])
department_and_class

In [None]:
# Keep only the review text and the recommendation flag.
# .copy() makes review_data an independent frame: later cells mutate it
# (in-place dropna, overwriting the 'Review Text' column), and doing that on a
# slice of `data` triggers pandas' SettingWithCopyWarning with ambiguous semantics.
review_data = data[['Review Text', 'Recommended IND']].copy()
review_data

In [None]:
review_data.isnull().sum().sort_values()

In [None]:
# Remove every row that has a missing value in either column.
review_data = review_data.dropna(axis='index')

In [None]:
review_data

# Basic Visualization

In [None]:
#import for test train split and vect
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(data):
    """Fit a word-level TF-IDF vectorizer on ``data`` and transform it.

    Args:
        data: Iterable of raw text documents.

    Returns:
        tuple: ``(train, tfidf_vectorizer)`` - the matrix returned by
        ``fit_transform`` and the fitted ``TfidfVectorizer``.
    """
    tfidf_vectorizer = TfidfVectorizer(
        min_df=3,  # drop terms that occur in fewer than 3 documents
        max_features=None,
        analyzer='word',
        # These options are booleans. They used to be passed as the integer 1,
        # which scikit-learn's parameter validation deprecated and newer
        # releases reject; True is equivalent on older releases.
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )

    train = tfidf_vectorizer.fit_transform(data)

    return train, tfidf_vectorizer

In [None]:
from sklearn.decomposition import  TruncatedSVD
import matplotlib
import matplotlib.patches as mpatches


def plot_LSA(test_data, test_labels):
    """Scatter-plot a 2-component truncated-SVD (LSA) projection of ``test_data``.

    The points are drawn on the current matplotlib figure; the caller is
    expected to call ``plt.show()``.

    Args:
        test_data: 2-D feature matrix accepted by ``TruncatedSVD``.
        test_labels: Per-row numeric labels, passed to ``plt.scatter`` as the
            colour values. The legend text assumes 0 / 1 "Recommended IND" labels.
    """
    # Reduce to 2 dimensions; the projection is fitted on the same data it transforms.
    lsa = TruncatedSVD(n_components=2)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)

    # With binary 0/1 labels the lowest value is drawn orange and the highest blue.
    colors = ['orange', 'blue', 'blue']
    plt.scatter(lsa_scores[:, 0], lsa_scores[:, 1], s=8, alpha=.8, c=test_labels,
                cmap=matplotlib.colors.ListedColormap(colors))

    not_recommended_patch = mpatches.Patch(color='orange', label='Recommended IND = 0')
    recommended_patch = mpatches.Patch(color='blue', label='Recommended IND = 1')
    plt.legend(handles=[not_recommended_patch, recommended_patch], prop={'size': 12})

In [None]:
X = review_data["Review Text"]
y = review_data["Recommended IND"]

# Build the word index from the review texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# One more than the number of distinct words (word indices start at 1).
vocabulary_size = 1 + len(tokenizer.word_index)
print(vocabulary_size)

# Cap every sequence at 70 tokens: longer ones are truncated, shorter ones are
# padded at the end ('post').
max_length = 70

sequences = tokenizer.texts_to_sequences(X)
features = pad_sequences(sequences, padding='post', maxlen=max_length)

# Review Text Feature Transformation

In [None]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# If the NLTK stopwords corpus is not installed and loading it fails, run the download command below once.
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
### Text Normalizing function. Part of the following function was taken from this link. 
def clean_text(text):
    """Normalize one review for tokenization.

    Steps, in order: lower-case and split on whitespace, drop English stop
    words, replace unwanted characters, expand common contractions, put spaces
    around the remaining symbols, and Porter-stem every token.

    Args:
        text (str): Raw review text.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    # NOTE: the former first step, `text.translate(punctuation)`, was removed.
    # Using the punctuation string itself as a translation table deletes no
    # punctuation; it only remaps control characters (ordinals 0-31, e.g.
    # "\n" -> "+") onto punctuation marks. The regexes below are what actually
    # deal with punctuation, and they need apostrophes intact for contractions.

    ## Convert words to lower case and split them
    words = text.lower().split()

    ## Remove stop words
    words = [w for w in words if w not in stop_words]

    text = " ".join(words)

    ## Clean the text
    # Inside this character class "+-=" is a range ('+' through '='), so ':',
    # ';' and '<' are kept here in addition to the characters listed explicitly.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    # Expand contractions.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Drop separators and isolate the remaining symbols as their own tokens.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # "5k" -> "5000"
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    # "90s" -> "90". The previous pattern r"\0s" matched a NUL character
    # followed by "s" (regex octal escape), so it could never fire; the word
    # boundary keeps the rule from eating the "s" of a following word.
    text = re.sub(r"0s\b", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    # Collapse runs of whitespace left behind by the substitutions above.
    text = re.sub(r"\s{2,}", " ", text)

    ## Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in text.split()]
    return " ".join(stemmed_words)

In [None]:
# Overwrite the raw review text with its normalized form.
review_data['Review Text'] = review_data['Review Text'].map(clean_text)

In [None]:
review_data

Tokenizer是一个用于向量化文本，或将文本转换为序列（即单词在字典中的下标构成的列表，从1算起）的类。

word_index: 字典，将单词（字符串）映射为它们的排名或者索引。仅在调用fit_on_texts之后设置。

texts_to_sequences(texts)

texts：待转为序列的文本列表

返回值：序列的列表，列表中每个序列对应于一段输入文本

pad_sequences 将多个序列截断或补齐为相同长度。

该函数将一个 num_samples 的序列（整数列表）转化为一个 2D Numpy 矩阵，其尺寸为 (num_samples, num_timesteps)。 num_timesteps 要么是给定的 maxlen 参数，要么是最长序列的长度。

比 num_timesteps 短的序列将在末端以 value 值补齐。

比 num_timesteps 长的序列将会被截断以满足所需要的长度。补齐或截断发生的位置分别由参数 padding 和 truncating 决定。

向前补齐为默认操作。

In [None]:
from keras.utils import to_categorical

X = review_data["Review Text"]
y = review_data["Recommended IND"]

# Re-fit the tokenizer, this time on the current contents of 'Review Text'.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
# One more than the number of distinct words (word indices start at 1).
vocabulary_size = 1 + len(tokenizer.word_index)
print(vocabulary_size)

In [None]:
# Encode every review as a list of word indices, then show the longest sequence length.
sequences = tokenizer.texts_to_sequences(review_data['Review Text'])
np.max(list(map(len, sequences)))

In [None]:
# Cap every sequence at 70 tokens: longer ones are truncated, shorter ones are
# padded at the end ('post').
max_length = 70
padded_features = pad_sequences(sequences, padding='post', maxlen=max_length)

In [None]:
plot_LSA(padded_features, y)
plt.show()

In [None]:
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

def plot_roc(n_classes, y_test, y_score, title, class_name_list):
    """Plot per-class, micro-average and macro-average ROC curves.

    Args:
        n_classes (int): Number of classes; columns ``0 .. n_classes - 1`` of
            ``y_test`` and ``y_score`` are used.
        y_test: 2-D indicator (one-hot) matrix of true labels, indexed as
            ``y_test[:, i]`` per class.
        y_score: 2-D matrix of predicted scores with the same layout.
        title (str): Figure title.
        class_name_list: Display name of each class, indexed by class number.
    """
    # Plot linewidth.
    lw = 2

    # Work on the arguments that were passed in. Earlier this function replaced
    # them with the notebook globals `sentiment_test[1]` and `test_score`, so it
    # ignored its inputs and raised NameError when `sentiment_test` did not exist.
    y_test = np.asarray(y_test)
    y_score = np.asarray(y_score)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area (all classes pooled together)
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points.
    # np.interp replaces the removed scipy.interp alias (same function).
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(1)
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(class_name_list[i], roc_auc[i]))

    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# Recommended IND Classification

In [None]:
from sklearn import model_selection

# 80/10/10 split: hold out 20% of the reviews first, then halve that hold-out
# into the test and validation sets.
X_train, X_holdout, y_train, y_holdout = model_selection.train_test_split(
    review_data['Review Text'], review_data['Recommended IND'],
    test_size=0.2, random_state=666)
X_test, X_val, y_test, y_val = model_selection.train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=888)

In [None]:
print(len(X_train))
print(len(X_val))
print(len(X_test))

朴素算法1：0.4 undersampling，0.4 oversampling，0.2 是随机取句子的前半句或者后半句生成新的数据。这样做的理由是，有部分用户会在评论的开头或者结尾强烈表达自己的情感，所以使用这样的方式来做数据增强有利于分类。

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
ros = RandomOverSampler()

In [None]:
# Split the training set 60/40: X_under / y_under (60%) is later passed to the
# random undersampler, X_cat / y_cat (40%) is later used for the half-review augmentation.
X_under, X_cat, y_under, y_cat = model_selection.train_test_split(X_train, y_train, test_size=0.4, random_state=888)

In [None]:
print(X_under.shape,y_under.shape, X_cat.shape, y_cat.shape)

In [None]:
# Encode the undersampling partition with the shared tokenizer, then balance it
# with the RandomUnderSampler instance `rus`.
rus_sequences = tokenizer.texts_to_sequences(X_under)
rus_features = pad_sequences(rus_sequences, maxlen=max_length, padding='post')
# `fit_sample` was a deprecated alias that imbalanced-learn has since removed;
# `fit_resample` is the supported method and returns the same (X, y) pair.
train_X_rus, train_y_rus = rus.fit_resample(rus_features, y_under)

In [None]:
print(train_X_rus.shape, train_y_rus.shape)
train_y_rus.value_counts()

In [None]:
print(X_cat.shape, y_cat.shape)
y_cat.value_counts()

In [None]:
y_cat.value_counts()

In [None]:
# Index labels of the class-0 and class-1 rows of the augmentation partition.
cat_0_idx = y_cat[y_cat == 0].index.tolist()
cat_1_idx = y_cat[y_cat == 1].index.tolist()

In [None]:
# Review texts of each class, selected by index label.
X_cat_0 = X_cat.loc[cat_0_idx]
X_cat_1 = X_cat.loc[cat_1_idx]

In [None]:
count_0 = len(cat_0_idx)

In [None]:
import random
def _random_half(review):
    """Return the first or the second half of ``review`` (split at the middle character), chosen at random."""
    keep_first = random.randint(0, 1) == 0
    midpoint = len(review) // 2
    if keep_first:
        return review[:midpoint]
    return review[midpoint:]


# Class 0: one random half per review, followed by all of the original reviews.
new_X_0 = [_random_half(X_cat_0[label]) for label in cat_0_idx]
new_X_0.extend(list(X_cat_0.values))

# Class 1: halve every review, then sample count_0 halves and count_0 originals
# so that both classes end up with the same number of texts.
new_X_1 = [_random_half(X_cat_1[label]) for label in cat_1_idx]
new_X_1 = random.sample(new_X_1, count_0)
new_X_1.extend(random.sample(list(X_cat_1.values), count_0))

In [None]:
print(len(new_X_0))
print(len(new_X_1))

In [None]:
len(new_X_0 + new_X_1)

In [None]:
# Merge the augmented texts; the labels follow the same order (class 0 first, then class 1).
X_cat = pd.Series(new_X_0 + new_X_1)
class_size = 2 * count_0  # per class: count_0 half-reviews + count_0 original reviews
y_cat = pd.Series([0] * class_size + [1] * class_size)

In [None]:
cat_sequences = tokenizer.texts_to_sequences(X_cat)
cat_features = pad_sequences(cat_sequences, maxlen=max_length, padding='post')

In [None]:
cat_features.shape

In [None]:
cat_features

In [None]:
train_X_rus

In [None]:
features_all = np.concatenate((cat_features, train_X_rus))

In [None]:
features_all.shape

In [None]:
# Stack the labels in the same order as features_all (augmented rows first,
# undersampled rows second). Series.append was removed in pandas 2.0;
# pd.concat produces the same Series, original index labels included.
y_all = pd.concat([y_cat, train_y_rus])

In [None]:
labels_all = to_categorical(y_all)
labels_all[0]
print(labels_all.shape)

In [None]:
plot_LSA(features_all, y_all)
plt.show()

In [None]:
# Encode the validation and test texts with the already-fitted tokenizer.
val_sequences = tokenizer.texts_to_sequences(X_val)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad / truncate to the same length as the training features.
val_features = pad_sequences(val_sequences, padding='post', maxlen=max_length)
test_features = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# One-hot encode the labels.
val_labels = to_categorical(y_val)
test_labels = to_categorical(y_test)

print(val_features.shape, val_labels.shape)
print(test_features.shape, test_labels.shape)

In [None]:
# Baseline model: trainable 100-dimensional embedding followed by two stacked
# bidirectional LSTMs and a 2-unit output layer.
# NOTE(review): the head is two sigmoid units trained with binary cross-entropy
# on one-hot labels; softmax + categorical cross-entropy is the more usual
# pairing for this setup - confirm this is intended.
e = Embedding(vocabulary_size, 100, input_length=max_length, trainable=True)
model = Sequential([
    e,
    Bidirectional(LSTM(128, dropout=0.5, return_sequences=True)),
    Bidirectional(LSTM(256, dropout=0.5)),
    Dense(2, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(
    features_all, labels_all,
    validation_data=(val_features, val_labels),
    epochs=num_epoch, batch_size=256, shuffle=True, verbose=1,
)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(test_features, test_labels, verbose=1)
test_loss, test_acc = score[0], score[1]

print('loss : {}, acc : {}'.format(test_loss, test_acc))

In [None]:
# Per-class scores for the test split and the arg-max class of every row.
test_score = model.predict(test_features)
test_predictions = np.argmax(test_score, axis=1)
test_true = np.argmax(test_labels, axis=1)  # one-hot labels back to class ids

class_names = ['(0) Not recommended class', '(1) Recommended class']
report = classification_report(test_true, test_predictions, target_names=class_names)
matrix = pd.DataFrame(
    confusion_matrix(y_true=test_true, y_pred=test_predictions),
    index=class_names, columns=class_names)
print(matrix)
print(report)
# NOTE(review): micro-averaged F1 on a single-label problem should equal plain accuracy.
f1_score(test_true, test_predictions, average='micro')

In [None]:
# Sequential.predict_proba was removed from Keras; for this model it returned the
# same array as predict(), which is what plot_roc takes as the per-class scores.
skplt.metrics.plot_roc(np.argmax(test_labels, axis=1), model.predict(test_features),
                      title='ROC Curves - hyper')

In [None]:
# Same architecture as the baseline, trained with class 0 weighted 5x relative to class 1.
# NOTE(review): the head is two sigmoid units trained with binary cross-entropy
# on one-hot labels; softmax + categorical cross-entropy is the more usual
# pairing for this setup - confirm this is intended.
class_weight = {0: 5, 1: 1}
e = Embedding(vocabulary_size, 100, input_length=max_length, trainable=True)
model = Sequential([
    e,
    Bidirectional(LSTM(128, dropout=0.5, return_sequences=True)),
    Bidirectional(LSTM(256, dropout=0.5)),
    Dense(2, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(
    features_all, labels_all,
    validation_data=(val_features, val_labels),
    class_weight=class_weight,
    epochs=num_epoch, batch_size=256, shuffle=True, verbose=1,
)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(test_features, test_labels, verbose=1)
test_loss, test_acc = score[0], score[1]

print('loss : {}, acc : {}'.format(test_loss, test_acc))

In [None]:
# Per-class scores for the test split and the arg-max class of every row.
test_score = model.predict(test_features)
test_predictions = np.argmax(test_score, axis=1)
test_true = np.argmax(test_labels, axis=1)  # one-hot labels back to class ids

class_names = ['(0) Not recommended class', '(1) Recommended class']
report = classification_report(test_true, test_predictions, target_names=class_names)
matrix = pd.DataFrame(
    confusion_matrix(y_true=test_true, y_pred=test_predictions),
    index=class_names, columns=class_names)
print(matrix)
print(report)
# NOTE(review): micro-averaged F1 on a single-label problem should equal plain accuracy.
f1_score(test_true, test_predictions, average='micro')

In [None]:
# Sequential.predict_proba was removed from Keras; for this model it returned the
# same array as predict(), which is what plot_roc takes as the per-class scores.
skplt.metrics.plot_roc(np.argmax(test_labels, axis=1), model.predict(test_features),
                      title='ROC Curves - hyper')

虽然好评的 recall 和差评的 precision 都下降了，但好评的 precision 和差评的 recall 都很高，我认为这是合理的：如果要做评论精选，比如把好评放在前面，那么好评 precision 高是合理的，说明给用户看的评论基本都是好评；如果店家想做舆情分析，也就是想看自己店铺的差评，那么差评 recall 高是合理的，说明店铺的差评基本都能被找出来给店家看。
后续可以尝试多训练几轮，并调整三种 resample 方式的比重，效果如何还有待验证。

In [None]:
# Same architecture as the baseline, trained with class 0 weighted 10x relative to class 1.
# NOTE(review): the head is two sigmoid units trained with binary cross-entropy
# on one-hot labels; softmax + categorical cross-entropy is the more usual
# pairing for this setup - confirm this is intended.
class_weight = {0: 10, 1: 1}
e = Embedding(vocabulary_size, 100, input_length=max_length, trainable=True)
model = Sequential([
    e,
    Bidirectional(LSTM(128, dropout=0.5, return_sequences=True)),
    Bidirectional(LSTM(256, dropout=0.5)),
    Dense(2, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(
    features_all, labels_all,
    validation_data=(val_features, val_labels),
    class_weight=class_weight,
    epochs=num_epoch, batch_size=256, shuffle=True, verbose=1,
)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(test_features, test_labels, verbose=1)
test_loss, test_acc = score[0], score[1]

print('loss : {}, acc : {}'.format(test_loss, test_acc))

In [None]:
# Per-class scores for the test split and the arg-max class of every row.
test_score = model.predict(test_features)
test_predictions = np.argmax(test_score, axis=1)
test_true = np.argmax(test_labels, axis=1)  # one-hot labels back to class ids

class_names = ['(0) Not recommended class', '(1) Recommended class']
report = classification_report(test_true, test_predictions, target_names=class_names)
matrix = pd.DataFrame(
    confusion_matrix(y_true=test_true, y_pred=test_predictions),
    index=class_names, columns=class_names)
print(matrix)
print(report)
# NOTE(review): micro-averaged F1 on a single-label problem should equal plain accuracy.
f1_score(test_true, test_predictions, average='micro')

In [None]:
# Sequential.predict_proba was removed from Keras; for this model it returned the
# same array as predict(), which is what plot_roc takes as the per-class scores.
skplt.metrics.plot_roc(np.argmax(test_labels, axis=1), model.predict(test_features),
                      title='ROC Curves - hyper')

# Reference
1. https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
2. https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer
3. https://machinelearningmastery.com/develop-bidirectional-lstm-sequence-classification-python-keras/