In [None]:
# 是用预训练的word vector来构造DN

import pandas as pd
import numpy as np
from numpy import zeros
from numpy import array
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import seaborn as sns
from sklearn.model_selection import train_test_split

import gensim 
from gensim.models import Word2Vec 

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.python.keras import backend as k
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Activation, Dropout, Dense, LSTM


# Stop-word filtering is effectively switched off: the NLTK English list is
# commented out and an empty set is used instead, so no token is ever dropped.
#stopwords_en = set(stopwords.words('english'))
stopwords_en = set()
# Matches anything shaped like a markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')

def column_text_to_sentence_array(df, column):
    """Tokenize every entry of a text column into a list of lower-cased words.

    Each entry of ``df[column]`` is split with ``word_tokenize``; tokens are
    lower-cased and tokens present in the module-level ``stopwords_en`` set are
    dropped. The total and unique counts of the kept tokens are printed.

    Args:
        df: object indexable by ``column`` that yields an iterable of strings
            (the notebook passes a pandas DataFrame).
        column: name of the text column to tokenize.

    Returns:
        A list with one inner list of lower-cased tokens per entry, in the
        same order as ``df[column]``.
    """
    sentence_array = []
    word_num = 0
    word_set = set()
    for line in df[column]:
        # Lower-case BEFORE filtering and counting so that the stop-word test
        # and the unique-word count see exactly the tokens that are returned
        # (previously "The" and "the" were counted as two words and a
        # capitalised stop word would have slipped past a lower-case list).
        tokens = [word.lower() for word in word_tokenize(line)]
        kept = [word for word in tokens if word not in stopwords_en]
        word_num += len(kept)
        word_set.update(kept)
        sentence_array.append(kept)
    print("total word : {}".format(word_num))
    print("uniq word : {}".format(len(word_set)))
    return sentence_array

def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    return re.sub(TAG_RE, '', text)

def preprocess_text(sen):
    """Clean one raw review string.

    Steps, in order: strip markup tags via ``remove_tags``, replace every
    non-ASCII-letter character with a space, drop single letters that stand
    between whitespace, and collapse whitespace runs to one space.

    Args:
        sen: the raw text.

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped, and a
        single letter at the very start or end of the text is kept.
    """
    # Removing html tags
    sentence = remove_tags(sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal. The look-ahead leaves the whitespace after the
    # letter unconsumed so it can act as the leading whitespace of the next
    # match; the earlier pattern r"\s+[a-zA-Z]\s+" consumed it and therefore
    # skipped every second letter in runs such as " a b c ".
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence


In [None]:
# Load the review dataset; the path is relative to the notebook's working directory.
# Later cells read the "review" and "sentiment" columns of this frame.
movie_reviews = pd.read_csv("./input_data/IMDB_Dataset.csv")
movie_reviews.info()
# Optional class-balance plot, left disabled:
# sns.countplot(x='sentiment', data=movie_reviews)

In [None]:
# Step 1: clean the review text and encode sentiment as 0/1.
movie_reviews["review"] = movie_reviews["review"].map(preprocess_text)
# "positive" -> 1, anything else -> 0. Labels that are already 0/1 pass through
# unchanged, so re-running this cell no longer collapses every label to 0
# (an already-encoded 1 is not equal to "positive" and used to be mapped to 0).
movie_reviews["sentiment"] = movie_reviews["sentiment"].map(
    lambda x: x if x in (0, 1) else (1 if x == "positive" else 0)
)

movie_reviews.head()

In [None]:
# long time
# Step 2: train word vectors with gensim
data = column_text_to_sentence_array(movie_reviews, "review")
# min_count = 1 keeps every token; the vector size (100) must match the
# embedding width used when the embedding matrix and Embedding layers are built.
# NOTE(review): "COBW" looks like a typo for "CBOW" (presumably the default
# Word2Vec training mode, since `sg` is not set) - confirm before renaming.
# NOTE(review): `size=` and `index2word` are the gensim 3.x spellings; gensim 4
# is understood to use `vector_size=` and `index_to_key` - confirm the pinned version.
model_COBW = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 
print("finish")

# Keep only the trained vectors for later lookups.
word_vector = model_COBW.wv
# like word -> vector dict
# word_vector["man"]
# Number of words that received a vector.
print(len(word_vector.index2word))

In [None]:
# Features: the cleaned review text. Labels: the 0/1 encoded sentiment.
X = movie_reviews["review"]
Y = movie_reviews["sentiment"]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# Peek at the first two training reviews and their lengths in characters.
# X_train is still a pandas Series here and keeps the original row labels after
# the shuffled split, so X_train[0] would look up *label* 0 - which may sit in
# the test split (KeyError) or not be the first training row. `.iloc` selects
# by position, matching the positional indexing used once X_train becomes a list.
print(X_train.iloc[0])
print(X_train.iloc[1])
print(len(X_train.iloc[0]))
print(len(X_train.iloc[1]))

In [None]:
# Step 3: convert the word sequences into integer sequences.
# The tokenizer is fitted on the training text only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
# Open question from the author: does the tokenizer keep only the 5000 most frequent mappings?
# NOTE(review): presumably word_index still holds every word seen and num_words
# only limits which ids texts_to_sequences emits - confirm against the Keras docs.
print(len(tokenizer.word_index))
# From here on X_train / X_test are lists of integer-id lists, no longer text.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [None]:
# Peek at the first two encoded training reviews and their lengths in tokens.
print(X_train[0])
print(X_train[1])
print(len(X_train[0]))
print(len(X_train[1]))

In [None]:
# Sequences must all have the same length before they are fed to a sequence model.
maxlen = 100
# padding='post': shorter sequences are padded at the end.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Sanity check: both rows should now report length maxlen.
print(len(X_train[0]))
print(len(X_train[1]))


In [None]:
# Build the int -> word-vector lookup table.
# ["I" "am" "a" "super" "star"] -> [1, 3, 4, 5, 7] -> [[0.1, 0.4], [0.21, 0.233] ...]
# One row per tokenizer id, plus one because index 0 is never assigned by word_index.
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, 100))
# tokenizer.word_index holds the word -> id mapping, e.g.
# ["I" "am" "a" "super" "star"] -> [1, 3, 4, 5, 7]
for token, row in tokenizer.word_index.items():
    try:
        embedding_matrix[row] = word_vector[token]
    except KeyError:
        # No trained vector for this token: leave its row as zeros.
        continue


In [None]:
# Baseline classifier: frozen pre-trained embeddings -> flatten -> one sigmoid unit.
# All the Embedding layer does is map each integer input to the vector found at
# that index of the embedding matrix, i.e. the sequence [1, 2] is converted to
# [embeddings[1], embeddings[2]].
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)
model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())


In [None]:
# Check the container type of the labels; the training cell converts pandas Series to NumPy arrays.
print(type(Y_train))
print(type(Y_test))

In [None]:
import matplotlib.pyplot as plt

# Convert the labels to NumPy arrays; the isinstance guards make re-running this cell harmless.
if isinstance(Y_train, pd.core.series.Series):
    Y_train = Y_train.to_numpy()
if isinstance(Y_test, pd.core.series.Series):
    Y_test = Y_test.to_numpy()
# Presumably the Embedding layer performs the lookup automatically here:
# X_train[0] = [1, 3, 4, 5, 7] -> [word_vector[1], word_vector[3], ...]
# validation_split=0.2 holds out part of the training data for the validation curves.
history = model.fit(X_train, Y_train, batch_size=128, epochs=20, verbose=1, validation_split=0.2)

def epoch_performance(history):
    """Plot per-epoch training and validation curves, accuracy first, then loss.

    Args:
        history: object whose ``history`` attribute is a mapping containing
            the keys 'acc', 'val_acc', 'loss' and 'val_loss' (the notebook
            passes the value returned by ``model.fit``).
    """
    curves = history.history
    # (train key, validation key, figure title, y-axis label) for each figure.
    panels = (
        ('acc', 'val_acc', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for train_key, val_key, title, y_label in panels:
        plt.plot(curves[train_key])
        plt.plot(curves[val_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')
        plt.show()

# Plot the training curves of the dense baseline.
epoch_performance(history)
    
# Evaluate on the held-out test split. `score` is presumably the loss value,
# since the model was compiled with a single extra metric ('acc').
score, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("Test Score:", score)
print("Test Accuracy:", accuracy)


In [None]:
from tensorflow.keras.layers import LSTM

# Second model: the same frozen pre-trained embeddings, followed by a
# 128-unit LSTM and a single sigmoid output unit.
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)
model2 = Sequential([
    embedding_layer,
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model2.summary())

# Train for 3 epochs and plot the curves.
history2 = model2.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    verbose=1,
    validation_split=0.2,
)
epoch_performance(history2)

# Evaluate on the held-out test split.
score2, accuracy2 = model2.evaluate(X_test, Y_test, verbose=0)
print("Test Score:", score2)
print("Test Accuracy:", accuracy2)


In [None]:
def predict_single(instance, model):
    """Print ``model``'s prediction for one review.

    The text is encoded with the module-level ``tokenizer``, padded to the
    module-level ``maxlen`` and passed to ``model.predict`` as a batch of one.

    Args:
        instance: the review text as a single string, or an iterable of text
            fragments whose token ids are concatenated into one sequence.
        model: object exposing ``predict`` (the notebook passes the trained
            Keras models).
    """
    # texts_to_sequences iterates over its argument, so a bare string would be
    # treated as a list of one-character "texts" and the model would be fed
    # single-letter tokens. Wrap it so the review is tokenized word by word.
    texts = [instance] if isinstance(instance, str) else list(instance)
    sequences = tokenizer.texts_to_sequences(texts)
    # Concatenate the per-text id lists into the single sequence to score.
    flat_list = [item for sublist in sequences for item in sublist]
    padded = pad_sequences([flat_list], padding='post', maxlen=maxlen)
    print(model.predict(padded))

In [None]:
# Score one review (row label 57 of the cleaned text) with the LSTM model.
instance = X[57]
predict_single(instance, model2)