In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

import seaborn as sns

import gensim 
from gensim.models import Word2Vec 

# English stop words, held in a set so the per-token membership test in
# column_text_to_sentence_array is a cheap lookup.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded
# beforehand — confirm for a fresh environment.
stopwords_en = set(stopwords.words('english'))
# Matches anything shaped like a markup tag: "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r'<[^>]+>')

def column_text_to_sentence_array(df, column, tokenize=None, stop_words=None):
    """Tokenize a text column into lowercase, stop-word-free token lists.

    Args:
        df: Mapping-like object (typically a pandas DataFrame) such that
            ``df[column]`` yields one text string per row.
        column: Name of the column holding the text.
        tokenize: Optional callable turning one string into an iterable of
            tokens. Defaults to the module-level ``word_tokenize``.
        stop_words: Optional container of lowercase words to drop. Defaults
            to the module-level ``stopwords_en``.

    Returns:
        A list with one entry per row; each entry is the list of lowercased
        tokens of that row that are not stop words.

    Side effects:
        Prints the total number of kept tokens and the number of distinct
        kept tokens.
    """
    if tokenize is None:
        tokenize = word_tokenize
    if stop_words is None:
        stop_words = stopwords_en

    sentence_array = []
    word_num = 0
    word_set = set()
    for line in df[column]:
        # Lowercase BEFORE the stop-word test: the stop-word list is lowercase,
        # so testing the raw token would let "The", "This", ... slip through
        # and end up in the output as "the", "this", ...
        tokens = [
            token
            for token in (raw.lower() for raw in tokenize(line))
            if token not in stop_words
        ]
        word_num += len(tokens)
        # Count distinct words on the same lowercased form that is returned,
        # so the printed figure matches the vocabulary seen downstream.
        word_set.update(tokens)
        sentence_array.append(tokens)

    print("total word : {}".format(word_num))
    print("uniq word : {}".format(len(word_set)))
    return sentence_array

def remove_tags(text):
    """Return ``text`` with every match of ``TAG_RE`` (markup-like tags) deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped

def preprocess_text(sen):
    """Normalize one raw review string into letters separated by single spaces.

    Steps, in order: strip markup tags, replace every non-ASCII-letter
    character with a space, drop stand-alone single letters, collapse runs
    of whitespace into one space.

    Args:
        sen: The raw text.

    Returns:
        The cleaned text. Case is preserved; a single leading or trailing
        space may remain where the input started or ended with removed
        characters.
    """
    # Removing html tags
    sentence = remove_tags(sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal. A word-boundary match does not consume the
    # surrounding spaces, so consecutive single letters ("x a b y") and single
    # letters at the very start/end of the text are all removed; the previous
    # whitespace-delimited pattern skipped every second one and the edges.
    sentence = re.sub(r"\b[a-zA-Z]\b", ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence


In [None]:
# Load the review dataset; the path is relative to the notebook's working directory.
movie_reviews = pd.read_csv("./input_data/IMDB_Dataset.csv")
# Print the frame summary (columns, dtypes, non-null counts).
movie_reviews.info()
# Bar chart of the number of rows per value of the 'sentiment' column.
sns.countplot(x='sentiment', data=movie_reviews)

In [None]:
# Peek at the first rows of the data as loaded, before any cleaning.
movie_reviews.head()

In [None]:
# Clean every review in place (tags, non-letters and stray single letters removed).
movie_reviews["review"] = movie_reviews["review"].map(preprocess_text)
# Encode the label as 1 for "positive" and 0 for anything else. The already-encoded
# value 1 is accepted as well, so re-running this cell keeps the labels intact
# instead of silently resetting every row to 0.
movie_reviews["sentiment"] = movie_reviews["sentiment"].map(
    lambda label: 1 if label in (1, "positive") else 0
)

movie_reviews.head()

In [None]:
# Slow step: every review is tokenized and filtered against the stop-word set.
# `data` becomes a list of token lists, one per review, used to train Word2Vec below.
data = column_text_to_sentence_array(movie_reviews, "review")

# Marker so it is visible in the output when the long-running call has returned.
print("finish")


In [None]:
# Slow step: fit Word2Vec embeddings on the tokenized reviews.
# min_count=1 keeps every token, however rare, in the vocabulary.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0; later
# releases name it `vector_size` — confirm the installed version before changing.
# NOTE(review): the name is presumably meant to read "CBOW" — left as-is because
# later cells refer to `model_COBW`.
model_COBW = Word2Vec(
    data,
    size=100,
    window=5,
    min_count=1,
)


In [None]:
# Features are the cleaned review strings; targets are the encoded sentiment labels.
X, Y = movie_reviews["review"], movie_reviews["sentiment"]
# Both shapes, one per line, as a quick check that they line up.
print(X.shape, Y.shape, sep="\n")

In [None]:
# Keep a handle on the trained word vectors of the Word2Vec model.
word_vector = model_COBW.wv
# It is indexed by word, like a word -> vector dict; show the vector for "man".
word_vector["man"]

In [None]:
# Number of entries in the Word2Vec vocabulary.
# NOTE(review): `index2word` looks like the pre-4.0 gensim attribute name — confirm
# against the installed gensim version.
print(len(word_vector.index2word))

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# At this point X_train is still a pandas Series whose index labels come from the
# original frame and were shuffled by the split. `X_train[0]` would look up the row
# *labelled* 0 (raising KeyError if that row went to the test set), so use positional
# access to show the first two training reviews and their character lengths.
print(X_train.iloc[0])
print(X_train.iloc[1])
print(len(X_train.iloc[0]))
print(len(X_train.iloc[1]))

In [None]:
# Build the word index from the training reviews only; num_words=5000 caps the
# vocabulary the tokenizer will use when converting texts.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# NOTE: X_train / X_test are rebound here from text to integer sequences, so this
# cell must run exactly once after the split cell — re-run the split first if needed.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [None]:
# First two training samples after tokenization, one per line ...
print(X_train[0], X_train[1], sep="\n")
# ... followed by their lengths, one per line.
print(len(X_train[0]), len(X_train[1]), sep="\n")

In [None]:
# Fixed length every sequence is padded / cut to.
maxlen = 100

# One more than the number of words the tokenizer indexed.
# NOTE(review): presumably the +1 reserves index 0 for padding — confirm against
# how vocab_size is consumed later.
vocab_size = len(tokenizer.word_index) + 1

# Pad at the end ('post') for both splits with identical settings.
X_train, X_test = (
    pad_sequences(split, padding='post', maxlen=maxlen)
    for split in (X_train, X_test)
)

# Lengths of the first two padded training rows, one per line.
print(len(X_train[0]), len(X_train[1]), sep="\n")
