In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

from IPython import display

In [2]:
# Directory holding the two polarity files (one review per line in each).
path = "rt-polaritydata/rt-polaritydata/"

# Build both file paths in one pass: positive reviews first, then negative.
pos_path, neg_path = (
    os.path.join(path, file_name)
    for file_name in ('rt-polarity.pos', 'rt-polarity.neg')
)

def load_review(path, is_pos=True, encoding=None):
    """Load a one-review-per-line text file into a labelled DataFrame.

    Args:
        path: Path of the text file to read.
        is_pos: When true every row is labelled 1, otherwise 0.
        encoding: Text encoding handed to `open`. The default (None) keeps
            the platform-dependent default encoding, exactly as before; pass
            an explicit value (e.g. 'latin-1') when the file is not in the
            platform encoding so that loading is reproducible across machines.

    Returns:
        A DataFrame with a 'review' column (one line of the file per row)
        and a constant integer 'sentiment' column.
    """
    with open(path, encoding=encoding) as f:
        lines = f.read().splitlines()
    review = pd.DataFrame({'review': lines})
    review['sentiment'] = 1 if is_pos else 0
    return review

pos_review = load_review(pos_path, is_pos=True)
neg_review = load_review(neg_path, is_pos=False)

# Each frame is indexed 0..n-1 on its own; ignore_index=True renumbers the
# combined frame so that every review has a unique row label (otherwise a
# label such as 0 would refer to both a positive and a negative review).
all_reviews = pd.concat([pos_review, neg_review], ignore_index=True)
all_reviews.head()

Unnamed: 0,review,sentiment
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [3]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors from the binary word2vec file.
# This reads the whole file from disk, so the cell can take a while to run.
word2vec_model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin",
    binary=True,
)

In [4]:
from stop_words import get_stop_words

MAX_WORDS = 25        # maximum number of words kept per review
EMBEDDING_SIZE = 300  # length of one word vector (assumed to match word2vec_model)

def process_review_dense(review, max_words=MAX_WORDS):
    """Embed a review as the sum of its word vectors.

    The text is tokenized with Keras' `text_to_word_sequence`, English stop
    words are removed, and words missing from `word2vec_model` are skipped
    (so they do not count towards `max_words`).

    Args:
        review: Raw review text.
        max_words: Maximum number of in-vocabulary words that are summed.

    Returns:
        A 1-D array: the element-wise sum of the kept word vectors. When no
        word of the review is found in the model, a zero vector of length
        EMBEDDING_SIZE is returned instead of the scalar 0.0 that summing an
        empty array would give, so every review yields a vector.
    """
    # Look the stop-word list up once per review instead of once per word,
    # and use a set for constant-time membership tests.
    stop_words = set(get_stop_words('english'))
    words = tf.keras.preprocessing.text.text_to_word_sequence(review)

    vectors = []
    for word in words:
        if word in stop_words:
            continue
        try:
            vectors.append(word2vec_model[word])
        except KeyError:
            pass  # word not found in the model
    vectors = vectors[:max_words]

    if not vectors:
        return np.zeros(EMBEDDING_SIZE)
    return np.sum(np.asarray(vectors), axis=0)

In [5]:
def process_review_conv(review, max_words=MAX_WORDS):
    """Embed a review as a fixed-length sequence of word vectors.

    The text is tokenized with Keras' `text_to_word_sequence` and English
    stop words are removed. Each remaining word contributes one row: its
    vector from `word2vec_model`, or a zero vector of length EMBEDDING_SIZE
    when the word is not in the model. The sequence is cut or zero-padded to
    `max_words` rows.

    Args:
        review: Raw review text.
        max_words: Number of rows in the returned sequence.

    Returns:
        A list of `max_words` 1-D arrays, in word order.
    """
    # Look the stop-word list up once per review instead of once per word,
    # and use a set for constant-time membership tests.
    stop_words = set(get_stop_words('english'))
    words = [
        word
        for word in tf.keras.preprocessing.text.text_to_word_sequence(review)
        if word not in stop_words
    ]

    # Every word yields exactly one row, so truncating the words up front is
    # equivalent to truncating the rows afterwards and avoids embedding
    # lookups whose result would be thrown away.
    rows = []
    for word in words[:max_words]:
        try:
            rows.append(word2vec_model[word])
        except KeyError:
            rows.append(np.zeros(EMBEDDING_SIZE,))  # word does not exist in the vocab

    # Pad short reviews with zero rows up to max_words.
    rows.extend(np.zeros(EMBEDDING_SIZE,) for _ in range(max_words - len(rows)))
    return rows

In [6]:
# Run both preprocessing variants over every review text. The functions are
# passed to `apply` directly; their `max_words` argument keeps its default.
review_texts = all_reviews.review
processed_review_conv = review_texts.apply(process_review_conv)
processed_review_dense = review_texts.apply(process_review_dense)

In [7]:
from sklearn.model_selection import train_test_split

# Dense input: copy the per-review vectors into one 2-D float matrix with one
# row per review.
X_dense = np.asarray(processed_review_dense).flatten()
X_dense_reshaped = np.zeros([len(X_dense), X_dense[0].shape[0]])
for i, review_vector in enumerate(X_dense):
    X_dense_reshaped[i, :] = review_vector

# Convolutional input: (n_reviews, max_words, embedding_size) after conversion.
X_conv = tf.convert_to_tensor(processed_review_conv.to_numpy().tolist())
# The model expects (embedding_size, max_words, 1) per review. Swapping the two
# axes needs a transpose: a reshape to the same target shape only reinterprets
# the flat buffer and would spread each word vector across several rows.
# The trailing axis of size 1 is the single channel expected by Conv2D.
X_conv_reshaped = tf.expand_dims(tf.transpose(X_conv, perm=[0, 2, 1]), axis=-1)

y = all_reviews.sentiment.values

In [8]:
# Sanity check: shapes of the dense input, the convolutional input and the labels.
tuple(array.shape for array in (X_dense_reshaped, X_conv_reshaped, y))

((10662, 300), TensorShape([10662, 300, 25, 1]), (10662,))

In [73]:
# Convert the dense features and the labels from numpy arrays to tensors.
X_dense_reshaped_tf, y_tf = (
    tf.convert_to_tensor(array) for array in (X_dense_reshaped, y)
)

In [74]:
class ConvDenseTextModel(tf.keras.Model):
    """Two-branch binary classifier over two views of the same review.

    The convolutional branch takes an image-like tensor of shape
    (batch, embedding_size, max_words, 1); the dense branch takes a vector of
    shape (batch, embedding_size). Each branch ends in a 10-unit layer; the two
    outputs are joined feature-wise and fed to a single sigmoid unit.
    """

    def __init__(self, embedding_size=EMBEDDING_SIZE, max_words=MAX_WORDS):
        """Create the layers of both branches and of the shared head.

        Args:
            embedding_size: Length of one word vector.
            max_words: Number of word positions in the convolutional input.
        """
        super(ConvDenseTextModel, self).__init__()
        self.concat_size = 128  # not read by the model; kept for backward compatibility

        # convolutional part of the model
        self.conv1 = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(embedding_size, max_words, 1))
        self.pool1 = tf.keras.layers.MaxPool2D((2, 2))
        self.conv2 = tf.keras.layers.Conv2D(64, (3, 3), activation='relu')
        self.pool2 = tf.keras.layers.MaxPool2D((2, 2))
        # Created once here instead of instantiating a new layer on every call.
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(512, activation='relu')

        # dense part of the model
        self.dense2 = tf.keras.layers.Dense(256, activation='relu', input_shape=(embedding_size,))
        self.dense3 = tf.keras.layers.Dense(128, activation='relu')
        self.dense4 = tf.keras.layers.Dense(64, activation='relu')

        # per-branch projection layers and the shared output head
        self.concat_layer1 = tf.keras.layers.Dense(10, activation='relu')
        self.concat_layer2 = tf.keras.layers.Dense(10, activation='relu')
        self.dense5 = tf.keras.layers.Dense(1, activation='sigmoid')
        self.dropout = tf.keras.layers.Dropout(0.4)

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Pair `(X_conv, X_dense)` with the convolutional and the
                dense input for the same batch of reviews.
            training: Whether dropout is active.

        Returns:
            Tensor of shape (batch, 1) with the sigmoid output per review.
        """
        X_conv, X_dense = inputs

        # Convolutional branch. Dropout receives the training flag directly,
        # so it is an identity outside of training.
        X_conv = self.conv1(X_conv)
        X_conv = self.pool1(X_conv)
        X_conv = self.conv2(X_conv)
        X_conv = self.pool2(X_conv)
        X_conv = self.flatten(X_conv)
        X_conv = self.dropout(X_conv, training=training)
        X_conv = self.dense1(X_conv)
        X_conv = self.concat_layer1(X_conv)

        # Dense branch.
        X_dense = self.dense2(X_dense)
        X_dense = self.dropout(X_dense, training=training)
        X_dense = self.dense3(X_dense)
        X_dense = self.dropout(X_dense, training=training)
        X_dense = self.dense4(X_dense)
        X_dense = self.concat_layer2(X_dense)

        # Join the branches along the feature axis: (batch, 10) + (batch, 10)
        # -> (batch, 20). Axis 0 would stack them along the batch axis and
        # produce 2 * batch predictions for batch labels.
        X = tf.concat([X_conv, X_dense], axis=1)
        return self.dense5(X)

In [75]:
# Instantiate the two-branch model with the notebook-wide sizes spelled out.
model = ConvDenseTextModel(embedding_size=EMBEDDING_SIZE, max_words=MAX_WORDS)

In [76]:
# Binary classification set-up: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [81]:
# A subclassed model creates its weights on the first call, and `summary()`
# raises ValueError on a model that has not been built. Push one sample
# through the network first so this cell also works on a fresh kernel,
# before `fit` has run.
model((X_conv_reshaped[:1], X_dense_reshaped_tf[:1]))
model.summary()

Model: "conv_dense_text_model_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_26 (Conv2D)           multiple                  320       
_________________________________________________________________
max_pooling2d_26 (MaxPooling multiple                  0         
_________________________________________________________________
conv2d_27 (Conv2D)           multiple                  18496     
_________________________________________________________________
max_pooling2d_27 (MaxPooling multiple                  0         
_________________________________________________________________
dense_84 (Dense)             multiple                  9568768   
_________________________________________________________________
dense_85 (Dense)             multiple                  77056     
_________________________________________________________________
dense_86 (Dense)             multiple     

# TODO: custom training loop
Following the pix2pix tutorial (https://www.tensorflow.org/tutorials/generative/pix2pix), rewrite this model as a `tf.keras.Model` trained with a custom loss and a custom training loop.

In [None]:
EPOCHS = 10
BATCH_SIZE = 128

# Train on both views of every review. BATCH_SIZE is passed explicitly;
# without it `fit` silently falls back to its default batch size and the
# constant above has no effect.
model.fit((X_conv_reshaped, X_dense_reshaped_tf),
          y_tf,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS)