In [None]:
# number of most frequent lexemes kept in the vocabulary
# (word indices start at 1, so the embedding matrix gets VOCABULARY_SIZE + 1 rows)
VOCABULARY_SIZE = 4095
# fraction of the training reviews held out as the validation set
VALIDATION_SPLIT = 0.25

In [None]:
# basic imports
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras import layers

from sklearn.model_selection import train_test_split

In [None]:
# spaCy - an NLP library
# download the pretrained English model (with embeddings) and load it

# if GPU is on, Internet connection is needed in Kaggle Kernels!
!python -m spacy download en_core_web_lg

import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
### plotting ###
# larger default figure and title size for every chart
sns.set(rc={'figure.figsize': (18, 8), 'axes.titlesize': 14})

### pandas ###
# three decimal places are plenty and make tables fit horizontally
# (setting display.precision to 3 would be the alternative)
pd.set_option('display.float_format', '{:.3f}'.format)
# keep tables compact vertically, too
pd.set_option('display.max_rows', 20)

### numpy ###
# same as for pandas - at most three decimal places
# (np.set_printoptions(precision=3) would be the alternative)
np.set_printoptions(formatter={'float_kind': '{:.3f}'.format})

In [None]:
# show which files are available in the input directory
print(os.listdir("../input"))

In [None]:
# training reviews; later cells use the `review`, `rating` and `usefulCount` columns
reviews_df = pd.read_csv('../input/drugsComTrain_raw.csv')
# bare expression so the notebook renders the frame
reviews_df

In [None]:
# test reviews; only their `review` column is touched (cleaned) in the cells shown here
reviews_test_df = pd.read_csv('../input/drugsComTest_raw.csv')
# bare expression so the notebook renders the frame
reviews_test_df

The reviews are quite raw (containing things that are unhealthy for tokenizers, such as `&#039;` *- the HTML code for an apostrophe*),
so I will quickly cook them a bit before progressing further...

In [None]:
## sanitizing review text

from html import unescape

def clean_review(text):
    """Return `text` with outer quotes/spaces stripped and HTML entities decoded.

    Spaces, double quotes and apostrophes are stripped from both ends first,
    then HTML escapes are decoded. Afterwards carriage returns and newlines
    are each turned into a space, and a byte-order mark directly followed by
    the digit 1 is removed.
    """
    cleaned = unescape(text.strip(' "\''))
    # applied in this exact order, one substitution after the other
    for old, new in (('\r', ' '), ('\n', ' '), ('\ufeff1', '')):
        cleaned = cleaned.replace(old, new)
    return cleaned

# sanitize the review text of both the training and the test frame
for frame in (reviews_df, reviews_test_df):
    frame['review'] = frame['review'].apply(clean_review)

Today we have a soup made mostly with the following ingredients:

In [None]:
from wordcloud import WordCloud

# Word cloud over all cleaned training reviews.
# The seed is a fixed integer so the layout is reproducible: a str hash such
# as hash('yummy') is salted per interpreter process and therefore changes
# after every kernel restart.
wordcloud = WordCloud(
    width=1000,
    height=500,
    random_state=42,
    max_font_size=110
)
wordcloud.generate('\n'.join(reviews_df.review))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# the cloud is only needed for this picture
del wordcloud

In [None]:
## lemmatizing review text

def text2lexemes(text):
    """Tokenize `text` and map every token to a lower-cased lexeme.

    Per token, the first of these that applies is used:
    1. the lower-cased lemma, if the vocabulary has a vector for it,
    2. the lower-cased normalized form, if the vocabulary has a vector for it,
    3. the lower-cased token text.

    Returns a list of strings, one per token.
    """
    doc = nlp(
        # surround "/" with spaces so it is split off as well
        text.replace('/', ' / '),
        # only the tokenizer and lemmas are needed, so disable the rest
        disable=['tagger', 'parser', 'ner']
    )

    def pick_lexeme(token):
        # prefer forms that have a pretrained vector
        for candidate in (token.lemma_.lower(), token.norm_.lower()):
            if nlp.vocab[candidate].has_vector:
                return candidate
        return token.lower_

    return [pick_lexeme(token) for token in doc]

# one list of lexemes per training review
reviews_word_seq = list(map(text2lexemes, reviews_df.review))

In [None]:
# vocabulary map
# count the occurrences of each lexeme over all training reviews
word_count = {}
for lemmas in reviews_word_seq:
    for lemma in lemmas:
        if lemma in word_count:
            word_count[lemma] += 1
        else:
            word_count[lemma] = 1

# re-order by descending count; the sort is stable, so equally frequent
# lexemes keep their first-seen order
word_count = {
    word: count
    for word, count in sorted(word_count.items(), key=lambda item: -item[1])
}

In [None]:
# preview only the 20 most frequent lexemes instead of dumping the whole dict
pd.Series(word_count, name='count').head(20)

In [None]:
## word sequences to integer sequences

# keep the VOCABULARY_SIZE most frequent lexemes (word_count is sorted by
# descending count); indices start at 1, so 0 is never assigned to a word
vocab = list(word_count)[:VOCABULARY_SIZE]
word2index = dict(zip(vocab, range(1, len(vocab) + 1)))

# encode every review, dismissing out-of-vocabulary words
reviews_seq = [
    [word2index[word] for word in review if word in word2index]
    for review in reviews_word_seq
]

In [None]:
# counts per lexeme, most frequent first
wc_vals = list(word_count.values())

plt.title('Vocabulary Distribution')

plt.plot(wc_vals, 'o', label='Lexeme count')
# occurrences left uncovered when the vocabulary is cut after each rank
plt.plot(sum(wc_vals) - np.cumsum(wc_vals), '.', label='Uncaptured words')
plt.axvline(
    x=VOCABULARY_SIZE,
    color='yellowgreen',
    label=f'Current vocabulary size ({VOCABULARY_SIZE})',
)

# counts span several orders of magnitude
plt.yscale('log')

plt.xlabel('Vocabulary Size')
plt.ylabel('Amount')
plt.legend()

plt.show()

In [None]:
# number of in-vocabulary lexemes per review
review_lengths = np.array(list(map(len, reviews_seq)))

# sequences get cut at the 99 % quantile of the review lengths
sequence_cutoff_legth = int(np.quantile(review_lengths, 0.99))

plt.title('Distribution of Review Lengths')

sns.distplot(
    review_lengths,
    kde=True,
    rug=True,
    norm_hist=False,
    hist_kws={'label': 'Normalized histogram'},
    kde_kws={'label': 'Kernel density'},
    rug_kws={'color': 'orangered', 'label': 'Review'},
    axlabel='Sequence Length',
)
plt.axvline(
    x=sequence_cutoff_legth,
    color='yellowgreen',
    label=f'Sequence cutoff length ({sequence_cutoff_legth})',
)

plt.xlabel('Review Length')
plt.ylabel('Density')
plt.legend();

In [None]:
## pad the sequences

# Bring every encoded review to a common length of `sequence_cutoff_legth`.
# NOTE(review): padding/truncating side and fill value are left at the Keras
# defaults (presumably "pre" and 0, which would match the unused index 0) - confirm.
reviews_seq = keras.preprocessing.sequence.pad_sequences(
    reviews_seq,
    maxlen=sequence_cutoff_legth
)

In [None]:
## scale values
# NOTE: this cell modifies reviews_df in place - run it only once per kernel,
# otherwise the values get scaled repeatedly.

# range will be 0.1 - 1.0
# very easy interpretation for the MAE metrics
reviews_df.rating /= 10

# standard scaling: (x - mean) / std
# the mean has to be subtracted to centre the feature around zero
reviews_df.usefulCount -= reviews_df.usefulCount.mean()
reviews_df.usefulCount /= reviews_df.usefulCount.std()

In [None]:
# plotting

def plot_history(history, skip_first_n_epochs=0):
    """Show information about the training.

    One figure is drawn per training metric, together with its validation
    counterpart when there is one.

    Parameters
    ----------
    history : dict
        Maps a metric name to its per-epoch values. Validation values are
        looked up under 'val_' + metric; a metric without such an entry is
        plotted on its own.
    skip_first_n_epochs : int, default 0
        Number of leading epochs left out of the curves and of the min/max
        statistics in the title.
    """

    # plot every train-valid metric pair separately
    for metric in history:
        if metric.startswith('val_'):
            continue

        valid_key = 'val_' + metric

        # 1-based epoch numbers, sliced exactly like the values so that
        # x and y always have the same length
        x = (np.arange(len(history[metric])) + 1)[skip_first_n_epochs:]
        y_train = history[metric][skip_first_n_epochs:]

        # make the plots
        plt.plot(x, y_train)
        legend = [metric]

        # title with min/max stats
        title = (
            f'{metric.upper()} - '
            f'min/max [train: {min(y_train):.3f}/{max(y_train):.3f}'
        )

        # validation values only exist when the model was fit with validation data
        if valid_key in history:
            y_valid = history[valid_key][skip_first_n_epochs:]
            plt.plot(x, y_valid)
            legend.append(valid_key)
            title += f', valid: {min(y_valid):.3f}/{max(y_valid):.3f}'

        plt.legend(legend, fontsize='large')
        plt.title(title + ']')

        # label and show
        plt.xlabel('epoch')
        plt.show()

In [None]:
# one row per word index (plus row 0, which no word maps to and stays zero),
# one column per dimension of the pretrained vectors
n_embedding_rows = VOCABULARY_SIZE + 1
embedding_dim = nlp.vocab.vectors_length
embedding_weights = np.zeros(shape=(n_embedding_rows, embedding_dim))

# copy the pretrained vector of every vocabulary word into its row
for word, index in word2index.items():
    embedding_weights[index] = nlp.vocab[word].vector

In [None]:
### review branch: embedding -> noise -> conv stack -> small dense layer ###
input_reviews = layers.Input(shape=(sequence_cutoff_legth,), dtype='int32')

# embedding initialised with the pretrained vectors, fine-tuned during training
n_embeddings, n_embedding_dims = embedding_weights.shape
branch_b = layers.Embedding(
    n_embeddings,
    n_embedding_dims,
    weights=[embedding_weights],
    input_length=sequence_cutoff_legth,
    trainable=True,
)(input_reviews)
branch_b = layers.GaussianNoise(0.05)(branch_b)

# three conv stages with a growing number of filters,
# each followed by max-pooling with pool size 2
for n_filters in (16, 32, 64):
    branch_b = layers.Conv1D(n_filters, 3, padding='same', activation='relu')(branch_b)
    branch_b = layers.BatchNormalization()(branch_b)
    branch_b = layers.MaxPool1D(2)(branch_b)

# last conv stage is reduced with a global max-pool instead
branch_b = layers.Conv1D(128, 3, padding='same', activation='relu')(branch_b)
branch_b = layers.BatchNormalization()(branch_b)
branch_b = layers.GlobalMaxPool1D()(branch_b)

branch_b = layers.Dense(10, activation='relu')(branch_b)
branch_b = layers.BatchNormalization()(branch_b)
branch_b = layers.Dropout(0.1)(branch_b)

### second input: the scaled useful count, one value per review ###
input_useful_count = layers.Input(shape=(1,))

### top: merge both inputs and squash the output into (0, 1) ###
model_top = layers.concatenate([branch_b, input_useful_count])
model_top = layers.Dense(10, activation='relu')(model_top)
model_top = layers.BatchNormalization()(model_top)
model_output = layers.Dense(1, activation='sigmoid')(model_top)

model = keras.models.Model(
    inputs=[input_reviews, input_useful_count],
    outputs=model_output,
)
model.summary()

In [None]:
# fixed seed so the train/validation partition is identical on every run
SPLIT_SEED = 42

# train_test_split returns a (train, valid) pair per input array, in input order
(
    y_train, y_valid,
    x1_train, x1_valid,
    x2_train, x2_valid,
) = train_test_split(
    reviews_df.rating.values,       # y
    reviews_seq,                    # x1: padded index sequences
    reviews_df.usefulCount.values,  # x2
    # options
    test_size=VALIDATION_SPLIT,
    # keep the rating distribution the same in both parts
    stratify=reviews_df.rating.values,
    random_state=SPLIT_SEED,
)
# same order as the model inputs: [reviews, useful count]
x_train = [x1_train, x2_train]
x_valid = [x1_valid, x2_valid]

In [None]:
# mean absolute error as the loss, optimised with Adam
model.compile(loss='mae', optimizer='adam')

In [None]:
# store the whole model every time the validation loss improves
checkpoint = keras.callbacks.ModelCheckpoint(
    'model-epoch_{epoch:02d}-val_loss_{val_loss:.2f}.hdf5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=0,
)

history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=120,
    batch_size=256,
    callbacks=[checkpoint],
    verbose=1,
)

# learning curves of the run
plot_history(history.history)

In [None]:
# stack the train and validation parts back together, input by input
x = [np.concatenate(z) for z in zip(x_train, x_valid)]
y = np.concatenate([y_train, y_valid])

# one colour per sample: train vs. validation (alternative palette: A276DD, 7347AD)
color = ['#FFC07DA0']*len(y_train) + ['#988F32A0']*len(y_valid)

# order everything by the true rating; the stable sort keeps samples with the
# same rating in their original order (train before validation)
order = np.argsort(y, kind='stable')
y = y[order]
x = [part[order] for part in x]
color = [color[i] for i in order]

yh = model.predict(x).ravel()

# predictions (coloured by split) against the sorted true ratings (green)
plt.scatter(range(len(y)), yh, c=color)
plt.scatter(range(len(y)), y, c='#73DA4D')