In [None]:
import tensorflow as tf
# Record the TensorFlow release in the notebook output for reproducibility.
print(tf.version.VERSION)

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D, LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Precision
from tensorflow.keras.optimizers import RMSprop


In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once validation accuracy and validation precision hit their targets.

  Training is stopped at the end of the first epoch where
  ``val_accuracy >= 0.91`` and ``val_precision >= 0.80``.
  """

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's validation metrics and request a stop when both targets are met.

    Args:
      epoch: Index of the epoch that just finished (unused).
      logs: Metric dict for the epoch; may be None or lack the validation keys.
    """
    # Avoid a shared mutable default and tolerate a missing logs dict.
    logs = logs or {}
    val_accuracy = logs.get('val_accuracy')
    val_precision = logs.get('val_precision')

    # If either metric is absent (no validation data, or the metric was registered
    # under another name) comparing None with a float would raise TypeError.
    if val_accuracy is None or val_precision is None:
      return

    if val_accuracy >= 0.9100 and val_precision >= 0.8000:
      print("\nReached 91% accuracy, 80% precision,so cancelling training!")
      self.model.stop_training = True

In [None]:
# Early-stopping callback instance handed to model.fit.
mycallback = myCallback()

# Pin the metric name explicitly: without it Keras auto-numbers repeated instances
# (e.g. 'precision_1' after re-running this cell), and the callback looks up the
# literal 'val_precision' key in the epoch logs.
precision = Precision(name='precision')

In [None]:
def pre_processing(df):
    """Reduce a reviews DataFrame to the text column and a binary target.

    Args:
        df: DataFrame containing at least the 'description' and 'points' columns.
            It is not modified.

    Returns:
        A new DataFrame with two columns, in this order:
        'description' (cast to str) and 'y', the bin index of 'points'
        (0 for (79, 91.25], 1 for (91.25, 101]). Points outside (79, 101]
        produce NaN in 'y'.
    """
    # Work on an explicit copy of the two needed columns so the assignments below
    # never write into a slice of the caller's frame.
    reviews = df.loc[:, ['description', 'points']].copy()

    # Dichotomous target: labels=False makes pd.cut return the integer bin index.
    reviews['y'] = pd.cut(reviews['points'], bins=[79, 91.25, 101], labels=False)

    # The raw score is no longer needed once it has been binned.
    reviews = reviews.drop(columns=['points'])

    # Force the review text to str so non-string entries can be tokenized.
    reviews['description'] = reviews['description'].astype('str')

    return reviews
    

In [None]:
# Read the raw wine-review CSV from the mounted Drive, then reduce it to text + target.
reviews_csv = '/content/drive/MyDrive/Colab Notebooks/NLPSandBox/winereviews.csv'
raw_reviews = pd.read_csv(reviews_csv, encoding='ISO-8859-1')
df = pre_processing(raw_reviews)

In [None]:
    # Shuffle all rows. sample(frac=1) permutes by position, so it does not depend
    # on the frame having a default 0..n-1 index, and the fixed seed makes the
    # row order repeatable across runs. Original index labels are kept.
    df = df.sample(frac=1, random_state=42)

In [None]:
# Model input is the review text; the target is the binned score.
X, y = df['description'], df['y']

In [None]:
# Hold out 33% of the reviews for evaluation. The fixed random_state keeps the
# partition identical across runs for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Upper bound on the number of distinct words the tokenizer will encode.
MAX_VOCAB_SIZE = 60000

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

# Encode both splits as lists of integer word indices (train first, then test).
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# word -> integer index mapping learned by the tokenizer.
word2idx = tokenizer.word_index

# Number of distinct tokens seen in the training texts.
V = len(word2idx)
print('Found %s unique tokens.' % (V,))

In [None]:
# Pad the training sequences into one rectangular 2-D array.
data_train = pad_sequences(sequences_train)

# Sequence length T is the padded width (second axis) of the training array.
T = data_train.shape[1]

print('Shape of data train tensor:', data_train.shape)

In [None]:
# Pad the test sequences to the training width T so both arrays share one shape.
data_test = pad_sequences(
    sequences_test,
    maxlen=T,
)
print('Shape of data test tensor:', data_test.shape)

In [None]:

# Embedding dimensionality is a free hyperparameter.
D = 20

# Tokenizer(num_words=MAX_VOCAB_SIZE) only emits indices below MAX_VOCAB_SIZE, so
# the embedding never needs more rows than that. When the vocabulary is smaller
# than the cap, V + 1 rows cover indices 0..V (0 is the padding value).
embedding_rows = min(V + 1, MAX_VOCAB_SIZE)

# Two stacked bidirectional LSTMs followed by a small dense head; dropout after
# each stage. The sigmoid unit outputs the probability of the positive class.
i = Input(shape=(T,))
x = Embedding(embedding_rows, D)(i)
x = Bidirectional(LSTM(64, return_sequences=True))(x)  # keep per-step outputs for the next LSTM
x = Dropout(0.40)(x)
x = Bidirectional(LSTM(128))(x)
x = Dropout(0.40)(x)
x = Dense(64, activation='swish')(x)
x = Dropout(0.40)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)


In [None]:
# Configure the binary-classification objective and the metrics the callback reads.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy', precision],
)

# Train for at most 5 epochs; the held-out split is used as validation data and
# the callback may stop training early.
print('Training model...')
r = model.fit(
  x=data_train,
  y=y_train,
  validation_data=(data_test, y_test),
  epochs=5,
  callbacks=[mycallback],
)