In [None]:
# download dataset from the UCI website
!curl -o uci-labelled-sentences.zip https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip

# unzip dataset in Colab
!unzip uci-labelled-sentences.zip

import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.callbacks import EarlyStopping

def _load_labelled(path, source):
    """Read one tab-separated ``sentence<TAB>label`` file and tag its rows.

    Args:
        path: Location of the labelled text file.
        source: Value stored in the added ``source`` column.

    Returns:
        A DataFrame with ``sentence``, ``label`` and ``source`` columns.
    """
    # NOTE(review): pandas applies CSV quote handling by default; if a sentence
    # contains an unbalanced double quote, several lines may be merged into one
    # row. Compare the row counts with the files' line counts before relying on
    # them (quoting=csv.QUOTE_NONE would disable quote handling) - TODO confirm.
    frame = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
    frame['source'] = source
    return frame


# The per-site frames stay available under their own names.
df_yelp = _load_labelled('sentiment labelled sentences/yelp_labelled.txt', 'yelp')
df_amazon = _load_labelled('sentiment labelled sentences/amazon_cells_labelled.txt', 'amazon')
df_imdb = _load_labelled('sentiment labelled sentences/imdb_labelled.txt', 'imdb')

df_list = [df_yelp, df_amazon, df_imdb]

# Each file is read with its own 0-based index; ignore_index=True renumbers
# the combined rows so index labels are unique.
df = pd.concat(df_list, ignore_index=True)

df.head()

# Vocabulary cap handed to the tokenizer (and reused as the embedding's
# input dimension when the model is built).
max_features = 2000

sentences = df['sentence'].values

# Fit the word index on every sentence, then turn each sentence into a
# sequence of integer word ids.
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(sentences)

# With no maxlen given, pad_sequences pads every row to the longest sequence,
# so X.shape[1] is the sequence length the model will be built for.
X = pad_sequences(tokenizer.texts_to_sequences(sentences))
y = df['label'].values

# Hold out 12% of the rows. A fixed random_state keeps the split - and hence
# the reported validation accuracy - reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.12, random_state=42
)

def create_model():
    """Build and compile the sentiment classifier.

    The network is an embedding layer (``max_features`` input ids, 64 output
    dimensions, input length taken from ``X.shape[1]``), an LSTM with 16 units
    and a single sigmoid output unit. It is compiled with binary cross-entropy
    loss, the Adam optimizer and accuracy as the reported metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Embedding(max_features, 64, input_length=X.shape[1]),
        LSTM(16),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

model = create_model()

# Watch validation accuracy and stop when it has not improved by at least
# 0.001 for two epochs in a row.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.001,
    patience=2,
    verbose=1,
)

# The held-out split doubles as the validation set during training.
model.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

# Persist the trained network and the fitted tokenizer so that inference can
# run without retraining.
model.save("uci_sentimentanalysis.h5")

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.DEFAULT_PROTOCOL)

import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import sequence

# Restore the fitted tokenizer and the trained model from disk.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

model = load_model('uci_sentimentanalysis.h5')

# Sample inputs for a quick smoke test of the classifier.
positive_test_input = """
I love this!
"""

negative_test_input = """
this is bad junk.
"""

def sentiment_analysis(input, maxlen=None):
    """Return the model's score that *input* is a positive sentence.

    Args:
        input: The text to classify.
        maxlen: Length to pad/truncate the token sequence to. When omitted,
            the sequence length of the loaded model's input is used, falling
            back to 1225 (the previously hard-coded value) if the model does
            not report a fixed length.

    Returns:
        The single sigmoid output of the model for this text.
    """
    if maxlen is None:
        # Derive the padded length from the model instead of hard-coding it,
        # so inference stays in sync with whatever length was used in training.
        input_shape = getattr(model, 'input_shape', None)
        if input_shape is not None and len(input_shape) > 1 and input_shape[1]:
            maxlen = input_shape[1]
        else:
            maxlen = 1225

    # Encode with the tokenizer restored from the pickle file.
    user_sequences = tokenizer.texts_to_sequences([input])
    user_sequences_matrix = sequence.pad_sequences(user_sequences, maxlen=maxlen)

    # predict returns one row per input text; take the only output of the only row.
    prediction = model.predict(user_sequences_matrix)
    return prediction[0][0]

# Score both sample inputs, negative example first.
for sample_text in (negative_test_input, positive_test_input):
    print("Probability that `{}` is positive: {}".format(sample_text, sentiment_analysis(sample_text)))